Dupe

Copy
package com.owl.common.options;

/**
 * Options for Dupe Activity
 */
public class DupeOpt {

    /**
     * Whether to run Dupe Activity
     */
    public Boolean on = false;          // --dupe

    /**
     * @deprecated Unused for Activity2
     */
    public Boolean only = false;        // --dupeonly

    /**
     * Column names to include in Dupe Activity
     */
    public String[] include;            // -dupeinc

    /**
     * Column names to exclude from Dupe Activity
     */
    public String[] exclude;            // dupeexc

    /**
     * Indicator for complexity. See Activity2.Dupe.Scala.execute()
     * depth == 0 : exact match (sets props.dupeExactMatch = TRUE downstream)
     */
    public Integer depth = 2;           // -depth

    /**
     * The minimum dupe score between two duplicates (currently calculated as "edit distance", out of upperBound).
     * Two values with a dupe score less than this lowerBound are not duplicates (i.e. "truly" different values)
     */
    public Integer lowerBound = 80;     // -dupelb, -dupecutoff

    /**
     * The maximum possible dupe score for duplicate records (for a given dupe detection method).
     * Currently assumed to be 100.
     */
    public Integer upperBound = 100;    // -dupeub, -dupepermatchupperlimit

    /**
     * Approximate dupe score used to create block index (when DF is large)
     */
    public Integer approximate = 1;     // -dupeapprox

    /**
     * Number of observations per unique duplicate
     */
    public Integer limitPerDupe = 15;

    /**
     * Whether to process column headers when data load uses manual column names (LoadOpts.fileHeader)
     * TODO this belongs in LoadOpts, not DupeOpts
     */
    public Boolean checkHeader = true;

    /**
     * TODO remove
     *
     * @deprecated not used
     */
    public String filter;

    /**
     * If true, dupe activity is case-insensitive. If false, dupe activity is case-sensitive.
     * Convenience feature for when the upper and lower bounds are set to 100
     */
    public Boolean ignoreCase = false;      //-dupenocase

    /**
     * Number of points each duplicate contributes to the total schema score (on the Findings page)
     */
    public Double score = 1.0;               //-dupescore  points per duplicate found default 1

    /**
     * Number of unique duplicates to compute during dupe activity
     */
    public Integer limit = 300;               //-dupelimit  default 300