Load
package com.owl.common.options;
import org.apache.commons.lang3.StringUtils;
import java.util.Properties;
/**
* Collibra DQ Options related to data loading
*/
public class LoadOpt {
// Options order: "unsorted",
// "dataset scope columns", "dataset scope rows", "look back",
// "common options for both data sources", "file as data source", "db as data source"
public static final String SINGLE_QUOTE = "'";
public static final String DOUBLE_QUOTE = "\"";
public static final String BACK_TICK = "`";
/**
* If true, don't save any metadata
* TODO confirm if this is correct
*/
public Boolean readonly = false;
/**
* The Password manager.
*/
public String passwordManager = null;
/**
* Catalog alias (Catalog name)
*/
public String alias = StringUtils.EMPTY;
// --- Dataset Scope Column specifications ------- //
// Properties that select columns for Dataset activities or modify them (data type or new columns)
// prior to and/or during loading into the Spark DF
/**
* Dataset scope query. (IMPORTANT)
* The query should contain all the columns necessary to run the activities.
* TODO: figure out if this gets used when using files
*/
public String query = StringUtils.EMPTY;
/**
* Concatenated column names (sep = ",") for columns that are keys
* TODO: confirm
*/
public String key = StringUtils.EMPTY;
/**
* SELECT expression to transform columns, with assignment by "=" and delimited by "|".
* e.g. colname=cast(colname as string)|colname2=cast(colname2 as date)
*/
public String expression = StringUtils.EMPTY;
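// A minimal sketch of the expression syntax above, assuming the public fields
// are assigned directly (column names are hypothetical):
//
//   LoadOpt opts = new LoadOpt();
//   // cast "price" to string and "order_date" to date in one pass
//   opts.expression = "price=cast(price as string)|order_date=cast(order_date as date)";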
/**
* Add "OWL_RUN_ID" UNIX timestamp (s) column to Spark DF usng the OwlOptions.runId.
* Does not obey timeStampDivisor (timestamp in seconds because Spark)
*/
public Boolean addDateColumn = false;
/**
* Fill null values in Spark DF with 0 (numeric columns only)
*/
public Boolean zeroFillNull = false;
/**
* A string that indicates a null value; any value matching this string will be set to null in the Spark DF
* Default: "" -> NULL
* Example: 'null' -> NULL
* --
* Note: to replace nulls in String columns with "" (emptyStringFillNull), use expression
*/
public String replaceNulls = StringUtils.EMPTY;
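// Illustrative sketch combining the null-handling options above (the marker
// string "null" is hypothetical):
//
//   LoadOpt opts = new LoadOpt();
//   opts.replaceNulls = "null"; // values matching "null" become NULL on load
//   opts.zeroFillNull = true;   // then fill NULLs in numeric columns with 0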
/**
* All data types forced to strings for type-safe processing.
* Not implemented in activity (yet)
*/
public Boolean stringMode = false;
// --- Dataset Scope Row specifications ------- //
// Properties that filter rows for Dataset activities
// prior and/or during loading into Spark DF
/**
* Convert row into string and only use rows containing this value.
* Strict matching only.
*/
public String filter = StringUtils.EMPTY;
/**
* Convert row into string and only use rows NOT containing this value.
* Strict matching only.
*/
public String filterNot = StringUtils.EMPTY;
// --- Look back ------- //
// For Look back feature
/**
* Build up history of DQ Checks. Does not include current DQ Check.
* TODO: Document the relationship with unionLookBack
*/
public Integer backRun = null;
/**
* Whether to load data for looking back in history.
* How much historical data to load is based on OutlierOpt.lookback and PatternOpt.lookback.
*/
public Boolean unionLookBack = false;
// --- Shared Data Loading Options ------- //
// Properties that affect data loading & pre-processing for both files and db as source
/**
* Whether to use cached data for activities
*/
public Boolean cache = true;
/**
* The year, month, and day format of date columns in the dataset for loading the data only.
* Default = "yyyy-MM-dd"
*/
public String dateFormat = "yyyy-MM-dd";
/**
* The hour, minute, second, and millisecond format of date columns in the dataset for loading the data only.
* Default = "HH:mm:ss.SSS"
* Not used. It is questionable why a separate timeFormat variable exists when dateFormat can represent hour/minute/second as well.
*/
public String timeFormat = "HH:mm:ss.SSS";
/**
* Whether to convert date columns (specified by activity opts) in dataset
* into timestamp in ms (to make it seconds, set Props.timeStampDivisor = "s")
* TODO: Needs LoadOpt.timeStampDivisor and fix Utils.scala date2Timestamp
*/
public Boolean timestamp = false;
/* TODO add timeStampDivisor here and map between owl props?
public String timeStampDivisor = "ms"
*/
// --- Using file as data source ------- //
// Properties that control where & how static file is read
/**
* Full path to the file.
* If hdfs, then "hdfs://...".
* If s3, then "s3://...", "s3a://...", or "s3n://...".
* If parquet, then "...parquet" or "...PARQUET"
*/
public String filePath = StringUtils.EMPTY;
/**
* SQL query used on file.
* owl_id is added if not included in select clause.
* If empty, then defaults to full file query.
* (Does not update LoadOpts.fullFile to true).
*/
public String fileQuery = StringUtils.EMPTY;
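// Sketch of a file load with a scoped query (path and query are hypothetical;
// per the fileQuery doc above, owl_id is added if the select clause omits it):
//
//   LoadOpt opts = new LoadOpt();
//   opts.filePath = "hdfs://namenode/data/orders.parquet";
//   opts.fileQuery = "select order_id, amount, order_date from dataset";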
/**
* Whether to use full file (i.e. use all columns) on data load
*/
public Boolean fullFile = false;
/**
* File column names, comma separated
*/
public String fileHeader = null;
/* TODO checkHeader needs to be moved here from DupeOpt
public Boolean checkHeader = true;*/
/**
* Whether to have Spark infer the schema of data source
* If props.profile2 == true, this is overwritten to false!
* If xml file, this is ignored and schema is always inferred by Spark on xml data load.
* If avro file, this value is respected (but may get overwritten by props.profile2)
* (see activity2.Load.file)
*/
public Boolean inferSchema = true;
/**
* Sample without replacement from file. Valid value is a fraction [0, 1.0].
* Only applies when the file type is xml or unspecified (and therefore assumed to be a delimited table)
*/
public Double sample = 1.0;
/**
* File type (avro, json, orc, parquet, xml). If null (unspecified), the file is assumed to be a delimited table.
*/
public FileType fileType = null;
/**
* Delimiter for file. If the number of characters after replacing "\" with "" is 2 or more
* (e.g. compound delimiters like \t\t), then the delimiter defaults to "\t" and the file is read as TSV.
* See Activity2.load.file for details
*/
public String delimiter = ",";
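// Example: a pipe-delimited file (a single-character delimiter is used as-is;
// only compound delimiters trigger the "\t" fallback described above):
//
//   LoadOpt opts = new LoadOpt();
//   opts.delimiter = "|";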
/**
* File character encoding
*/
public String fileCharSet = "UTF-8";
/**
* The Avro schema for the relevant avro file. Ignored if empty string.
*/
public String avroSchema = StringUtils.EMPTY;
/**
* The Xml row tag for xml file. Ignored if empty string.
*/
public String xmlRowTag = StringUtils.EMPTY;
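// Sketch of an XML load (row tag value is hypothetical; for xml, schema is
// always inferred by Spark regardless of inferSchema, per the doc above):
//
//   LoadOpt opts = new LoadOpt();
//   opts.fileType = FileType.xml;
//   opts.xmlRowTag = "record";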
/**
* Whether to flatten arrays in nested schema
* TODO explain better. Does this only affect JSON file?
*/
public Boolean flatten = false;
/**
* Whether data contains maps in json that require extra handling.
* TODO explain better. Does this only affect JSON file?
*/
public Boolean handleMaps = false;
/**
* Whether to handle mixed json.
* TODO explain better. Does this only affect JSON file?
*/
public Boolean handleMixedJson = false;
/**
* Spark.read option multiline, for JSON file only
*/
public Boolean multiLine = false;
// --- Using database as data source ------ //
/**
* Path to DB Driver. (e.g. /opt/owl/driver/postgres)
*/
public String lib = StringUtils.EMPTY;
/**
* DB Driver name (Java namespace, e.g. org.postgresql.Driver).
* Leave as null (default) and LoadOpts.connectionURL will resolve the driver name.
*/
public String driverName = null;
/**
* Connection name in metastore DB (public.connections.aliasname).
* Does not refer to the "name" of the database. Refers to "aliasname" that the user set when
* uploading connection config to Collibra DQ.
*/
public String connectionName = StringUtils.EMPTY;
/**
* The Connection url, prefixed by jdbc.
* e.g. "jdbc:postgresql://localhost:5432"
*/
public String connectionUrl = StringUtils.EMPTY;
/**
* DB username
*/
public String userName = StringUtils.EMPTY;
/**
* DB password
*/
public String password = StringUtils.EMPTY;
/**
* JDBC Connection properties (e.g. fetchsize)
*/
public Properties connectionProperties = null;
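// Sketch of passing JDBC properties (the fetchsize value is illustrative):
//
//   LoadOpt opts = new LoadOpt();
//   Properties p = new Properties();
//   p.setProperty("fetchsize", "10000");
//   opts.connectionProperties = p;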
/**
* Whether data source is Hive Native (not using JDBC)
* TODO: Why is the default null as opposed to false?
*/
public Boolean hiveNative = null;
/**
* Whether data source is Hive via the Hive Warehouse Connector (HWC, not using JDBC)
*/
public Boolean hiveNativeHWC = false;
// --- Parallel JDBC ------- //
/**
* When running parallel JDBC, use LoadOpts.query and OwlOptions.dataset as base table
*/
public Boolean useSql = true;
/**
* When running parallel JDBC, specify column name
* Note: Activity2.Load and the web UI have the hard-coded magic string "OWLAUTOJDBC"
*/
public String columnName = null;
/**
* When running parallel JDBC, the lower bound for partition column.
* (e.g. "1000000")
*/
public String lowerBound = null;
/**
* When running parallel JDBC, the upper bound for partition column.
* (e.g. "5000000")
*/
public String upperBound = null;
/**
* When running parallel JDBC, the number of partitions used.
* If 0, then numPartitions used is based on the number of available Spark executors (1/2 ~ 2/3)
* If > 20, then overwritten to 20 (no more than 20 concurrent connections to a database on a single dataset)
*/
public Integer numPartitions = 0;
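// A sketch of a parallel JDBC setup (bounds are hypothetical; "OWLAUTOJDBC"
// is the magic value noted on columnName above):
//
//   LoadOpt opts = new LoadOpt();
//   opts.columnName = "OWLAUTOJDBC";
//   opts.lowerBound = "1000000";
//   opts.upperBound = "5000000";
//   opts.numPartitions = 4; // 0 derives from available Spark executors; capped at 20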
// --- SQL Query properties ---------- //
// TODO: does this affect DB as source or file as source as well?
/**
* Whether the escape character should be a back tick (`).
* Ignored if escapeCharacter is non-empty (if using DQ Check from Options).
* Marked as true if props.escapeCharacter is a back tick
* (to preserve bijection between props and opts, and vice versa).
*/
public Boolean escapeWithBackTick = false;
/**
* Whether the escape character should be a single quote (').
* Ignored if escapeCharacter is non-empty (if using DQ Check from Options).
* Marked as true if props.escapeCharacter is a single quote
* (to preserve bijection between props and opts, and vice versa).
*/
public Boolean escapeWithSingleQuote = false;
/**
* Whether the escape character should be a double quote (").
* Ignored if escapeCharacter is non-empty (if using DQ Check from Options).
* Marked as true if props.escapeCharacter is a double quote
* (to preserve bijection between props and opts, and vice versa).
*/
public Boolean escapeWithDoubleQuote = false;
/**
* Specify custom escape character. This takes precedence over all other escapeWithXYZ options.
* i.e. if non-empty, then other escapeWithXYZ options are ignored.
* If empty (default), no escaping attempt is made (and the SQL query may fail if it contains a reserved word)
*
* @deprecated Access level of this field will be changed to private. Please use {@link #setEscapeCharacter(String)} instead.
*/
@Deprecated
public String escapeCharacter = StringUtils.EMPTY;
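// Per the deprecation note above, prefer the setter over direct field access
// (the back tick here is illustrative; any custom escape character works):
//
//   LoadOpt opts = new LoadOpt();
//   opts.setEscapeCharacter("`"); // takes precedence over all escapeWithXYZ flags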
/**
* The enum File type.
*/
public enum FileType {
/**
* Avro file type.
*/
avro,
/**
* Json file type.
*/
json,
/**
* Orc file type.
*/
orc,
/**
* Parquet file type.
*/
parquet,
/**
* Xml file type.
*/
xml
}
}