Custom

Custom reports can be leveraged by connecting your favorite BI tool on the underlying reporting mart. Below are a few queries that can be used as inspiration for building your own reports. Please refer to the ERD diagram for a larger list of tables.

Long Running Jobs

select dataset,run_id,total_time from dataset_activity where total_time is not null order by total_time desc

Jobs Submitted

select * from owlcheck_q

Jobs by User

select count(*) as owlchecks, username from owlcheck_q where updt_ts < now() group by username order by owlchecks desc

Jobs by User, Dataset

select count(*), user_nm, dataset from dev.public.owl_check_history group by user_nm, dataset order by count desc

Largest by Row Count

select dataset,rc as row_count from dataset_scan order by rc desc

Jobs by Month

with grp as ( select date_trunc('MONTH', run_id) as by_month from dataset_scan where run_id < now() ) select count(*) as owlchecks, by_month from grp group by by_month order by by_month desc

Rules by User

select count(*) as rules, user_nm from owl_rule group by user_nm order by rules desc

By Spark(Cluster) Usage

select * from opt_spark order by num_executors desc

Jobs IDs from Agent

select remote_job_id from agent_q where remote_job_id is not null

Dataset Activity

select dataset,run_id,total_time from dataset_activity where total_time is not null order by total_time desc

Jobs with Enriched Metrics

with activity as ( select dataset,run_id,total_time from dataset_activity where total_time is not null order by total_time desc limit 100), scans as ( select * from dataset_scan where dataset in (select dataset from activity) ), configs as ( select * from opt_spark where dataset in (select dataset from activity)), schema as ( select count(*) as col_cnt, dataset from dataset_schema where dataset in (select dataset from activity) group by dataset ) SELECT A.dataset, A.run_id, C.total_time, A.rc, D.col_cnt, B.driver_memory, B.num_executors,B.executor_cores, B.executor_memory, B.master FROM scans A INNER JOIN configs B ON A.dataset = B.dataset INNER JOIN activity C ON A.dataset = C.dataset and A.run_id = C.run_id INNER JOIN schema D on A.dataset = D.dataset ORDER BY C.total_time desc

Jobs. Load Times and Resources

with activity as ( select dataset,run_id,total_time from public.dataset_activity where total_time is not null order by total_time), scans as ( select * from public.dataset_scan where dataset in (select dataset from activity) ), configs as ( select * from public.opt_spark where dataset in (select dataset from activity)), schema as ( select count(*) as col_cnt, dataset from public.dataset_schema where dataset in (select dataset from activity) group by dataset ) SELECT A.dataset, A.run_id, A.updt_ts, C.total_time, A.rc, D.col_cnt, B.driver_memory, B.num_executors,B.executor_cores, B.executor_memory, B.master FROM scans A INNER JOIN configs B ON A.dataset = B.dataset INNER JOIN activity C ON A.dataset = C.dataset and A.run_id = C.run_id INNER JOIN schema D on A.dataset = D.dataset ORDER BY A.updt_ts desc limit 10

Dataset Scans and Scores By Schema

select * from public.dataset_scan where dataset like 'public.%';

Dataset Scans and Scores By Name

select * from public.dataset_scan where dataset ='public.atm_customer';

Scans By Month By Schema - 'Public'

select dataset, DATE_TRUNC('MONTH', run_id) as run_id, count(*) as Total_Scans from dataset_scan where dataset like 'public%' group by dataset, run_id order by run_id asc

Rule Breaks Past 30 Days

select * from rule_output where run_id < NOW() - INTERVAL '30 DAY';

Scheduled Jobs Queue

select job_id,agent_id,dataset,run_id,status,activity,start_time from public.owlcheck_q;

Column Counts from Dataset Schema

select dataset, count(*) from dataset_schema group by dataset;

Profiling Stats

select dataset, run_id, field_nm, (null_ratio * 100) as null_percent, (empty_ratio * 100) as empty_percent, ROUND( CAST( ( 100 - ((null_ratio * 100) + (empty_ratio * 100)) ) as numeric), 3) as completeness from public.dataset_field where updt_ts > '2020-06-01' and dataset = 'ProcessOrder' and run_id > '2021-03-17 00:00:00+00' order by completeness desc

Metadata / Schema / Datatypes

select * from public.dataset_schema;

Profile Stats

select * from public.dataset_field;

Locate Similar Columns

select distinct dataset, field_nm, max_abs from dataset_field where max_abs = 'Wireless Telecommunications'

Same Column Names

select distinct dataset, field_nm from dataset_field where field_nm = 'authenticated_user'

Similar Column Names

select distinct dataset,field_nm from dataset_field where field_nm like '%id%'

Behavior Findings

select * from behavior where dataset='esg_data'

All Columns for Schema from Postgres Stats

SELECT table_name FROM information_schema.tables WHERE table_schema = 'public' ORDER BY table_name