percentile in range
percentile in range checks
Description
Column-level check that ensures that the percentile of values in a monitored column is within a set range.
profile percentile in range
Check description
Verifies that the percentile of all values in a column is not outside the set range.
Check name | Check type | Time scale | Sensor definition | Quality rule |
---|---|---|---|---|
profile_percentile_in_range | profiling | | percentile | between_floats |
Enable check (Shell)
To enable this check, provide the connection name and check name in the check enable command. To run it, provide the check name in the check run command; both are sketched below.
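A minimal sketch of those commands in the dqo shell, assuming the -c (connection), -t (table), -col (column) and -ch (check) options of the check enable and check run commands; connection_name, table_name and column_name are placeholders to replace with your own:

```
dqo> check enable -c=connection_name -ch=profile_percentile_in_range
dqo> check run -ch=profile_percentile_in_range
dqo> check run -c=connection_name -ch=profile_percentile_in_range
dqo> check run -c=connection_name -t=table_name -ch=profile_percentile_in_range
dqo> check run -c=connection_name -t=table_name -col=column_name -ch=profile_percentile_in_range
```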
The check can also be run for a specific connection, table, or column by adding the connection name, table name, or column name to the check run command, as the scoped variants above show.
Check structure (Yaml)
profiling_checks:
numeric:
profile_percentile_in_range:
warning:
from: 10.0
to: 20.5
error:
from: 10.0
to: 20.5
fatal:
from: 10.0
to: 20.5
# yaml-language-server: $schema=https://cloud.dqo.ai/dqo-yaml-schema/TableYaml-schema.json
apiVersion: dqo/v1
kind: table
spec:
timestamp_columns:
event_timestamp_column: col_event_timestamp
ingestion_timestamp_column: col_inserted_at
incremental_time_window:
daily_partitioning_recent_days: 7
monthly_partitioning_recent_months: 1
columns:
target_column:
profiling_checks:
numeric:
profile_percentile_in_range:
warning:
from: 10.0
to: 20.5
error:
from: 10.0
to: 20.5
fatal:
from: 10.0
to: 20.5
labels:
- This is the column that is analyzed for data quality issues
col_event_timestamp:
labels:
- optional column that stores the timestamp when the event/transaction happened
col_inserted_at:
labels:
- optional column that stores the timestamp when row was ingested
BigQuery
{% import '/dialects/bigquery.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.time_series is not none -%}
{{- lib.eol() -}}
{{ indentation }}{{ lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{ indentation }}TIMESTAMP({{ lib.render_time_dimension_expression(table_alias_prefix) }})
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.data_groupings is not none and (lib.data_groupings | length()) > 0 -%}
{%- for attribute in lib.data_groupings -%}
{%- with data_grouping_level = lib.data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{ lib.eol() }}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{ lib.eol() }}{{ indentation }}{{ table_alias_prefix }}.{{ lib.quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT(
({{ lib.render_target_column('analyzed_table')}}),
{{ parameters.percentile_value }})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc
FROM(
SELECT
PERCENTILE_CONT(
(analyzed_table.`target_column`),
)
OVER (PARTITION BY
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH),
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH))
) AS actual_value,
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH) AS time_period,
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH)) AS time_period_utc
FROM `your-google-project-id`.`<target_schema>`.`<target_table>` AS analyzed_table) AS nested_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
Oracle
{% import '/dialects/oracle.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections_reference('analyzed_table') }}
{{- lib.render_time_dimension_projection_reference('analyzed_table') }}
FROM(
SELECT
original_table.*
{{- lib.render_data_grouping_projections('original_table') }}
{{- lib.render_time_dimension_projection('original_table') }}
FROM {{ lib.render_target_table() }} original_table
{{- lib.render_where_clause(table_alias_prefix='original_table') }}
) analyzed_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT()
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
time_period,
time_period_utc
FROM(
SELECT
original_table.*,
TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS time_period,
CAST(TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "<target_schema>"."<target_table>" original_table
) analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
PostgreSQL
{% import '/dialects/postgresql.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT()
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_postgresql_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
Redshift
{% import '/dialects/redshift.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT()
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_redshift_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
Snowflake
{% import '/dialects/snowflake.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT()
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date)) AS time_period,
TO_TIMESTAMP(DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date))) AS time_period_utc
FROM "your_snowflake_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
SQL Server
{% import '/dialects/sqlserver.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if time_series is not none -%}
{{- lib.eol() -}}
{{- indentation -}}{{- lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{- indentation -}}CAST(({{- lib.render_time_dimension_expression(table_alias_prefix) }}) AS DATETIME)
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if data_groupings is not none and (data_groupings | length()) > 0 -%}
{%- for attribute in data_groupings -%}
{%- with data_grouping_level = data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{- lib.eol() -}}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{- lib.eol() -}}{{ indentation }}{{ table_alias_prefix }}.{{ quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
ORDER BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc
FROM(
SELECT
PERCENTILE_CONT()
WITHIN GROUP (ORDER BY analyzed_table.[target_column])
OVER (PARTITION BY
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0),
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME)
) AS actual_value,
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0) AS time_period,
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME) AS time_period_utc
FROM [your_sql_server_database].[<target_schema>].[<target_table>] AS analyzed_table) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc]
ORDER BY nested_table.[time_period], nested_table.[time_period_utc]
Configuration with data grouping
Sample configuration (Yaml)
# yaml-language-server: $schema=https://cloud.dqo.ai/dqo-yaml-schema/TableYaml-schema.json
apiVersion: dqo/v1
kind: table
spec:
timestamp_columns:
event_timestamp_column: col_event_timestamp
ingestion_timestamp_column: col_inserted_at
incremental_time_window:
daily_partitioning_recent_days: 7
monthly_partitioning_recent_months: 1
default_grouping_name: group_by_country_and_state
groupings:
group_by_country_and_state:
level_1:
source: column_value
column: country
level_2:
source: column_value
column: state
columns:
target_column:
profiling_checks:
numeric:
profile_percentile_in_range:
warning:
from: 10.0
to: 20.5
error:
from: 10.0
to: 20.5
fatal:
from: 10.0
to: 20.5
labels:
- This is the column that is analyzed for data quality issues
col_event_timestamp:
labels:
- optional column that stores the timestamp when the event/transaction happened
col_inserted_at:
labels:
- optional column that stores the timestamp when row was ingested
country:
labels:
- column used as the first grouping key
state:
labels:
- column used as the second grouping key
{% import '/dialects/bigquery.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.time_series is not none -%}
{{- lib.eol() -}}
{{ indentation }}{{ lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{ indentation }}TIMESTAMP({{ lib.render_time_dimension_expression(table_alias_prefix) }})
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.data_groupings is not none and (lib.data_groupings | length()) > 0 -%}
{%- for attribute in lib.data_groupings -%}
{%- with data_grouping_level = lib.data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{ lib.eol() }}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{ lib.eol() }}{{ indentation }}{{ table_alias_prefix }}.{{ lib.quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT(
({{ lib.render_target_column('analyzed_table')}}),
{{ parameters.percentile_value }})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc,
analyzed_table.`country` AS grouping_level_1,
analyzed_table.`state` AS grouping_level_2
FROM(
SELECT
PERCENTILE_CONT(
(analyzed_table.`target_column`),
)
OVER (PARTITION BY
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH),
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH))
analyzed_table.`country` AS grouping_level_1
analyzed_table.`state` AS grouping_level_2
) AS actual_value,
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH) AS time_period,
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH)) AS time_period_utc
FROM `your-google-project-id`.`<target_schema>`.`<target_table>` AS analyzed_table) AS nested_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
Oracle
{% import '/dialects/oracle.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections_reference('analyzed_table') }}
{{- lib.render_time_dimension_projection_reference('analyzed_table') }}
FROM(
SELECT
original_table.*
{{- lib.render_data_grouping_projections('original_table') }}
{{- lib.render_time_dimension_projection('original_table') }}
FROM {{ lib.render_target_table() }} original_table
{{- lib.render_where_clause(table_alias_prefix='original_table') }}
) analyzed_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT()
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table.grouping_level_1,
analyzed_table.grouping_level_2
,
time_period,
time_period_utc
FROM(
SELECT
original_table.*,
original_table."country" AS grouping_level_1,
original_table."state" AS grouping_level_2,
TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS time_period,
CAST(TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "<target_schema>"."<target_table>" original_table
) analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
PostgreSQL
{% import '/dialects/postgresql.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT()
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table."country" AS grouping_level_1,
analyzed_table."state" AS grouping_level_2,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_postgresql_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
Redshift
{% import '/dialects/redshift.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT()
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table."country" AS grouping_level_1,
analyzed_table."state" AS grouping_level_2,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_redshift_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
Snowflake
{% import '/dialects/snowflake.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT()
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table."country" AS grouping_level_1,
analyzed_table."state" AS grouping_level_2,
DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date)) AS time_period,
TO_TIMESTAMP(DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date))) AS time_period_utc
FROM "your_snowflake_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
SQL Server
{% import '/dialects/sqlserver.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if time_series is not none -%}
{{- lib.eol() -}}
{{- indentation -}}{{- lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{- indentation -}}CAST(({{- lib.render_time_dimension_expression(table_alias_prefix) }}) AS DATETIME)
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if data_groupings is not none and (data_groupings | length()) > 0 -%}
{%- for attribute in data_groupings -%}
{%- with data_grouping_level = data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{- lib.eol() -}}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{- lib.eol() -}}{{ indentation }}{{ table_alias_prefix }}.{{ quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
ORDER BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc,
analyzed_table.[country] AS grouping_level_1,
analyzed_table.[state] AS grouping_level_2
FROM(
SELECT
PERCENTILE_CONT()
WITHIN GROUP (ORDER BY analyzed_table.[target_column])
OVER (PARTITION BY
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0),
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME)
) AS actual_value,
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0) AS time_period,
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME) AS time_period_utc
FROM [your_sql_server_database].[<target_schema>].[<target_table>] AS analyzed_table) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc],
analyzed_table.[country] AS grouping_level_1,
analyzed_table.[state] AS grouping_level_2
ORDER BY nested_table.[time_period], nested_table.[time_period_utc],
analyzed_table.[country] AS grouping_level_1,
analyzed_table.[state] AS grouping_level_2
profile median in range
Check description
Verifies that the median of all values in a column is not outside the set range.
Check name | Check type | Time scale | Sensor definition | Quality rule |
---|---|---|---|---|
profile_median_in_range | profiling | | percentile | between_floats |
Enable check (Shell)
To enable this check, provide the connection name and check name in the check enable command. To run it, provide the check name in the check run command; both are sketched below.
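A minimal sketch of both commands in the dqo shell, assuming the -c (connection), -t (table), -col (column) and -ch (check) options; connection_name, table_name and column_name are placeholders to replace with your own:

```
dqo> check enable -c=connection_name -ch=profile_median_in_range
dqo> check run -ch=profile_median_in_range
dqo> check run -c=connection_name -ch=profile_median_in_range
dqo> check run -c=connection_name -t=table_name -ch=profile_median_in_range
dqo> check run -c=connection_name -t=table_name -col=column_name -ch=profile_median_in_range
```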
The check can also be run for a specific connection, table, or column by adding the connection name, table name, or column name to the check run command, as the scoped variants above show.
Check structure (Yaml)
profiling_checks:
numeric:
profile_median_in_range:
parameters:
percentile_value: 0.5
warning:
from: 10.0
to: 20.5
error:
from: 10.0
to: 20.5
fatal:
from: 10.0
to: 20.5
# yaml-language-server: $schema=https://cloud.dqo.ai/dqo-yaml-schema/TableYaml-schema.json
apiVersion: dqo/v1
kind: table
spec:
timestamp_columns:
event_timestamp_column: col_event_timestamp
ingestion_timestamp_column: col_inserted_at
incremental_time_window:
daily_partitioning_recent_days: 7
monthly_partitioning_recent_months: 1
columns:
target_column:
profiling_checks:
numeric:
profile_median_in_range:
parameters:
percentile_value: 0.5
warning:
from: 10.0
to: 20.5
error:
from: 10.0
to: 20.5
fatal:
from: 10.0
to: 20.5
labels:
- This is the column that is analyzed for data quality issues
col_event_timestamp:
labels:
- optional column that stores the timestamp when the event/transaction happened
col_inserted_at:
labels:
- optional column that stores the timestamp when row was ingested
BigQuery
{% import '/dialects/bigquery.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.time_series is not none -%}
{{- lib.eol() -}}
{{ indentation }}{{ lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{ indentation }}TIMESTAMP({{ lib.render_time_dimension_expression(table_alias_prefix) }})
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.data_groupings is not none and (lib.data_groupings | length()) > 0 -%}
{%- for attribute in lib.data_groupings -%}
{%- with data_grouping_level = lib.data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{ lib.eol() }}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{ lib.eol() }}{{ indentation }}{{ table_alias_prefix }}.{{ lib.quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT(
({{ lib.render_target_column('analyzed_table')}}),
{{ parameters.percentile_value }})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc
FROM(
SELECT
PERCENTILE_CONT(
(analyzed_table.`target_column`),
0.5)
OVER (PARTITION BY
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH),
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH))
) AS actual_value,
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH) AS time_period,
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH)) AS time_period_utc
FROM `your-google-project-id`.`<target_schema>`.`<target_table>` AS analyzed_table) AS nested_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
Oracle
{% import '/dialects/oracle.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections_reference('analyzed_table') }}
{{- lib.render_time_dimension_projection_reference('analyzed_table') }}
FROM(
SELECT
original_table.*
{{- lib.render_data_grouping_projections('original_table') }}
{{- lib.render_time_dimension_projection('original_table') }}
FROM {{ lib.render_target_table() }} original_table
{{- lib.render_where_clause(table_alias_prefix='original_table') }}
) analyzed_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.5)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
time_period,
time_period_utc
FROM(
SELECT
original_table.*,
TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS time_period,
CAST(TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "<target_schema>"."<target_table>" original_table
) analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
PostgreSQL
{% import '/dialects/postgresql.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.5)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_postgresql_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
Redshift
{% import '/dialects/redshift.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.5)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_redshift_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
Snowflake
{% import '/dialects/snowflake.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.5)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date)) AS time_period,
TO_TIMESTAMP(DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date))) AS time_period_utc
FROM "your_snowflake_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
SQL Server
{% import '/dialects/sqlserver.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if time_series is not none -%}
{{- lib.eol() -}}
{{- indentation -}}{{- lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{- indentation -}}CAST(({{- lib.render_time_dimension_expression(table_alias_prefix) }}) AS DATETIME)
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if data_groupings is not none and (data_groupings | length()) > 0 -%}
{%- for attribute in data_groupings -%}
{%- with data_grouping_level = data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{- lib.eol() -}}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{- lib.eol() -}}{{ indentation }}{{ table_alias_prefix }}.{{ quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
ORDER BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc
FROM(
SELECT
PERCENTILE_CONT(0.5)
WITHIN GROUP (ORDER BY analyzed_table.[target_column])
OVER (PARTITION BY
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0),
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME)
) AS actual_value,
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0) AS time_period,
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME) AS time_period_utc
FROM [your_sql_server_database].[<target_schema>].[<target_table>] AS analyzed_table) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc]
ORDER BY nested_table.[time_period], nested_table.[time_period_utc]
Configuration with data grouping
Sample configuration (Yaml)
# yaml-language-server: $schema=https://cloud.dqo.ai/dqo-yaml-schema/TableYaml-schema.json
apiVersion: dqo/v1
kind: table
spec:
timestamp_columns:
event_timestamp_column: col_event_timestamp
ingestion_timestamp_column: col_inserted_at
incremental_time_window:
daily_partitioning_recent_days: 7
monthly_partitioning_recent_months: 1
default_grouping_name: group_by_country_and_state
groupings:
group_by_country_and_state:
level_1:
source: column_value
column: country
level_2:
source: column_value
column: state
columns:
target_column:
profiling_checks:
numeric:
profile_median_in_range:
parameters:
percentile_value: 0.5
warning:
from: 10.0
to: 20.5
error:
from: 10.0
to: 20.5
fatal:
from: 10.0
to: 20.5
labels:
- This is the column that is analyzed for data quality issues
col_event_timestamp:
labels:
- optional column that stores the timestamp when the event/transaction happened
col_inserted_at:
labels:
- optional column that stores the timestamp when row was ingested
country:
labels:
- column used as the first grouping key
state:
labels:
- column used as the second grouping key
{% import '/dialects/bigquery.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.time_series is not none -%}
{{- lib.eol() -}}
{{ indentation }}{{ lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{ indentation }}TIMESTAMP({{ lib.render_time_dimension_expression(table_alias_prefix) }})
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.data_groupings is not none and (lib.data_groupings | length()) > 0 -%}
{%- for attribute in lib.data_groupings -%}
{%- with data_grouping_level = lib.data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{ lib.eol() }}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{ lib.eol() }}{{ indentation }}{{ table_alias_prefix }}.{{ lib.quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT(
({{ lib.render_target_column('analyzed_table')}}),
{{ parameters.percentile_value }})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc,
analyzed_table.`country` AS grouping_level_1,
analyzed_table.`state` AS grouping_level_2
FROM(
SELECT
PERCENTILE_CONT(
(analyzed_table.`target_column`),
0.5)
OVER (PARTITION BY
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH),
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH))
analyzed_table.`country` AS grouping_level_1
analyzed_table.`state` AS grouping_level_2
) AS actual_value,
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH) AS time_period,
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH)) AS time_period_utc
FROM `your-google-project-id`.`<target_schema>`.`<target_table>` AS analyzed_table) AS nested_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
Oracle
{% import '/dialects/oracle.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections_reference('analyzed_table') }}
{{- lib.render_time_dimension_projection_reference('analyzed_table') }}
FROM(
SELECT
original_table.*
{{- lib.render_data_grouping_projections('original_table') }}
{{- lib.render_time_dimension_projection('original_table') }}
FROM {{ lib.render_target_table() }} original_table
{{- lib.render_where_clause(table_alias_prefix='original_table') }}
) analyzed_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.5)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table.grouping_level_1,
analyzed_table.grouping_level_2
,
time_period,
time_period_utc
FROM(
SELECT
original_table.*,
original_table."country" AS grouping_level_1,
original_table."state" AS grouping_level_2,
TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS time_period,
CAST(TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "<target_schema>"."<target_table>" original_table
) analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
PostgreSQL
{% import '/dialects/postgresql.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.5)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table."country" AS grouping_level_1,
analyzed_table."state" AS grouping_level_2,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_postgresql_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
Redshift
{% import '/dialects/redshift.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.5)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table."country" AS grouping_level_1,
analyzed_table."state" AS grouping_level_2,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_redshift_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
Snowflake
{% import '/dialects/snowflake.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.5)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table."country" AS grouping_level_1,
analyzed_table."state" AS grouping_level_2,
DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date)) AS time_period,
TO_TIMESTAMP(DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date))) AS time_period_utc
FROM "your_snowflake_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
SQL Server
{% import '/dialects/sqlserver.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if time_series is not none -%}
{{- lib.eol() -}}
{{- indentation -}}{{- lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{- indentation -}}CAST(({{- lib.render_time_dimension_expression(table_alias_prefix) }}) AS DATETIME)
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if data_groupings is not none and (data_groupings | length()) > 0 -%}
{%- for attribute in data_groupings -%}
{%- with data_grouping_level = data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{- lib.eol() -}}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{- lib.eol() -}}{{ indentation }}{{ table_alias_prefix }}.{{ quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
ORDER BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc,
analyzed_table.[country] AS grouping_level_1,
analyzed_table.[state] AS grouping_level_2
FROM(
SELECT
PERCENTILE_CONT(0.5)
WITHIN GROUP (ORDER BY analyzed_table.[target_column])
OVER (PARTITION BY
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0),
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME)
) AS actual_value,
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0) AS time_period,
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME) AS time_period_utc
FROM [your_sql_server_database].[<target_schema>].[<target_table>] AS analyzed_table) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc],
analyzed_table.[country] AS grouping_level_1,
analyzed_table.[state] AS grouping_level_2
ORDER BY nested_table.[time_period], nested_table.[time_period_utc],
analyzed_table.[country] AS grouping_level_1,
analyzed_table.[state] AS grouping_level_2
profile percentile 10 in range
Check description
Verifies that the percentile 10 of all values in a column is not outside the set range.
Check name | Check type | Time scale | Sensor definition | Quality rule |
---|---|---|---|---|
profile_percentile_10_in_range | profiling | | percentile | between_floats |
Enable check (Shell)
To enable this check, provide the connection name and check name in the check enable command. To run it, provide the check name in the check run command; both are sketched below.
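A minimal sketch of both commands in the dqo shell, assuming the -c (connection), -t (table), -col (column) and -ch (check) options; connection_name, table_name and column_name are placeholders to replace with your own:

```
dqo> check enable -c=connection_name -ch=profile_percentile_10_in_range
dqo> check run -ch=profile_percentile_10_in_range
dqo> check run -c=connection_name -ch=profile_percentile_10_in_range
dqo> check run -c=connection_name -t=table_name -ch=profile_percentile_10_in_range
dqo> check run -c=connection_name -t=table_name -col=column_name -ch=profile_percentile_10_in_range
```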
The check can also be run for a specific connection, table, or column by adding the connection name, table name, or column name to the check run command, as the scoped variants above show.
Check structure (Yaml)
profiling_checks:
numeric:
profile_percentile_10_in_range:
parameters:
percentile_value: 0.1
warning:
from: 10.0
to: 20.5
error:
from: 10.0
to: 20.5
fatal:
from: 10.0
to: 20.5
# yaml-language-server: $schema=https://cloud.dqo.ai/dqo-yaml-schema/TableYaml-schema.json
apiVersion: dqo/v1
kind: table
spec:
timestamp_columns:
event_timestamp_column: col_event_timestamp
ingestion_timestamp_column: col_inserted_at
incremental_time_window:
daily_partitioning_recent_days: 7
monthly_partitioning_recent_months: 1
columns:
target_column:
profiling_checks:
numeric:
profile_percentile_10_in_range:
parameters:
percentile_value: 0.1
warning:
from: 10.0
to: 20.5
error:
from: 10.0
to: 20.5
fatal:
from: 10.0
to: 20.5
labels:
- This is the column that is analyzed for data quality issues
col_event_timestamp:
labels:
- optional column that stores the timestamp when the event/transaction happened
col_inserted_at:
labels:
- optional column that stores the timestamp when row was ingested
BigQuery
{% import '/dialects/bigquery.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.time_series is not none -%}
{{- lib.eol() -}}
{{ indentation }}{{ lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{ indentation }}TIMESTAMP({{ lib.render_time_dimension_expression(table_alias_prefix) }})
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.data_groupings is not none and (lib.data_groupings | length()) > 0 -%}
{%- for attribute in lib.data_groupings -%}
{%- with data_grouping_level = lib.data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{ lib.eol() }}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{ lib.eol() }}{{ indentation }}{{ table_alias_prefix }}.{{ lib.quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT(
({{ lib.render_target_column('analyzed_table')}}),
{{ parameters.percentile_value }})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc
FROM(
SELECT
PERCENTILE_CONT(
(analyzed_table.`target_column`),
0.1)
OVER (PARTITION BY
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH),
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH))
) AS actual_value,
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH) AS time_period,
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH)) AS time_period_utc
FROM `your-google-project-id`.`<target_schema>`.`<target_table>` AS analyzed_table) AS nested_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
Oracle
{% import '/dialects/oracle.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections_reference('analyzed_table') }}
{{- lib.render_time_dimension_projection_reference('analyzed_table') }}
FROM(
SELECT
original_table.*
{{- lib.render_data_grouping_projections('original_table') }}
{{- lib.render_time_dimension_projection('original_table') }}
FROM {{ lib.render_target_table() }} original_table
{{- lib.render_where_clause(table_alias_prefix='original_table') }}
) analyzed_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.1)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
time_period,
time_period_utc
FROM(
SELECT
original_table.*,
TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS time_period,
CAST(TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "<target_schema>"."<target_table>" original_table
) analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
PostgreSQL
{% import '/dialects/postgresql.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.1)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_postgresql_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
Redshift
{% import '/dialects/redshift.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.1)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_redshift_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
Snowflake
{% import '/dialects/snowflake.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.1)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date)) AS time_period,
TO_TIMESTAMP(DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date))) AS time_period_utc
FROM "your_snowflake_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
SQL Server
{% import '/dialects/sqlserver.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if time_series is not none -%}
{{- lib.eol() -}}
{{- indentation -}}{{- lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{- indentation -}}CAST(({{- lib.render_time_dimension_expression(table_alias_prefix) }}) AS DATETIME)
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if data_groupings is not none and (data_groupings | length()) > 0 -%}
{%- for attribute in data_groupings -%}
{%- with data_grouping_level = data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{- lib.eol() -}}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{- lib.eol() -}}{{ indentation }}{{ table_alias_prefix }}.{{ quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
ORDER BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc
FROM(
SELECT
PERCENTILE_CONT(0.1)
WITHIN GROUP (ORDER BY analyzed_table.[target_column])
OVER (PARTITION BY
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0),
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME)
) AS actual_value,
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0) AS time_period,
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME) AS time_period_utc
FROM [your_sql_server_database].[<target_schema>].[<target_table>] AS analyzed_table) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc]
ORDER BY nested_table.[time_period], nested_table.[time_period_utc]
Configuration with data grouping
Sample configuration (Yaml)
# yaml-language-server: $schema=https://cloud.dqo.ai/dqo-yaml-schema/TableYaml-schema.json
apiVersion: dqo/v1
kind: table
spec:
timestamp_columns:
event_timestamp_column: col_event_timestamp
ingestion_timestamp_column: col_inserted_at
incremental_time_window:
daily_partitioning_recent_days: 7
monthly_partitioning_recent_months: 1
default_grouping_name: group_by_country_and_state
groupings:
group_by_country_and_state:
level_1:
source: column_value
column: country
level_2:
source: column_value
column: state
columns:
target_column:
profiling_checks:
numeric:
profile_percentile_10_in_range:
parameters:
percentile_value: 0.1
warning:
from: 10.0
to: 20.5
error:
from: 10.0
to: 20.5
fatal:
from: 10.0
to: 20.5
labels:
- This is the column that is analyzed for data quality issues
col_event_timestamp:
labels:
- optional column that stores the timestamp when the event/transaction happened
col_inserted_at:
labels:
- optional column that stores the timestamp when row was ingested
country:
labels:
- column used as the first grouping key
state:
labels:
- column used as the second grouping key
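With data grouping configured, each grouping level from the YAML above is added to the sensor query as an extra projected column and GROUP BY key, so a separate percentile is computed for every country/state combination. The following is a minimal, hand-written sketch of that shape, not output generated by the tool (PostgreSQL-style syntax; the schema and table names are placeholders):
-- Illustrative sketch only: how the two grouping levels surface in the sensor query.
SELECT
    PERCENTILE_CONT(0.1)
        WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
    analyzed_table."country" AS grouping_level_1,
    analyzed_table."state" AS grouping_level_2
FROM "<target_schema>"."<target_table>" AS analyzed_table
GROUP BY grouping_level_1, grouping_level_2
ORDER BY grouping_level_1, grouping_level_2;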
{% import '/dialects/bigquery.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.time_series is not none -%}
{{- lib.eol() -}}
{{ indentation }}{{ lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{ indentation }}TIMESTAMP({{ lib.render_time_dimension_expression(table_alias_prefix) }})
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.data_groupings is not none and (lib.data_groupings | length()) > 0 -%}
{%- for attribute in lib.data_groupings -%}
{%- with data_grouping_level = lib.data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{ lib.eol() }}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{ lib.eol() }}{{ indentation }}{{ table_alias_prefix }}.{{ lib.quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT(
({{ lib.render_target_column('analyzed_table')}}),
{{ parameters.percentile_value }})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc,
analyzed_table.`country` AS grouping_level_1,
analyzed_table.`state` AS grouping_level_2
FROM(
SELECT
PERCENTILE_CONT(
(analyzed_table.`target_column`),
0.1)
OVER (PARTITION BY
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH),
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH))
analyzed_table.`country` AS grouping_level_1
analyzed_table.`state` AS grouping_level_2
) AS actual_value,
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH) AS time_period,
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH)) AS time_period_utc
FROM `your-google-project-id`.`<target_schema>`.`<target_table>` AS analyzed_table) AS nested_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
Oracle
{% import '/dialects/oracle.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections_reference('analyzed_table') }}
{{- lib.render_time_dimension_projection_reference('analyzed_table') }}
FROM(
SELECT
original_table.*
{{- lib.render_data_grouping_projections('original_table') }}
{{- lib.render_time_dimension_projection('original_table') }}
FROM {{ lib.render_target_table() }} original_table
{{- lib.render_where_clause(table_alias_prefix='original_table') }}
) analyzed_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.1)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table.grouping_level_1,
analyzed_table.grouping_level_2,
time_period,
time_period_utc
FROM(
SELECT
original_table.*,
original_table."country" AS grouping_level_1,
original_table."state" AS grouping_level_2,
TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS time_period,
CAST(TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "<target_schema>"."<target_table>" original_table
) analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
PostgreSQL
{% import '/dialects/postgresql.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.1)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table."country" AS grouping_level_1,
analyzed_table."state" AS grouping_level_2,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_postgresql_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
Redshift
{% import '/dialects/redshift.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.1)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table."country" AS grouping_level_1,
analyzed_table."state" AS grouping_level_2,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_redshift_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
Snowflake
{% import '/dialects/snowflake.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.1)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table."country" AS grouping_level_1,
analyzed_table."state" AS grouping_level_2,
DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date)) AS time_period,
TO_TIMESTAMP(DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date))) AS time_period_utc
FROM "your_snowflake_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
SQL Server
{% import '/dialects/sqlserver.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if time_series is not none -%}
{{- lib.eol() -}}
{{- indentation -}}{{- lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{- indentation -}}CAST(({{- lib.render_time_dimension_expression(table_alias_prefix) }}) AS DATETIME)
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if data_groupings is not none and (data_groupings | length()) > 0 -%}
{%- for attribute in data_groupings -%}
{%- with data_grouping_level = data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{- lib.eol() -}}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{- lib.eol() -}}{{ indentation }}{{ table_alias_prefix }}.{{ quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
ORDER BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc,
analyzed_table.[country] AS grouping_level_1,
analyzed_table.[state] AS grouping_level_2
FROM(
SELECT
PERCENTILE_CONT(0.1)
WITHIN GROUP (ORDER BY analyzed_table.[target_column])
OVER (PARTITION BY
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0),
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME)
) AS actual_value,
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0) AS time_period,
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME) AS time_period_utc
FROM [your_sql_server_database].[<target_schema>].[<target_table>] AS analyzed_table) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc],
analyzed_table.[country] AS grouping_level_1,
analyzed_table.[state] AS grouping_level_2
ORDER BY nested_table.[time_period], nested_table.[time_period_utc],
analyzed_table.[country] AS grouping_level_1,
analyzed_table.[state] AS grouping_level_2
profile percentile 25 in range
Check description
Verifies that the 25th percentile of all values in a column is not outside the set range.
Check name | Check type | Time scale | Sensor definition | Quality rule |
---|---|---|---|---|
profile_percentile_25_in_range | profiling |  | percentile | between_floats |
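Conceptually, the percentile sensor returns a single actual_value, here the 25th percentile of the monitored column, and the between_floats rule then verifies that this value lies between the configured from and to thresholds for each severity level. A minimal sketch of that measurement for a manual spot check (PostgreSQL-style syntax; the schema and table names below are placeholders):
-- Illustrative only: compute the 25th percentile of the monitored column
-- so it can be compared by eye against the configured from/to thresholds.
SELECT
    PERCENTILE_CONT(0.25)
        WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value
FROM "<target_schema>"."<target_table>" AS analyzed_table;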
Enable check (Shell)
To enable this check, provide the connection name and the check name in the check enable command.
To run this check, provide the check name in the check run command. The check can also be run on a specific connection, table, or column by adding the connection name, table name, or column name to the command.
Check structure (Yaml)
profiling_checks:
numeric:
profile_percentile_25_in_range:
parameters:
percentile_value: 0.25
warning:
from: 10.0
to: 20.5
error:
from: 10.0
to: 20.5
fatal:
from: 10.0
to: 20.5
# yaml-language-server: $schema=https://cloud.dqo.ai/dqo-yaml-schema/TableYaml-schema.json
apiVersion: dqo/v1
kind: table
spec:
timestamp_columns:
event_timestamp_column: col_event_timestamp
ingestion_timestamp_column: col_inserted_at
incremental_time_window:
daily_partitioning_recent_days: 7
monthly_partitioning_recent_months: 1
columns:
target_column:
profiling_checks:
numeric:
profile_percentile_25_in_range:
parameters:
percentile_value: 0.25
warning:
from: 10.0
to: 20.5
error:
from: 10.0
to: 20.5
fatal:
from: 10.0
to: 20.5
labels:
- This is the column that is analyzed for data quality issues
col_event_timestamp:
labels:
- optional column that stores the timestamp when the event/transaction happened
col_inserted_at:
labels:
- optional column that stores the timestamp when row was ingested
BigQuery
{% import '/dialects/bigquery.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.time_series is not none -%}
{{- lib.eol() -}}
{{ indentation }}{{ lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{ indentation }}TIMESTAMP({{ lib.render_time_dimension_expression(table_alias_prefix) }})
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.data_groupings is not none and (lib.data_groupings | length()) > 0 -%}
{%- for attribute in lib.data_groupings -%}
{%- with data_grouping_level = lib.data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{ lib.eol() }}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{ lib.eol() }}{{ indentation }}{{ table_alias_prefix }}.{{ lib.quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT(
({{ lib.render_target_column('analyzed_table')}}),
{{ parameters.percentile_value }})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc
FROM(
SELECT
PERCENTILE_CONT(
(analyzed_table.`target_column`),
0.25)
OVER (PARTITION BY
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH),
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH))
) AS actual_value,
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH) AS time_period,
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH)) AS time_period_utc
FROM `your-google-project-id`.`<target_schema>`.`<target_table>` AS analyzed_table) AS nested_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
Oracle
{% import '/dialects/oracle.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections_reference('analyzed_table') }}
{{- lib.render_time_dimension_projection_reference('analyzed_table') }}
FROM(
SELECT
original_table.*
{{- lib.render_data_grouping_projections('original_table') }}
{{- lib.render_time_dimension_projection('original_table') }}
FROM {{ lib.render_target_table() }} original_table
{{- lib.render_where_clause(table_alias_prefix='original_table') }}
) analyzed_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.25)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
time_period,
time_period_utc
FROM(
SELECT
original_table.*,
TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS time_period,
CAST(TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "<target_schema>"."<target_table>" original_table
) analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
PostgreSQL
{% import '/dialects/postgresql.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.25)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_postgresql_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
Redshift
{% import '/dialects/redshift.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.25)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_redshift_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
Snowflake
{% import '/dialects/snowflake.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.25)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date)) AS time_period,
TO_TIMESTAMP(DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date))) AS time_period_utc
FROM "your_snowflake_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
SQL Server
{% import '/dialects/sqlserver.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if time_series is not none -%}
{{- lib.eol() -}}
{{- indentation -}}{{- lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{- indentation -}}CAST(({{- lib.render_time_dimension_expression(table_alias_prefix) }}) AS DATETIME)
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if data_groupings is not none and (data_groupings | length()) > 0 -%}
{%- for attribute in data_groupings -%}
{%- with data_grouping_level = data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{- lib.eol() -}}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{- lib.eol() -}}{{ indentation }}{{ table_alias_prefix }}.{{ quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
ORDER BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc
FROM(
SELECT
PERCENTILE_CONT(0.25)
WITHIN GROUP (ORDER BY analyzed_table.[target_column])
OVER (PARTITION BY
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0),
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME)
) AS actual_value,
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0) AS time_period,
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME) AS time_period_utc
FROM [your_sql_server_database].[<target_schema>].[<target_table>] AS analyzed_table) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc]
ORDER BY nested_table.[time_period], nested_table.[time_period_utc]
Configuration with data grouping
Sample configuration (Yaml)
# yaml-language-server: $schema=https://cloud.dqo.ai/dqo-yaml-schema/TableYaml-schema.json
apiVersion: dqo/v1
kind: table
spec:
timestamp_columns:
event_timestamp_column: col_event_timestamp
ingestion_timestamp_column: col_inserted_at
incremental_time_window:
daily_partitioning_recent_days: 7
monthly_partitioning_recent_months: 1
default_grouping_name: group_by_country_and_state
groupings:
group_by_country_and_state:
level_1:
source: column_value
column: country
level_2:
source: column_value
column: state
columns:
target_column:
profiling_checks:
numeric:
profile_percentile_25_in_range:
parameters:
percentile_value: 0.25
warning:
from: 10.0
to: 20.5
error:
from: 10.0
to: 20.5
fatal:
from: 10.0
to: 20.5
labels:
- This is the column that is analyzed for data quality issues
col_event_timestamp:
labels:
- optional column that stores the timestamp when the event/transaction happened
col_inserted_at:
labels:
- optional column that stores the timestamp when row was ingested
country:
labels:
- column used as the first grouping key
state:
labels:
- column used as the second grouping key
{% import '/dialects/bigquery.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.time_series is not none -%}
{{- lib.eol() -}}
{{ indentation }}{{ lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{ indentation }}TIMESTAMP({{ lib.render_time_dimension_expression(table_alias_prefix) }})
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.data_groupings is not none and (lib.data_groupings | length()) > 0 -%}
{%- for attribute in lib.data_groupings -%}
{%- with data_grouping_level = lib.data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{ lib.eol() }}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{ lib.eol() }}{{ indentation }}{{ table_alias_prefix }}.{{ lib.quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT(
({{ lib.render_target_column('analyzed_table')}}),
{{ parameters.percentile_value }})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc,
analyzed_table.`country` AS grouping_level_1,
analyzed_table.`state` AS grouping_level_2
FROM(
SELECT
PERCENTILE_CONT(
(analyzed_table.`target_column`),
0.25)
OVER (PARTITION BY
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH),
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH))
analyzed_table.`country` AS grouping_level_1
analyzed_table.`state` AS grouping_level_2
) AS actual_value,
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH) AS time_period,
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH)) AS time_period_utc
FROM `your-google-project-id`.`<target_schema>`.`<target_table>` AS analyzed_table) AS nested_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
Oracle
{% import '/dialects/oracle.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections_reference('analyzed_table') }}
{{- lib.render_time_dimension_projection_reference('analyzed_table') }}
FROM(
SELECT
original_table.*
{{- lib.render_data_grouping_projections('original_table') }}
{{- lib.render_time_dimension_projection('original_table') }}
FROM {{ lib.render_target_table() }} original_table
{{- lib.render_where_clause(table_alias_prefix='original_table') }}
) analyzed_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.25)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table.grouping_level_1,
analyzed_table.grouping_level_2,
time_period,
time_period_utc
FROM(
SELECT
original_table.*,
original_table."country" AS grouping_level_1,
original_table."state" AS grouping_level_2,
TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS time_period,
CAST(TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "<target_schema>"."<target_table>" original_table
) analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
PostgreSQL
{% import '/dialects/postgresql.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.25)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table."country" AS grouping_level_1,
analyzed_table."state" AS grouping_level_2,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_postgresql_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
Redshift
{% import '/dialects/redshift.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.25)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table."country" AS grouping_level_1,
analyzed_table."state" AS grouping_level_2,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_redshift_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
Snowflake
{% import '/dialects/snowflake.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.25)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
analyzed_table."country" AS grouping_level_1,
analyzed_table."state" AS grouping_level_2,
DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date)) AS time_period,
TO_TIMESTAMP(DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date))) AS time_period_utc
FROM "your_snowflake_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY grouping_level_1, grouping_level_2, time_period, time_period_utc
ORDER BY grouping_level_1, grouping_level_2, time_period, time_period_utc
SQL Server
{% import '/dialects/sqlserver.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if time_series is not none -%}
{{- lib.eol() -}}
{{- indentation -}}{{- lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{- indentation -}}CAST(({{- lib.render_time_dimension_expression(table_alias_prefix) }}) AS DATETIME)
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if data_groupings is not none and (data_groupings | length()) > 0 -%}
{%- for attribute in data_groupings -%}
{%- with data_grouping_level = data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{- lib.eol() -}}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{- lib.eol() -}}{{ indentation }}{{ table_alias_prefix }}.{{ quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
ORDER BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc,
analyzed_table.[country] AS grouping_level_1,
analyzed_table.[state] AS grouping_level_2
FROM(
SELECT
PERCENTILE_CONT(0.25)
WITHIN GROUP (ORDER BY analyzed_table.[target_column])
OVER (PARTITION BY
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0),
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME)
) AS actual_value,
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0) AS time_period,
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME) AS time_period_utc
FROM [your_sql_server_database].[<target_schema>].[<target_table>] AS analyzed_table) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc],
analyzed_table.[country] AS grouping_level_1,
analyzed_table.[state] AS grouping_level_2
ORDER BY nested_table.[time_period], nested_table.[time_period_utc],
analyzed_table.[country] AS grouping_level_1,
analyzed_table.[state] AS grouping_level_2
profile percentile 75 in range
Check description
Verifies that the 75th percentile of all values in a column is not outside the set range.
Check name | Check type | Time scale | Sensor definition | Quality rule |
---|---|---|---|---|
profile_percentile_75_in_range | profiling |  | percentile | between_floats |
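As with the other percentile checks, the sensor produces a single actual_value, here the 75th percentile of the column, which the between_floats rule compares against the configured range. The following sketch also mirrors the rule's comparison by hand (PostgreSQL-style syntax; schema and table names are placeholders, and 10.0/20.5 are the sample thresholds used throughout this page):
-- Illustrative only: 75th percentile plus a manual range check mirroring between_floats.
SELECT
    actual_value,
    actual_value BETWEEN 10.0 AND 20.5 AS within_expected_range
FROM (
    SELECT
        PERCENTILE_CONT(0.75)
            WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value
    FROM "<target_schema>"."<target_table>" AS analyzed_table
) AS percentile_measurement;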
Enable check (Shell)
To enable this check, provide the connection name and the check name in the check enable command.
To run this check, provide the check name in the check run command. The check can also be run on a specific connection, table, or column by adding the connection name, table name, or column name to the command.
Check structure (Yaml)
profiling_checks:
numeric:
profile_percentile_75_in_range:
parameters:
percentile_value: 0.75
warning:
from: 10.0
to: 20.5
error:
from: 10.0
to: 20.5
fatal:
from: 10.0
to: 20.5
# yaml-language-server: $schema=https://cloud.dqo.ai/dqo-yaml-schema/TableYaml-schema.json
apiVersion: dqo/v1
kind: table
spec:
timestamp_columns:
event_timestamp_column: col_event_timestamp
ingestion_timestamp_column: col_inserted_at
incremental_time_window:
daily_partitioning_recent_days: 7
monthly_partitioning_recent_months: 1
columns:
target_column:
profiling_checks:
numeric:
profile_percentile_75_in_range:
parameters:
percentile_value: 0.75
warning:
from: 10.0
to: 20.5
error:
from: 10.0
to: 20.5
fatal:
from: 10.0
to: 20.5
labels:
- This is the column that is analyzed for data quality issues
col_event_timestamp:
labels:
- optional column that stores the timestamp when the event/transaction happened
col_inserted_at:
labels:
- optional column that stores the timestamp when row was ingested
BigQuery
{% import '/dialects/bigquery.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.time_series is not none -%}
{{- lib.eol() -}}
{{ indentation }}{{ lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{ indentation }}TIMESTAMP({{ lib.render_time_dimension_expression(table_alias_prefix) }})
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.data_groupings is not none and (lib.data_groupings | length()) > 0 -%}
{%- for attribute in lib.data_groupings -%}
{%- with data_grouping_level = lib.data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{ lib.eol() }}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{ lib.eol() }}{{ indentation }}{{ table_alias_prefix }}.{{ lib.quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT(
({{ lib.render_target_column('analyzed_table')}}),
{{ parameters.percentile_value }})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc
FROM(
SELECT
PERCENTILE_CONT(
(analyzed_table.`target_column`),
0.75)
OVER (PARTITION BY
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH),
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH))
) AS actual_value,
DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH) AS time_period,
TIMESTAMP(DATE_TRUNC(CAST(CURRENT_TIMESTAMP() AS DATE), MONTH)) AS time_period_utc
FROM `your-google-project-id`.`<target_schema>`.`<target_table>` AS analyzed_table) AS nested_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
Oracle
{% import '/dialects/oracle.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections_reference('analyzed_table') }}
{{- lib.render_time_dimension_projection_reference('analyzed_table') }}
FROM(
SELECT
original_table.*
{{- lib.render_data_grouping_projections('original_table') }}
{{- lib.render_time_dimension_projection('original_table') }}
FROM {{ lib.render_target_table() }} original_table
{{- lib.render_where_clause(table_alias_prefix='original_table') }}
) analyzed_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.75)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
time_period,
time_period_utc
FROM(
SELECT
original_table.*,
TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS time_period,
CAST(TRUNC(CAST(CURRENT_TIMESTAMP AS DATE), 'MONTH') AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "<target_schema>"."<target_table>" original_table
) analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
PostgreSQL
{% import '/dialects/postgresql.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.75)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_postgresql_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
Redshift
{% import '/dialects/redshift.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.75)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date)) AS time_period,
CAST((DATE_TRUNC('MONTH', CAST(LOCALTIMESTAMP AS date))) AS TIMESTAMP WITH TIME ZONE) AS time_period_utc
FROM "your_redshift_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
Snowflake
{% import '/dialects/snowflake.sql.jinja2' as lib with context -%}
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}}) AS actual_value
{{- lib.render_data_grouping_projections('analyzed_table') }}
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause() -}}
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}
SELECT
PERCENTILE_CONT(0.75)
WITHIN GROUP (ORDER BY analyzed_table."target_column") AS actual_value,
DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date)) AS time_period,
TO_TIMESTAMP(DATE_TRUNC('MONTH', CAST(TO_TIMESTAMP_NTZ(LOCALTIMESTAMP()) AS date))) AS time_period_utc
FROM "your_snowflake_database"."<target_schema>"."<target_table>" AS analyzed_table
GROUP BY time_period, time_period_utc
ORDER BY time_period, time_period_utc
SQL Server
{% import '/dialects/sqlserver.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if time_series is not none -%}
{{- lib.eol() -}}
{{- indentation -}}{{- lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{- indentation -}}CAST(({{- lib.render_time_dimension_expression(table_alias_prefix) }}) AS DATETIME)
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if data_groupings is not none and (data_groupings | length()) > 0 -%}
{%- for attribute in data_groupings -%}
{%- with data_grouping_level = data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{- lib.eol() -}}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{- lib.eol() -}}{{ indentation }}{{ table_alias_prefix }}.{{ quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT({{ parameters.percentile_value }})
WITHIN GROUP (ORDER BY {{ lib.render_target_column('analyzed_table')}})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
ORDER BY nested_table.[time_period], nested_table.[time_period_utc] {{- lib.render_data_grouping_projections('analyzed_table') }}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.[time_period] AS time_period,
nested_table.[time_period_utc] AS time_period_utc
FROM(
SELECT
PERCENTILE_CONT(0.75)
WITHIN GROUP (ORDER BY analyzed_table.[target_column])
OVER (PARTITION BY
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0),
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME)
) AS actual_value,
DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0) AS time_period,
CAST((DATEADD(month, DATEDIFF(month, 0, SYSDATETIMEOFFSET()), 0)) AS DATETIME) AS time_period_utc
FROM [your_sql_server_database].[<target_schema>].[<target_table>] AS analyzed_table) AS nested_table
GROUP BY nested_table.[time_period], nested_table.[time_period_utc]
ORDER BY nested_table.[time_period], nested_table.[time_period_utc]
Configuration with data grouping
Sample configuration (Yaml)
# yaml-language-server: $schema=https://cloud.dqo.ai/dqo-yaml-schema/TableYaml-schema.json
apiVersion: dqo/v1
kind: table
spec:
timestamp_columns:
event_timestamp_column: col_event_timestamp
ingestion_timestamp_column: col_inserted_at
incremental_time_window:
daily_partitioning_recent_days: 7
monthly_partitioning_recent_months: 1
default_grouping_name: group_by_country_and_state
groupings:
group_by_country_and_state:
level_1:
source: column_value
column: country
level_2:
source: column_value
column: state
columns:
target_column:
profiling_checks:
numeric:
profile_percentile_75_in_range:
parameters:
percentile_value: 0.75
warning:
from: 10.0
to: 20.5
error:
from: 10.0
to: 20.5
fatal:
from: 10.0
to: 20.5
labels:
- This is the column that is analyzed for data quality issues
col_event_timestamp:
labels:
- optional column that stores the timestamp when the event/transaction happened
col_inserted_at:
labels:
- optional column that stores the timestamp when row was ingested
country:
labels:
- column used as the first grouping key
state:
labels:
- column used as the second grouping key
{% import '/dialects/bigquery.sql.jinja2' as lib with context -%}
{%- macro render_local_time_dimension_projection(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.time_series is not none -%}
{{- lib.eol() -}}
{{ indentation }}{{ lib.render_time_dimension_expression(table_alias_prefix) }},{{ lib.eol() -}}
{{ indentation }}TIMESTAMP({{ lib.render_time_dimension_expression(table_alias_prefix) }})
{%- endif -%}
{%- endmacro -%}
{%- macro render_local_data_grouping_projections(table_alias_prefix = 'analyzed_table', indentation = ' ') -%}
{%- if lib.data_groupings is not none and (lib.data_groupings | length()) > 0 -%}
{%- for attribute in lib.data_groupings -%}
{%- with data_grouping_level = lib.data_groupings[attribute] -%}
{%- if data_grouping_level.source == 'tag' -%}
{{ lib.eol() }}{{ indentation }}{{ lib.make_text_constant(data_grouping_level.tag) }}
{%- elif data_grouping_level.source == 'column_value' -%}
{{ lib.eol() }}{{ indentation }}{{ table_alias_prefix }}.{{ lib.quote_identifier(data_grouping_level.column) }}
{%- endif -%}
{%- endwith %} AS grouping_{{ attribute }}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
SELECT
MAX(nested_table.actual_value) AS actual_value,
nested_table.`time_period` AS time_period,
nested_table.`time_period_utc` AS time_period_utc
{{- lib.render_data_grouping_projections('analyzed_table') }}
FROM(
SELECT
PERCENTILE_CONT(
({{ lib.render_target_column('analyzed_table')}}),
{{ parameters.percentile_value }})
OVER (PARTITION BY
{{render_local_time_dimension_projection('analyzed_table')}}
{{render_local_data_grouping_projections('analyzed_table') }}
) AS actual_value
{{- lib.render_time_dimension_projection('analyzed_table') }}
FROM {{ lib.render_target_table() }} AS analyzed_table
{{- lib.render_where_clause(indentation = ' ') -}}) AS nested_table
{{- lib.render_group_by() -}}
{{- lib.render_order_by() -}}