Define and run data-quality tests inside DuckDB
Maintainer(s):
vhe74
Installing and Loading
INSTALL dqtest FROM community;
LOAD dqtest;
Example
-- Initialize the data quality testing framework
CALL dq_init();
-- Create a sample table
CREATE TABLE customers (
id INTEGER PRIMARY KEY,
name VARCHAR,
email VARCHAR,
status VARCHAR,
age INTEGER
);
CREATE TABLE orders (
order_id INTEGER PRIMARY KEY,
customer_id INTEGER,
amount DECIMAL(10,2),
status VARCHAR
);
-- Insert sample data
INSERT INTO customers VALUES
(1, 'Alice', '[email protected]', 'active', 25),
(2, 'Bob', NULL, 'active', 30),
(3, 'Charlie', '[email protected]', 'inactive', 35);
INSERT INTO orders VALUES
(1, 1, 100.00, 'pending'),
(2, 1, 200.00, 'shipped'),
(3, 2, 150.00, 'delivered'),
(4, 99, 50.00, 'pending');
-- Define and run data quality tests
-- To pass a test should not return any lines
-- (Test definition syntax depends on your implementation)
INSERT INTO dq_tests (test_name, table_name, column_name, test_type)
VALUES ('customers_id_unique', 'customers', 'id', 'unique');
INSERT INTO dq_tests (test_name, table_name, column_name, test_type)
VALUES ('customers_email_not_null', 'customers', 'email', 'not_null');
INSERT INTO dq_tests (test_name, table_name, column_name, test_type, test_params)
VALUES ('customers_status_valid', 'customers', 'status', 'accepted_values',
'{"values": ["active", "inactive", "suspended"]}');
INSERT INTO dq_tests (test_name, table_name, column_name, test_type, test_params)
VALUES ('customers_email_format', 'customers', 'email', 'regex',
'{"pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\\\.[a-zA-Z]{2,}$"}');
INSERT INTO dq_tests (test_name, table_name, column_name, test_type, test_params)
VALUES ('customers_age_range', 'customers', 'age', 'range',
'{"min": 18, "max": 100}');
INSERT INTO dq_tests (test_name, table_name, column_name, test_type, test_params)
VALUES ('orders_customer_fk', 'orders', 'customer_id', 'relationship',
'{"to_table": "customers", "to_column": "id"}');
INSERT INTO dq_tests (test_name, table_name, test_type, test_params)
VALUES ('customers_min_rows', 'customers', 'row_count',
'{"min": 1, "max": 1000}');
INSERT INTO dq_tests (test_name, table_name, test_type, test_params)
VALUES ('orders_orphan_check', 'orders', 'custom_sql',
'{"sql": "SELECT * FROM {table} WHERE customer_id NOT IN (SELECT id FROM customers)"}');
-- Run all tests
CALL dq_run_tests();
-- View test results
SELECT t.test_name, r.status, r.executed_at
FROM dq_tests t
LEFT JOIN dq_test_results r ON t.test_id = r.test_id;
-- View test results of the last execution
SELECT t.test_name, r.status, r.executed_at
FROM dq_tests t
LEFT JOIN dq_test_results r ON t.test_id = r.test_id
QUALIFY ROW_NUMBER() OVER (PARTITION BY t.test_name ORDER BY r.executed_at DESC NULLS LAST) = 1;
-- view all compiled sql of the last execution
SELECT t.test_name, r.status, r.compiled_sql FROM dq_tests t
LEFT JOIN dq_test_results r ON t.test_id = r.test_id
QUALIFY ROW_NUMBER() OVER (PARTITION BY t.test_name ORDER BY r.executed_at DESC NULLS LAST) = 1;
Added Functions
| function_name | function_type | description | comment | examples |
|---|---|---|---|---|
| dq_init | table | NULL | NULL | |
| dq_run_tests | table | NULL | NULL |