Search Shortcut cmd + k | ctrl + k
dqtest

Define and run data-quality tests inside DuckDB

Maintainer(s): vhe74

Installing and Loading

INSTALL dqtest FROM community;
LOAD dqtest;

Example

-- Initialize the data quality testing framework
CALL dq_init();

-- Create a sample table
CREATE TABLE customers (
    id INTEGER PRIMARY KEY,
    name VARCHAR,
    email VARCHAR,
    status VARCHAR,
    age INTEGER
);

CREATE TABLE orders (
    order_id INTEGER PRIMARY KEY,
    customer_id INTEGER,
    amount DECIMAL(10,2),
    status VARCHAR
);

-- Insert sample data
INSERT INTO customers VALUES
    (1, 'Alice', '[email protected]', 'active', 25),
    (2, 'Bob', NULL, 'active', 30),
    (3, 'Charlie', '[email protected]', 'inactive', 35);

INSERT INTO orders VALUES
    (1, 1, 100.00, 'pending'),
    (2, 1, 200.00, 'shipped'),
    (3, 2, 150.00, 'delivered'),
    (4, 99, 50.00, 'pending');

-- Define and run data quality tests
-- To pass a test should not return any lines
-- (Test definition syntax depends on your implementation)
INSERT INTO dq_tests (test_name, table_name, column_name, test_type)
VALUES ('customers_id_unique', 'customers', 'id', 'unique');

INSERT INTO dq_tests (test_name, table_name, column_name, test_type)
  VALUES ('customers_email_not_null', 'customers', 'email', 'not_null');

INSERT INTO dq_tests (test_name, table_name, column_name, test_type, test_params)
VALUES ('customers_status_valid', 'customers', 'status', 'accepted_values',
        '{"values": ["active", "inactive", "suspended"]}');

INSERT INTO dq_tests (test_name, table_name, column_name, test_type, test_params)
VALUES ('customers_email_format', 'customers', 'email', 'regex',
        '{"pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\\\.[a-zA-Z]{2,}$"}');

INSERT INTO dq_tests (test_name, table_name, column_name, test_type, test_params)
VALUES ('customers_age_range', 'customers', 'age', 'range',
        '{"min": 18, "max": 100}');

INSERT INTO dq_tests (test_name, table_name, column_name, test_type, test_params)
VALUES ('orders_customer_fk', 'orders', 'customer_id', 'relationship',
        '{"to_table": "customers", "to_column": "id"}');

INSERT INTO dq_tests (test_name, table_name, test_type, test_params)
VALUES ('customers_min_rows', 'customers', 'row_count',
        '{"min": 1, "max": 1000}');

INSERT INTO dq_tests (test_name, table_name, test_type, test_params)
VALUES ('orders_orphan_check', 'orders', 'custom_sql',
        '{"sql": "SELECT * FROM {table} WHERE customer_id NOT IN (SELECT id FROM customers)"}');

-- Run all tests
CALL dq_run_tests();

-- View test results
SELECT t.test_name, r.status, r.executed_at
  FROM dq_tests t 
  LEFT JOIN dq_test_results r ON t.test_id = r.test_id;

-- View test results of the last execution 
SELECT t.test_name, r.status, r.executed_at
  FROM dq_tests t 
  LEFT JOIN dq_test_results r ON t.test_id = r.test_id
  QUALIFY ROW_NUMBER() OVER (PARTITION BY t.test_name ORDER BY r.executed_at DESC NULLS LAST) = 1;

-- view all compiled sql of the last execution 
SELECT t.test_name, r.status, r.compiled_sql  FROM dq_tests t 
  LEFT JOIN dq_test_results r ON t.test_id = r.test_id
  QUALIFY ROW_NUMBER() OVER (PARTITION BY t.test_name ORDER BY r.executed_at DESC NULLS LAST) = 1;

Added Functions

function_name function_type description comment examples
dq_init table NULL NULL  
dq_run_tests table NULL NULL