Search Shortcut cmd + k | ctrl + k
dqtest

Define and run data-quality tests inside DuckDB

Maintainer(s): vhe74

Installing and Loading

INSTALL dqtest FROM community;
LOAD dqtest;

Example

-- Initialize the data quality testing framework
CALL dq_init();

-- Create a sample table
CREATE TABLE customers (
    id INTEGER PRIMARY KEY,
    name VARCHAR,
    email VARCHAR,
    status VARCHAR,
    age INTEGER
);

CREATE TABLE orders (
    order_id INTEGER PRIMARY KEY,
    customer_id INTEGER,
    amount DECIMAL(10,2),
    status VARCHAR
);

-- Insert sample data
INSERT INTO customers VALUES
    (1, 'Alice', '[email protected]', 'active', 25),
    (2, 'Bob', NULL, 'active', 30),
    (3, 'Charlie', '[email protected]', 'inactive', 35);

INSERT INTO orders VALUES
    (1, 1, 100.00, 'pending'),
    (2, 1, 200.00, 'shipped'),
    (3, 2, 150.00, 'delivered'),
    (4, 99, 50.00, 'pending');

-- Define and run data quality tests
-- To pass a test should not return any lines
-- (Test definition syntax depends on your implementation)
INSERT INTO dq_tests (test_name, table_name, column_name, test_type)
VALUES ('customers_id_unique', 'customers', 'id', 'unique');

INSERT INTO dq_tests (test_name, table_name, column_name, test_type)
  VALUES ('customers_email_not_null', 'customers', 'email', 'not_null');

INSERT INTO dq_tests (test_name, table_name, column_name, test_type, test_params)
VALUES ('customers_status_valid', 'customers', 'status', 'accepted_values',
        '{"values": ["active", "inactive", "suspended"]}');

INSERT INTO dq_tests (test_name, table_name, column_name, test_type, test_params)
VALUES ('customers_email_format', 'customers', 'email', 'regex',
        '{"pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\\\.[a-zA-Z]{2,}$"}');

INSERT INTO dq_tests (test_name, table_name, column_name, test_type, test_params)
VALUES ('customers_age_range', 'customers', 'age', 'range',
        '{"min": 18, "max": 100}');

INSERT INTO dq_tests (test_name, table_name, column_name, test_type, test_params)
VALUES ('orders_customer_fk', 'orders', 'customer_id', 'relationship',
        '{"to_table": "customers", "to_column": "id"}');

INSERT INTO dq_tests (test_name, table_name, test_type, test_params)
VALUES ('customers_min_rows', 'customers', 'row_count',
        '{"min": 1, "max": 1000}');

INSERT INTO dq_tests (test_name, table_name, test_type, test_params)
VALUES ('orders_orphan_check', 'orders', 'custom_sql',
        '{"sql": "SELECT * FROM {table} WHERE customer_id NOT IN (SELECT id FROM customers)"}');

-- Run all tests
CALL dq_run_tests();

-- View test results
SELECT t.test_name, r.status, r.executed_at
  FROM dq_tests t 
  LEFT JOIN dq_test_results r ON t.test_id = r.test_id;

-- View test results of the last execution 
SELECT t.test_name, r.status, r.executed_at
  FROM dq_tests t 
  LEFT JOIN dq_test_results r ON t.test_id = r.test_id
  QUALIFY ROW_NUMBER() OVER (PARTITION BY t.test_name ORDER BY r.executed_at DESC NULLS LAST) = 1;

-- view all compiled sql of the last execution 
SELECT t.test_name, r.status, r.compiled_sql  FROM dq_tests t 
  LEFT JOIN dq_test_results r ON t.test_id = r.test_id
  QUALIFY ROW_NUMBER() OVER (PARTITION BY t.test_name ORDER BY r.executed_at DESC NULLS LAST) = 1;

Added Functions

function_name function_type description comment examples
dq_init table NULL NULL  
dq_run_tests table NULL NULL  

Overloaded Functions

This extension does not add any function overloads.

Added Types

This extension does not add any types.

Added Settings

This extension does not add any settings.