Files
gh-raw-labs-claude-code-mar…/skills/mxcp-expert/references/synthetic-data-patterns.md
2025-11-30 08:49:50 +08:00

17 KiB

Synthetic Data Generation Patterns

Guide for creating synthetic data in DuckDB and MXCP for testing, demos, and development.

Overview

Synthetic data is useful for:

  • Testing - Validate tools without real data
  • Demos - Show functionality with realistic-looking data
  • Development - Build endpoints before real data is available
  • Privacy - Mask or replace sensitive data
  • Performance testing - Generate large datasets

DuckDB Synthetic Data Functions

GENERATE_SERIES

Create sequences of numbers or dates:

-- Generate 1000 rows with sequential IDs
SELECT * FROM GENERATE_SERIES(1, 1000) AS t(id)

-- Generate date range
SELECT * FROM GENERATE_SERIES(
  DATE '2024-01-01',
  DATE '2024-12-31',
  INTERVAL '1 day'
) AS t(date)

-- Generate timestamp range (hourly)
SELECT * FROM GENERATE_SERIES(
  TIMESTAMP '2024-01-01 00:00:00',
  TIMESTAMP '2024-01-31 23:59:59',
  INTERVAL '1 hour'
) AS t(timestamp)

Random Functions

Generate random values:

-- Random integer between 1 and 100
SELECT FLOOR(RANDOM() * 100 + 1)::INTEGER AS random_int

-- Random float between 0 and 1
SELECT RANDOM() AS random_float

-- Random UUID
SELECT UUID() AS id

-- Random boolean
SELECT RANDOM() < 0.5 AS random_bool

-- Random element from array
SELECT LIST_ELEMENT(['A', 'B', 'C'], FLOOR(RANDOM() * 3 + 1)::INTEGER) AS random_choice

String Generation

-- Random string from characters
SELECT
  'USER_' || UUID() AS user_id,
  'user' || FLOOR(RANDOM() * 10000)::INTEGER || '@example.com' AS email,
  LIST_ELEMENT(['John', 'Jane', 'Alice', 'Bob'], FLOOR(RANDOM() * 4 + 1)::INTEGER) AS first_name,
  LIST_ELEMENT(['Smith', 'Doe', 'Johnson', 'Williams'], FLOOR(RANDOM() * 4 + 1)::INTEGER) AS last_name

Common Synthetic Data Patterns

Pattern 1: Customer Records

-- Generate 1000 synthetic customers
CREATE TABLE customers AS
SELECT
  ROW_NUMBER() OVER () AS customer_id,
  'CUST_' || UUID() AS customer_code,
  first_name || ' ' || last_name AS full_name,
  LOWER(first_name) || '.' || LOWER(last_name) || '@example.com' AS email,
  CASE
    WHEN RANDOM() < 0.3 THEN 'bronze'
    WHEN RANDOM() < 0.7 THEN 'silver'
    ELSE 'gold'
  END AS tier,
  DATE '2020-01-01' + (RANDOM() * 1460)::INTEGER * INTERVAL '1 day' AS signup_date,
  FLOOR(RANDOM() * 100000 + 10000)::INTEGER / 100.0 AS lifetime_value,
  RANDOM() < 0.9 AS is_active
FROM GENERATE_SERIES(1, 1000) AS t(id)
CROSS JOIN (
  SELECT unnest(['John', 'Jane', 'Alice', 'Bob', 'Charlie', 'Diana']) AS first_name
) AS names1
CROSS JOIN (
  SELECT unnest(['Smith', 'Doe', 'Johnson', 'Williams', 'Brown', 'Jones']) AS last_name
) AS names2
LIMIT 1000;

Pattern 2: Transaction/Sales Data

-- Generate 10,000 synthetic transactions
CREATE TABLE transactions AS
SELECT
  ROW_NUMBER() OVER (ORDER BY transaction_date) AS transaction_id,
  'TXN_' || UUID() AS transaction_code,
  FLOOR(RANDOM() * 1000 + 1)::INTEGER AS customer_id,
  transaction_date,
  FLOOR(RANDOM() * 50000 + 1000)::INTEGER / 100.0 AS amount,
  LIST_ELEMENT(['credit_card', 'debit_card', 'bank_transfer', 'paypal'], FLOOR(RANDOM() * 4 + 1)::INTEGER) AS payment_method,
  LIST_ELEMENT(['completed', 'pending', 'failed'], FLOOR(RANDOM() * 10 + 1)::INTEGER) AS status,
  LIST_ELEMENT(['electronics', 'clothing', 'food', 'books', 'home'], FLOOR(RANDOM() * 5 + 1)::INTEGER) AS category
FROM GENERATE_SERIES(
  TIMESTAMP '2024-01-01 00:00:00',
  TIMESTAMP '2024-12-31 23:59:59',
  INTERVAL '52 minutes'  -- Roughly 10k records over a year
) AS t(transaction_date);

Pattern 3: Time Series Data

-- Generate hourly metrics for a year
CREATE TABLE metrics AS
SELECT
  timestamp,
  -- Simulated daily pattern (peak at 2pm)
  50 + 30 * SIN(2 * PI() * EXTRACT(hour FROM timestamp) / 24 - PI()/2) + RANDOM() * 20 AS requests_per_min,
  -- Random response time between 50-500ms
  FLOOR(RANDOM() * 450 + 50)::INTEGER AS avg_response_ms,
  -- Error rate 0-5%
  RANDOM() * 5 AS error_rate,
  -- Random CPU usage
  FLOOR(RANDOM() * 60 + 20)::INTEGER AS cpu_usage_pct
FROM GENERATE_SERIES(
  TIMESTAMP '2024-01-01 00:00:00',
  TIMESTAMP '2024-12-31 23:59:59',
  INTERVAL '1 hour'
) AS t(timestamp);

Pattern 4: Relational Data with Foreign Keys

-- Create related tables: Users → Orders → Order Items

-- Users
CREATE TABLE users AS
SELECT
  user_id,
  'user' || user_id || '@example.com' AS email,
  DATE '2020-01-01' + (RANDOM() * 1460)::INTEGER * INTERVAL '1 day' AS created_at
FROM GENERATE_SERIES(1, 100) AS t(user_id);

-- Orders
CREATE TABLE orders AS
SELECT
  order_id,
  FLOOR(RANDOM() * 100 + 1)::INTEGER AS user_id,  -- FK to users
  order_date,
  LIST_ELEMENT(['pending', 'shipped', 'delivered'], FLOOR(RANDOM() * 3 + 1)::INTEGER) AS status
FROM GENERATE_SERIES(1, 500) AS t(order_id)
CROSS JOIN (
  SELECT DATE '2024-01-01' + (RANDOM() * 365)::INTEGER * INTERVAL '1 day' AS order_date
) AS dates;

-- Order Items
CREATE TABLE order_items AS
SELECT
  ROW_NUMBER() OVER () AS item_id,
  order_id,
  'PRODUCT_' || FLOOR(RANDOM() * 50 + 1)::INTEGER AS product_id,
  FLOOR(RANDOM() * 5 + 1)::INTEGER AS quantity,
  FLOOR(RANDOM() * 20000 + 500)::INTEGER / 100.0 AS price
FROM orders
CROSS JOIN GENERATE_SERIES(1, FLOOR(RANDOM() * 5 + 1)::INTEGER) AS t(n);

Pattern 5: Geographic Data

-- Generate synthetic locations
CREATE TABLE locations AS
SELECT
  location_id,
  LIST_ELEMENT(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], FLOOR(RANDOM() * 5 + 1)::INTEGER) AS city,
  LIST_ELEMENT(['NY', 'CA', 'IL', 'TX', 'AZ'], FLOOR(RANDOM() * 5 + 1)::INTEGER) AS state,
  -- Random US ZIP code
  LPAD(FLOOR(RANDOM() * 99999)::INTEGER::VARCHAR, 5, '0') AS zip_code,
  -- Random coordinates (simplified for demo)
  ROUND((RANDOM() * 50 + 25)::DECIMAL, 6) AS latitude,
  ROUND((RANDOM() * 60 - 125)::DECIMAL, 6) AS longitude
FROM GENERATE_SERIES(1, 200) AS t(location_id);

MXCP Integration Patterns

Pattern 1: dbt Model for Synthetic Data

Use case: Generate test data that persists across runs

-- models/synthetic_customers.sql
{{ config(materialized='table') }}

WITH name_options AS (
  SELECT unnest(['John', 'Jane', 'Alice', 'Bob', 'Charlie']) AS first_name
), surname_options AS (
  SELECT unnest(['Smith', 'Doe', 'Johnson', 'Brown']) AS last_name
)
SELECT
  ROW_NUMBER() OVER () AS customer_id,
  first_name || ' ' || last_name AS full_name,
  LOWER(first_name) || '.' || LOWER(last_name) || '@example.com' AS email,
  DATE '2020-01-01' + (RANDOM() * 1000)::INTEGER * INTERVAL '1 day' AS signup_date
FROM name_options
CROSS JOIN surname_options
CROSS JOIN GENERATE_SERIES(1, 50)  -- 5 * 4 * 50 = 1000 customers
# models/schema.yml
version: 2

models:
  - name: synthetic_customers
    description: "Synthetic customer data for testing"
    columns:
      - name: customer_id
        tests: [unique, not_null]
      - name: email
        tests: [unique, not_null]

Build and query:

dbt run --select synthetic_customers
# tools/query_test_customers.yml
mxcp: 1
tool:
  name: query_test_customers
  description: "Query synthetic customer data"
  return:
    type: array
  source:
    code: |
      SELECT * FROM synthetic_customers LIMIT 100

Pattern 2: Python Tool for Dynamic Generation

Use case: Generate data on-the-fly based on parameters

# python/data_generator.py
from mxcp.runtime import db
import uuid
from datetime import datetime, timedelta
import random

def generate_transactions(
    count: int = 100,
    start_date: str = "2024-01-01",
    end_date: str = "2024-12-31"
) -> dict:
    """Generate synthetic transaction data"""

    # Create temporary table
    table_name = f"temp_transactions_{uuid.uuid4().hex[:8]}"

    # Parse dates
    start = datetime.fromisoformat(start_date)
    end = datetime.fromisoformat(end_date)
    date_range = (end - start).days

    db.execute(f"""
        CREATE TABLE {table_name} AS
        SELECT
          ROW_NUMBER() OVER () AS id,
          DATE '{start_date}' + (RANDOM() * {date_range})::INTEGER * INTERVAL '1 day' AS transaction_date,
          FLOOR(RANDOM() * 100000 + 1000)::INTEGER / 100.0 AS amount,
          LIST_ELEMENT(['completed', 'pending', 'failed'], FLOOR(RANDOM() * 10 + 1)::INTEGER) AS status
        FROM GENERATE_SERIES(1, {count})
    """)

    # Get sample
    sample = db.execute(f"SELECT * FROM {table_name} LIMIT 10").fetchall()

    return {
        "table_name": table_name,
        "rows_generated": count,
        "sample": sample,
        "query_hint": f"SELECT * FROM {table_name}"
    }

def generate_customers(count: int = 100) -> dict:
    """Generate synthetic customer records"""

    table_name = f"temp_customers_{uuid.uuid4().hex[:8]}"

    first_names = ['John', 'Jane', 'Alice', 'Bob', 'Charlie', 'Diana', 'Eve', 'Frank']
    last_names = ['Smith', 'Doe', 'Johnson', 'Williams', 'Brown', 'Jones', 'Miller']
    tiers = ['bronze', 'silver', 'gold', 'platinum']

    db.execute(f"""
        CREATE TABLE {table_name} AS
        WITH names AS (
          SELECT
            unnest({first_names}) AS first_name,
            unnest({last_names}) AS last_name
        )
        SELECT
          ROW_NUMBER() OVER () AS customer_id,
          first_name || ' ' || last_name AS full_name,
          LOWER(first_name) || '.' || LOWER(last_name) || FLOOR(RANDOM() * 1000)::INTEGER || '@example.com' AS email,
          LIST_ELEMENT({tiers}, FLOOR(RANDOM() * {len(tiers)} + 1)::INTEGER) AS tier,
          DATE '2020-01-01' + (RANDOM() * 1460)::INTEGER * INTERVAL '1 day' AS created_at
        FROM names
        CROSS JOIN GENERATE_SERIES(1, CEIL({count} / (SELECT COUNT(*) FROM names))::INTEGER)
        LIMIT {count}
    """)

    stats = db.execute(f"""
        SELECT
          COUNT(*) as total,
          COUNT(DISTINCT tier) as tiers,
          MIN(created_at) as earliest,
          MAX(created_at) as latest
        FROM {table_name}
    """).fetchone()

    return {
        "table_name": table_name,
        "rows_generated": stats["total"],
        "statistics": dict(stats),
        "query_hint": f"SELECT * FROM {table_name}"
    }
# tools/generate_test_data.yml
mxcp: 1
tool:
  name: generate_test_data
  description: "Generate synthetic data for testing"
  language: python
  parameters:
    - name: data_type
      type: string
      examples: ["transactions", "customers"]
    - name: count
      type: integer
      default: 100
  return:
    type: object
  source:
    file: ../python/data_generator.py
    function: |
      if data_type == "transactions":
          return generate_transactions(count)
      elif data_type == "customers":
          return generate_customers(count)
      else:
          raise ValueError(f"Unknown data_type: {data_type}")

Pattern 3: Statistics Tool for Synthetic Data

Use case: Generate data and immediately calculate statistics

# tools/synthetic_analytics.yml
mxcp: 1
tool:
  name: synthetic_analytics
  description: "Generate synthetic sales data and calculate statistics"
  language: python
  parameters:
    - name: days
      type: integer
      default: 365
    - name: transactions_per_day
      type: integer
      default: 100
  return:
    type: object
    properties:
      daily_stats: { type: array }
      overall_stats: { type: object }
  source:
    code: |
      from mxcp.runtime import db

      total = days * transactions_per_day

      # Generate data
      db.execute(f"""
        CREATE OR REPLACE TEMP TABLE temp_sales AS
        SELECT
          DATE '2024-01-01' + (RANDOM() * {days})::INTEGER * INTERVAL '1 day' AS sale_date,
          FLOOR(RANDOM() * 50000 + 1000)::INTEGER / 100.0 AS amount,
          LIST_ELEMENT(['online', 'retail', 'wholesale'], FLOOR(RANDOM() * 3 + 1)::INTEGER) AS channel
        FROM GENERATE_SERIES(1, {total})
      """)

      # Calculate statistics
      daily_stats = db.execute("""
        SELECT
          sale_date,
          COUNT(*) as transactions,
          SUM(amount) as total_sales,
          AVG(amount) as avg_sale
        FROM temp_sales
        GROUP BY sale_date
        ORDER BY sale_date
      """).fetchall()

      overall = db.execute("""
        SELECT
          COUNT(*) as total_transactions,
          SUM(amount) as total_revenue,
          AVG(amount) as avg_transaction,
          MIN(amount) as min_transaction,
          MAX(amount) as max_transaction,
          STDDEV(amount) as std_dev
        FROM temp_sales
      """).fetchone()

      return {
        "daily_stats": daily_stats,
        "overall_stats": dict(overall)
      }

Advanced Patterns

Realistic Distributions

Normal distribution (for things like heights, test scores):

-- Box-Muller transform for normal distribution
SELECT
  SQRT(-2 * LN(RANDOM())) * COS(2 * PI() * RANDOM()) * 15 + 100 AS iq_score
FROM GENERATE_SERIES(1, 1000)

Power law distribution (for things like city populations):

SELECT
  FLOOR(POWER(RANDOM(), -0.5) * 1000)::INTEGER AS followers
FROM GENERATE_SERIES(1, 1000)

Seasonal patterns:

-- Sales with seasonal pattern (peak in Dec, low in Feb)
SELECT
  date,
  -- Base level + seasonal component + random noise
  1000 + 500 * SIN(2 * PI() * EXTRACT(month FROM date) / 12 - PI()/2) + RANDOM() * 200 AS daily_sales
FROM GENERATE_SERIES(DATE '2024-01-01', DATE '2024-12-31', INTERVAL '1 day') AS t(date)

Data Masking/Anonymization

Replace real data with synthetic:

-- Anonymize customer data
CREATE TABLE customers_anonymized AS
SELECT
  customer_id,  -- Keep ID for joins
  'USER_' || customer_id || '@example.com' AS email,  -- Fake email
  LIST_ELEMENT(['John', 'Jane', 'Alice', 'Bob'], (customer_id % 4) + 1) AS first_name,  -- Fake name
  LEFT(phone, 3) || '-XXX-XXXX' AS masked_phone,  -- Mask phone
  FLOOR(age / 10) * 10 AS age_bucket  -- Generalize age
FROM customers_real;

Complete Example: Synthetic Analytics Server

Scenario: Demo server with synthetic e-commerce data

# Project structure
synthetic-analytics/
├── mxcp-site.yml
├── models/
│   ├── synthetic_customers.sql
│   ├── synthetic_orders.sql
│   └── schema.yml
├── python/
│   └── generators.py
└── tools/
    ├── generate_data.yml
    ├── customer_analytics.yml
    └── sales_trends.yml
-- models/synthetic_customers.sql
{{ config(materialized='table') }}

SELECT
  customer_id,
  'customer' || customer_id || '@example.com' AS email,
  LIST_ELEMENT(['bronze', 'silver', 'gold'], (customer_id % 3) + 1) AS tier,
  DATE '2020-01-01' + (RANDOM() * 1000)::INTEGER * INTERVAL '1 day' AS signup_date
FROM GENERATE_SERIES(1, 500) AS t(customer_id)
-- models/synthetic_orders.sql
{{ config(materialized='table') }}

SELECT
  order_id,
  FLOOR(RANDOM() * 500 + 1)::INTEGER AS customer_id,
  order_date,
  FLOOR(RANDOM() * 100000 + 1000)::INTEGER / 100.0 AS amount,
  LIST_ELEMENT(['completed', 'shipped', 'pending'], FLOOR(RANDOM() * 3 + 1)::INTEGER) AS status
FROM GENERATE_SERIES(1, 5000) AS t(order_id)
CROSS JOIN (
  SELECT DATE '2024-01-01' + (RANDOM() * 365)::INTEGER * INTERVAL '1 day' AS order_date
) AS dates
# tools/customer_analytics.yml
mxcp: 1
tool:
  name: customer_analytics
  description: "Get customer analytics from synthetic data"
  parameters:
    - name: tier
      type: string
      required: false
  return:
    type: array
  source:
    code: |
      SELECT
        c.tier,
        COUNT(DISTINCT c.customer_id) as customers,
        COUNT(o.order_id) as total_orders,
        SUM(o.amount) as total_revenue,
        AVG(o.amount) as avg_order_value
      FROM synthetic_customers c
      LEFT JOIN synthetic_orders o ON c.customer_id = o.customer_id
      WHERE $tier IS NULL OR c.tier = $tier
      GROUP BY c.tier
      ORDER BY total_revenue DESC

Best Practices

  1. Use dbt for persistent data: Synthetic data that should be consistent across queries
  2. Use Python for dynamic data: Data that changes based on parameters
  3. Seed random number generator: For reproducible results, use SETSEED() in DuckDB
  4. Realistic distributions: Use appropriate statistical distributions
  5. Maintain referential integrity: Ensure foreign keys match
  6. Add noise: Real data isn't perfectly distributed, add randomness
  7. Document data generation: Explain how synthetic data was created
  8. Test with synthetic first: Validate tools before using real data

Summary

For synthetic data in MXCP:

  1. DuckDB patterns: GENERATE_SERIES, RANDOM(), LIST_ELEMENT(), UUID()
  2. dbt models: For persistent, version-controlled synthetic data
  3. Python tools: For dynamic generation based on parameters
  4. Statistics: Generate data → calculate metrics in one tool
  5. Testing: Use synthetic data to test tools before real data
  6. Privacy: Anonymize real data by generating synthetic replacements