Initial commit
This commit is contained in:
579
skills/mxcp-expert/references/synthetic-data-patterns.md
Normal file
579
skills/mxcp-expert/references/synthetic-data-patterns.md
Normal file
@@ -0,0 +1,579 @@
|
||||
# Synthetic Data Generation Patterns
|
||||
|
||||
Guide for creating synthetic data in DuckDB and MXCP for testing, demos, and development.
|
||||
|
||||
## Overview
|
||||
|
||||
Synthetic data is useful for:
|
||||
- **Testing** - Validate tools without real data
|
||||
- **Demos** - Show functionality with realistic-looking data
|
||||
- **Development** - Build endpoints before real data is available
|
||||
- **Privacy** - Mask or replace sensitive data
|
||||
- **Performance testing** - Generate large datasets
|
||||
|
||||
## DuckDB Synthetic Data Functions
|
||||
|
||||
### GENERATE_SERIES
|
||||
|
||||
**Create sequences of numbers or dates**:
|
||||
|
||||
```sql
|
||||
-- Generate 1000 rows with sequential IDs
|
||||
SELECT * FROM GENERATE_SERIES(1, 1000) AS t(id)
|
||||
|
||||
-- Generate date range
|
||||
SELECT * FROM GENERATE_SERIES(
|
||||
DATE '2024-01-01',
|
||||
DATE '2024-12-31',
|
||||
INTERVAL '1 day'
|
||||
) AS t(date)
|
||||
|
||||
-- Generate timestamp range (hourly)
|
||||
SELECT * FROM GENERATE_SERIES(
|
||||
TIMESTAMP '2024-01-01 00:00:00',
|
||||
TIMESTAMP '2024-01-31 23:59:59',
|
||||
INTERVAL '1 hour'
|
||||
) AS t(timestamp)
|
||||
```
|
||||
|
||||
### Random Functions
|
||||
|
||||
**Generate random values**:
|
||||
|
||||
```sql
|
||||
-- Random integer between 1 and 100
|
||||
SELECT FLOOR(RANDOM() * 100 + 1)::INTEGER AS random_int
|
||||
|
||||
-- Random float between 0 and 1
|
||||
SELECT RANDOM() AS random_float
|
||||
|
||||
-- Random UUID
|
||||
SELECT UUID() AS id
|
||||
|
||||
-- Random boolean
|
||||
SELECT RANDOM() < 0.5 AS random_bool
|
||||
|
||||
-- Random element from array
|
||||
SELECT LIST_ELEMENT(['A', 'B', 'C'], FLOOR(RANDOM() * 3 + 1)::INTEGER) AS random_choice
|
||||
```
|
||||
|
||||
### String Generation
|
||||
|
||||
```sql
|
||||
-- Random string from characters
|
||||
SELECT
|
||||
'USER_' || UUID() AS user_id,
|
||||
'user' || FLOOR(RANDOM() * 10000)::INTEGER || '@example.com' AS email,
|
||||
LIST_ELEMENT(['John', 'Jane', 'Alice', 'Bob'], FLOOR(RANDOM() * 4 + 1)::INTEGER) AS first_name,
|
||||
LIST_ELEMENT(['Smith', 'Doe', 'Johnson', 'Williams'], FLOOR(RANDOM() * 4 + 1)::INTEGER) AS last_name
|
||||
```
|
||||
|
||||
## Common Synthetic Data Patterns
|
||||
|
||||
### Pattern 1: Customer Records
|
||||
|
||||
```sql
|
||||
-- Generate 1000 synthetic customers
|
||||
CREATE TABLE customers AS
|
||||
SELECT
|
||||
ROW_NUMBER() OVER () AS customer_id,
|
||||
'CUST_' || UUID() AS customer_code,
|
||||
first_name || ' ' || last_name AS full_name,
|
||||
LOWER(first_name) || '.' || LOWER(last_name) || '@example.com' AS email,
|
||||
CASE
|
||||
WHEN RANDOM() < 0.3 THEN 'bronze'
|
||||
WHEN RANDOM() < 0.7 THEN 'silver'
|
||||
ELSE 'gold'
|
||||
END AS tier,
|
||||
DATE '2020-01-01' + (RANDOM() * 1460)::INTEGER * INTERVAL '1 day' AS signup_date,
|
||||
FLOOR(RANDOM() * 100000 + 10000)::INTEGER / 100.0 AS lifetime_value,
|
||||
RANDOM() < 0.9 AS is_active
|
||||
FROM GENERATE_SERIES(1, 1000) AS t(id)
|
||||
CROSS JOIN (
|
||||
SELECT unnest(['John', 'Jane', 'Alice', 'Bob', 'Charlie', 'Diana']) AS first_name
|
||||
) AS names1
|
||||
CROSS JOIN (
|
||||
SELECT unnest(['Smith', 'Doe', 'Johnson', 'Williams', 'Brown', 'Jones']) AS last_name
|
||||
) AS names2
|
||||
LIMIT 1000;
|
||||
```
|
||||
|
||||
### Pattern 2: Transaction/Sales Data
|
||||
|
||||
```sql
|
||||
-- Generate 10,000 synthetic transactions
|
||||
CREATE TABLE transactions AS
|
||||
SELECT
|
||||
ROW_NUMBER() OVER (ORDER BY transaction_date) AS transaction_id,
|
||||
'TXN_' || UUID() AS transaction_code,
|
||||
FLOOR(RANDOM() * 1000 + 1)::INTEGER AS customer_id,
|
||||
transaction_date,
|
||||
FLOOR(RANDOM() * 50000 + 1000)::INTEGER / 100.0 AS amount,
|
||||
LIST_ELEMENT(['credit_card', 'debit_card', 'bank_transfer', 'paypal'], FLOOR(RANDOM() * 4 + 1)::INTEGER) AS payment_method,
|
||||
LIST_ELEMENT(['completed', 'pending', 'failed'], FLOOR(RANDOM() * 10 + 1)::INTEGER) AS status,
|
||||
LIST_ELEMENT(['electronics', 'clothing', 'food', 'books', 'home'], FLOOR(RANDOM() * 5 + 1)::INTEGER) AS category
|
||||
FROM GENERATE_SERIES(
|
||||
TIMESTAMP '2024-01-01 00:00:00',
|
||||
TIMESTAMP '2024-12-31 23:59:59',
|
||||
INTERVAL '52 minutes' -- Roughly 10k records over a year
|
||||
) AS t(transaction_date);
|
||||
```
|
||||
|
||||
### Pattern 3: Time Series Data
|
||||
|
||||
```sql
|
||||
-- Generate hourly metrics for a year
|
||||
CREATE TABLE metrics AS
|
||||
SELECT
|
||||
timestamp,
|
||||
-- Simulated daily pattern (peak at 2pm)
|
||||
50 + 30 * SIN(2 * PI() * EXTRACT(hour FROM timestamp) / 24 - PI()/2) + RANDOM() * 20 AS requests_per_min,
|
||||
-- Random response time between 50-500ms
|
||||
FLOOR(RANDOM() * 450 + 50)::INTEGER AS avg_response_ms,
|
||||
-- Error rate 0-5%
|
||||
RANDOM() * 5 AS error_rate,
|
||||
-- Random CPU usage
|
||||
FLOOR(RANDOM() * 60 + 20)::INTEGER AS cpu_usage_pct
|
||||
FROM GENERATE_SERIES(
|
||||
TIMESTAMP '2024-01-01 00:00:00',
|
||||
TIMESTAMP '2024-12-31 23:59:59',
|
||||
INTERVAL '1 hour'
|
||||
) AS t(timestamp);
|
||||
```
|
||||
|
||||
### Pattern 4: Relational Data with Foreign Keys
|
||||
|
||||
```sql
|
||||
-- Create related tables: Users → Orders → Order Items
|
||||
|
||||
-- Users
|
||||
CREATE TABLE users AS
|
||||
SELECT
|
||||
user_id,
|
||||
'user' || user_id || '@example.com' AS email,
|
||||
DATE '2020-01-01' + (RANDOM() * 1460)::INTEGER * INTERVAL '1 day' AS created_at
|
||||
FROM GENERATE_SERIES(1, 100) AS t(user_id);
|
||||
|
||||
-- Orders
|
||||
CREATE TABLE orders AS
|
||||
SELECT
|
||||
order_id,
|
||||
FLOOR(RANDOM() * 100 + 1)::INTEGER AS user_id, -- FK to users
|
||||
order_date,
|
||||
LIST_ELEMENT(['pending', 'shipped', 'delivered'], FLOOR(RANDOM() * 3 + 1)::INTEGER) AS status
|
||||
FROM GENERATE_SERIES(1, 500) AS t(order_id)
|
||||
CROSS JOIN (
|
||||
SELECT DATE '2024-01-01' + (RANDOM() * 365)::INTEGER * INTERVAL '1 day' AS order_date
|
||||
) AS dates;
|
||||
|
||||
-- Order Items
|
||||
CREATE TABLE order_items AS
|
||||
SELECT
|
||||
ROW_NUMBER() OVER () AS item_id,
|
||||
order_id,
|
||||
'PRODUCT_' || FLOOR(RANDOM() * 50 + 1)::INTEGER AS product_id,
|
||||
FLOOR(RANDOM() * 5 + 1)::INTEGER AS quantity,
|
||||
FLOOR(RANDOM() * 20000 + 500)::INTEGER / 100.0 AS price
|
||||
FROM orders
|
||||
CROSS JOIN GENERATE_SERIES(1, FLOOR(RANDOM() * 5 + 1)::INTEGER) AS t(n);
|
||||
```
|
||||
|
||||
### Pattern 5: Geographic Data
|
||||
|
||||
```sql
|
||||
-- Generate synthetic locations
|
||||
CREATE TABLE locations AS
|
||||
SELECT
|
||||
location_id,
|
||||
LIST_ELEMENT(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], FLOOR(RANDOM() * 5 + 1)::INTEGER) AS city,
|
||||
LIST_ELEMENT(['NY', 'CA', 'IL', 'TX', 'AZ'], FLOOR(RANDOM() * 5 + 1)::INTEGER) AS state,
|
||||
-- Random US ZIP code
|
||||
LPAD(FLOOR(RANDOM() * 99999)::INTEGER::VARCHAR, 5, '0') AS zip_code,
|
||||
-- Random coordinates (simplified for demo)
|
||||
ROUND((RANDOM() * 50 + 25)::DECIMAL, 6) AS latitude,
|
||||
ROUND((RANDOM() * 60 - 125)::DECIMAL, 6) AS longitude
|
||||
FROM GENERATE_SERIES(1, 200) AS t(location_id);
|
||||
```
|
||||
|
||||
## MXCP Integration Patterns
|
||||
|
||||
### Pattern 1: dbt Model for Synthetic Data
|
||||
|
||||
**Use case**: Generate test data that persists across runs
|
||||
|
||||
```sql
|
||||
-- models/synthetic_customers.sql
|
||||
{{ config(materialized='table') }}
|
||||
|
||||
WITH name_options AS (
|
||||
SELECT unnest(['John', 'Jane', 'Alice', 'Bob', 'Charlie']) AS first_name
|
||||
), surname_options AS (
|
||||
SELECT unnest(['Smith', 'Doe', 'Johnson', 'Brown']) AS last_name
|
||||
)
|
||||
SELECT
|
||||
ROW_NUMBER() OVER () AS customer_id,
|
||||
first_name || ' ' || last_name AS full_name,
|
||||
LOWER(first_name) || '.' || LOWER(last_name) || '@example.com' AS email,
|
||||
DATE '2020-01-01' + (RANDOM() * 1000)::INTEGER * INTERVAL '1 day' AS signup_date
|
||||
FROM name_options
|
||||
CROSS JOIN surname_options
|
||||
CROSS JOIN GENERATE_SERIES(1, 50) -- 5 * 4 * 50 = 1000 customers
|
||||
```
|
||||
|
||||
```yaml
|
||||
# models/schema.yml
|
||||
version: 2
|
||||
|
||||
models:
|
||||
- name: synthetic_customers
|
||||
description: "Synthetic customer data for testing"
|
||||
columns:
|
||||
- name: customer_id
|
||||
tests: [unique, not_null]
|
||||
- name: email
|
||||
tests: [unique, not_null]
|
||||
```
|
||||
|
||||
**Build and query**:
|
||||
```bash
|
||||
dbt run --select synthetic_customers
|
||||
```
|
||||
|
||||
```yaml
|
||||
# tools/query_test_customers.yml
|
||||
mxcp: 1
|
||||
tool:
|
||||
name: query_test_customers
|
||||
description: "Query synthetic customer data"
|
||||
return:
|
||||
type: array
|
||||
source:
|
||||
code: |
|
||||
SELECT * FROM synthetic_customers LIMIT 100
|
||||
```
|
||||
|
||||
### Pattern 2: Python Tool for Dynamic Generation
|
||||
|
||||
**Use case**: Generate data on-the-fly based on parameters
|
||||
|
||||
```python
|
||||
# python/data_generator.py
|
||||
from mxcp.runtime import db
|
||||
import uuid
|
||||
from datetime import datetime, timedelta
|
||||
import random
|
||||
|
||||
def generate_transactions(
|
||||
count: int = 100,
|
||||
start_date: str = "2024-01-01",
|
||||
end_date: str = "2024-12-31"
|
||||
) -> dict:
|
||||
"""Generate synthetic transaction data"""
|
||||
|
||||
# Create temporary table
|
||||
table_name = f"temp_transactions_{uuid.uuid4().hex[:8]}"
|
||||
|
||||
# Parse dates
|
||||
start = datetime.fromisoformat(start_date)
|
||||
end = datetime.fromisoformat(end_date)
|
||||
date_range = (end - start).days
|
||||
|
||||
db.execute(f"""
|
||||
CREATE TABLE {table_name} AS
|
||||
SELECT
|
||||
ROW_NUMBER() OVER () AS id,
|
||||
DATE '{start_date}' + (RANDOM() * {date_range})::INTEGER * INTERVAL '1 day' AS transaction_date,
|
||||
FLOOR(RANDOM() * 100000 + 1000)::INTEGER / 100.0 AS amount,
|
||||
LIST_ELEMENT(['completed', 'pending', 'failed'], FLOOR(RANDOM() * 10 + 1)::INTEGER) AS status
|
||||
FROM GENERATE_SERIES(1, {count})
|
||||
""")
|
||||
|
||||
# Get sample
|
||||
sample = db.execute(f"SELECT * FROM {table_name} LIMIT 10").fetchall()
|
||||
|
||||
return {
|
||||
"table_name": table_name,
|
||||
"rows_generated": count,
|
||||
"sample": sample,
|
||||
"query_hint": f"SELECT * FROM {table_name}"
|
||||
}
|
||||
|
||||
def generate_customers(count: int = 100) -> dict:
|
||||
"""Generate synthetic customer records"""
|
||||
|
||||
table_name = f"temp_customers_{uuid.uuid4().hex[:8]}"
|
||||
|
||||
first_names = ['John', 'Jane', 'Alice', 'Bob', 'Charlie', 'Diana', 'Eve', 'Frank']
|
||||
last_names = ['Smith', 'Doe', 'Johnson', 'Williams', 'Brown', 'Jones', 'Miller']
|
||||
tiers = ['bronze', 'silver', 'gold', 'platinum']
|
||||
|
||||
db.execute(f"""
|
||||
CREATE TABLE {table_name} AS
|
||||
WITH names AS (
|
||||
SELECT
|
||||
unnest({first_names}) AS first_name,
|
||||
unnest({last_names}) AS last_name
|
||||
)
|
||||
SELECT
|
||||
ROW_NUMBER() OVER () AS customer_id,
|
||||
first_name || ' ' || last_name AS full_name,
|
||||
LOWER(first_name) || '.' || LOWER(last_name) || FLOOR(RANDOM() * 1000)::INTEGER || '@example.com' AS email,
|
||||
LIST_ELEMENT({tiers}, FLOOR(RANDOM() * {len(tiers)} + 1)::INTEGER) AS tier,
|
||||
DATE '2020-01-01' + (RANDOM() * 1460)::INTEGER * INTERVAL '1 day' AS created_at
|
||||
FROM names
|
||||
CROSS JOIN GENERATE_SERIES(1, CEIL({count} / (SELECT COUNT(*) FROM names))::INTEGER)
|
||||
LIMIT {count}
|
||||
""")
|
||||
|
||||
stats = db.execute(f"""
|
||||
SELECT
|
||||
COUNT(*) as total,
|
||||
COUNT(DISTINCT tier) as tiers,
|
||||
MIN(created_at) as earliest,
|
||||
MAX(created_at) as latest
|
||||
FROM {table_name}
|
||||
""").fetchone()
|
||||
|
||||
return {
|
||||
"table_name": table_name,
|
||||
"rows_generated": stats["total"],
|
||||
"statistics": dict(stats),
|
||||
"query_hint": f"SELECT * FROM {table_name}"
|
||||
}
|
||||
```
|
||||
|
||||
```yaml
|
||||
# tools/generate_test_data.yml
|
||||
mxcp: 1
|
||||
tool:
|
||||
name: generate_test_data
|
||||
description: "Generate synthetic data for testing"
|
||||
language: python
|
||||
parameters:
|
||||
- name: data_type
|
||||
type: string
|
||||
examples: ["transactions", "customers"]
|
||||
- name: count
|
||||
type: integer
|
||||
default: 100
|
||||
return:
|
||||
type: object
|
||||
source:
|
||||
file: ../python/data_generator.py
|
||||
function: |
|
||||
if data_type == "transactions":
|
||||
return generate_transactions(count)
|
||||
elif data_type == "customers":
|
||||
return generate_customers(count)
|
||||
else:
|
||||
raise ValueError(f"Unknown data_type: {data_type}")
|
||||
```
|
||||
|
||||
### Pattern 3: Statistics Tool for Synthetic Data
|
||||
|
||||
**Use case**: Generate data and immediately calculate statistics
|
||||
|
||||
```yaml
|
||||
# tools/synthetic_analytics.yml
|
||||
mxcp: 1
|
||||
tool:
|
||||
name: synthetic_analytics
|
||||
description: "Generate synthetic sales data and calculate statistics"
|
||||
language: python
|
||||
parameters:
|
||||
- name: days
|
||||
type: integer
|
||||
default: 365
|
||||
- name: transactions_per_day
|
||||
type: integer
|
||||
default: 100
|
||||
return:
|
||||
type: object
|
||||
properties:
|
||||
daily_stats: { type: array }
|
||||
overall_stats: { type: object }
|
||||
source:
|
||||
code: |
|
||||
from mxcp.runtime import db
|
||||
|
||||
total = days * transactions_per_day
|
||||
|
||||
# Generate data
|
||||
db.execute(f"""
|
||||
CREATE OR REPLACE TEMP TABLE temp_sales AS
|
||||
SELECT
|
||||
DATE '2024-01-01' + (RANDOM() * {days})::INTEGER * INTERVAL '1 day' AS sale_date,
|
||||
FLOOR(RANDOM() * 50000 + 1000)::INTEGER / 100.0 AS amount,
|
||||
LIST_ELEMENT(['online', 'retail', 'wholesale'], FLOOR(RANDOM() * 3 + 1)::INTEGER) AS channel
|
||||
FROM GENERATE_SERIES(1, {total})
|
||||
""")
|
||||
|
||||
# Calculate statistics
|
||||
daily_stats = db.execute("""
|
||||
SELECT
|
||||
sale_date,
|
||||
COUNT(*) as transactions,
|
||||
SUM(amount) as total_sales,
|
||||
AVG(amount) as avg_sale
|
||||
FROM temp_sales
|
||||
GROUP BY sale_date
|
||||
ORDER BY sale_date
|
||||
""").fetchall()
|
||||
|
||||
overall = db.execute("""
|
||||
SELECT
|
||||
COUNT(*) as total_transactions,
|
||||
SUM(amount) as total_revenue,
|
||||
AVG(amount) as avg_transaction,
|
||||
MIN(amount) as min_transaction,
|
||||
MAX(amount) as max_transaction,
|
||||
STDDEV(amount) as std_dev
|
||||
FROM temp_sales
|
||||
""").fetchone()
|
||||
|
||||
return {
|
||||
"daily_stats": daily_stats,
|
||||
"overall_stats": dict(overall)
|
||||
}
|
||||
```
|
||||
|
||||
## Advanced Patterns
|
||||
|
||||
### Realistic Distributions
|
||||
|
||||
**Normal distribution** (for things like heights, test scores):
|
||||
```sql
|
||||
-- Box-Muller transform for normal distribution
|
||||
SELECT
|
||||
SQRT(-2 * LN(RANDOM())) * COS(2 * PI() * RANDOM()) * 15 + 100 AS iq_score
|
||||
FROM GENERATE_SERIES(1, 1000)
|
||||
```
|
||||
|
||||
**Power law distribution** (for things like city populations):
|
||||
```sql
|
||||
SELECT
|
||||
FLOOR(POWER(RANDOM(), -0.5) * 1000)::INTEGER AS followers
|
||||
FROM GENERATE_SERIES(1, 1000)
|
||||
```
|
||||
|
||||
**Seasonal patterns**:
|
||||
```sql
|
||||
-- Sales with seasonal pattern (peak in Dec, low in Feb)
|
||||
SELECT
|
||||
date,
|
||||
-- Base level + seasonal component + random noise
|
||||
1000 + 500 * SIN(2 * PI() * EXTRACT(month FROM date) / 12 - PI()/2) + RANDOM() * 200 AS daily_sales
|
||||
FROM GENERATE_SERIES(DATE '2024-01-01', DATE '2024-12-31', INTERVAL '1 day') AS t(date)
|
||||
```
|
||||
|
||||
### Data Masking/Anonymization
|
||||
|
||||
**Replace real data with synthetic**:
|
||||
```sql
|
||||
-- Anonymize customer data
|
||||
CREATE TABLE customers_anonymized AS
|
||||
SELECT
|
||||
customer_id, -- Keep ID for joins
|
||||
'USER_' || customer_id || '@example.com' AS email, -- Fake email
|
||||
LIST_ELEMENT(['John', 'Jane', 'Alice', 'Bob'], (customer_id % 4) + 1) AS first_name, -- Fake name
|
||||
LEFT(phone, 3) || '-XXX-XXXX' AS masked_phone, -- Mask phone
|
||||
FLOOR(age / 10) * 10 AS age_bucket -- Generalize age
|
||||
FROM customers_real;
|
||||
```
|
||||
|
||||
## Complete Example: Synthetic Analytics Server
|
||||
|
||||
**Scenario**: Demo server with synthetic e-commerce data
|
||||
|
||||
```bash
|
||||
# Project structure
|
||||
synthetic-analytics/
|
||||
├── mxcp-site.yml
|
||||
├── models/
|
||||
│ ├── synthetic_customers.sql
|
||||
│ ├── synthetic_orders.sql
|
||||
│ └── schema.yml
|
||||
├── python/
|
||||
│ └── generators.py
|
||||
└── tools/
|
||||
├── generate_data.yml
|
||||
├── customer_analytics.yml
|
||||
└── sales_trends.yml
|
||||
```
|
||||
|
||||
```sql
|
||||
-- models/synthetic_customers.sql
|
||||
{{ config(materialized='table') }}
|
||||
|
||||
SELECT
|
||||
customer_id,
|
||||
'customer' || customer_id || '@example.com' AS email,
|
||||
LIST_ELEMENT(['bronze', 'silver', 'gold'], (customer_id % 3) + 1) AS tier,
|
||||
DATE '2020-01-01' + (RANDOM() * 1000)::INTEGER * INTERVAL '1 day' AS signup_date
|
||||
FROM GENERATE_SERIES(1, 500) AS t(customer_id)
|
||||
```
|
||||
|
||||
```sql
|
||||
-- models/synthetic_orders.sql
|
||||
{{ config(materialized='table') }}
|
||||
|
||||
SELECT
|
||||
order_id,
|
||||
FLOOR(RANDOM() * 500 + 1)::INTEGER AS customer_id,
|
||||
order_date,
|
||||
FLOOR(RANDOM() * 100000 + 1000)::INTEGER / 100.0 AS amount,
|
||||
LIST_ELEMENT(['completed', 'shipped', 'pending'], FLOOR(RANDOM() * 3 + 1)::INTEGER) AS status
|
||||
FROM GENERATE_SERIES(1, 5000) AS t(order_id)
|
||||
CROSS JOIN (
|
||||
SELECT DATE '2024-01-01' + (RANDOM() * 365)::INTEGER * INTERVAL '1 day' AS order_date
|
||||
) AS dates
|
||||
```
|
||||
|
||||
```yaml
|
||||
# tools/customer_analytics.yml
|
||||
mxcp: 1
|
||||
tool:
|
||||
name: customer_analytics
|
||||
description: "Get customer analytics from synthetic data"
|
||||
parameters:
|
||||
- name: tier
|
||||
type: string
|
||||
required: false
|
||||
return:
|
||||
type: array
|
||||
source:
|
||||
code: |
|
||||
SELECT
|
||||
c.tier,
|
||||
COUNT(DISTINCT c.customer_id) as customers,
|
||||
COUNT(o.order_id) as total_orders,
|
||||
SUM(o.amount) as total_revenue,
|
||||
AVG(o.amount) as avg_order_value
|
||||
FROM synthetic_customers c
|
||||
LEFT JOIN synthetic_orders o ON c.customer_id = o.customer_id
|
||||
WHERE $tier IS NULL OR c.tier = $tier
|
||||
GROUP BY c.tier
|
||||
ORDER BY total_revenue DESC
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Use dbt for persistent data**: Synthetic data that should be consistent across queries
|
||||
2. **Use Python for dynamic data**: Data that changes based on parameters
|
||||
3. **Seed random number generator**: For reproducible results, use `SETSEED()` in DuckDB
|
||||
4. **Realistic distributions**: Use appropriate statistical distributions
|
||||
5. **Maintain referential integrity**: Ensure foreign keys match
|
||||
6. **Add noise**: Real data isn't perfectly distributed, add randomness
|
||||
7. **Document data generation**: Explain how synthetic data was created
|
||||
8. **Test with synthetic first**: Validate tools before using real data
|
||||
|
||||
## Summary
|
||||
|
||||
For synthetic data in MXCP:
|
||||
|
||||
1. **DuckDB patterns**: `GENERATE_SERIES`, `RANDOM()`, `LIST_ELEMENT()`, `UUID()`
|
||||
2. **dbt models**: For persistent, version-controlled synthetic data
|
||||
3. **Python tools**: For dynamic generation based on parameters
|
||||
4. **Statistics**: Generate data → calculate metrics in one tool
|
||||
5. **Testing**: Use synthetic data to test tools before real data
|
||||
6. **Privacy**: Anonymize real data by generating synthetic replacements
|
||||
Reference in New Issue
Block a user