Initial commit

Zhongwei Li
2025-11-30 08:38:26 +08:00
commit 41d9f6b189
304 changed files with 98322 additions and 0 deletions


@@ -0,0 +1,314 @@
{
"criteria": [
{
"name": "Scaffold Structure Clarity",
"description": "Is the scaffold structure clear, systematic, and easy to follow?",
"scoring": {
"1": "No clear structure. Random collection of steps/checks without logical flow.",
"2": "Basic structure but steps are vague or out of order. User confused about what to do next.",
"3": "Clear structure with defined steps. User can follow but may need clarification on some steps.",
"4": "Well-organized structure with clear steps, checkpoints, and expected outputs at each stage.",
"5": "Exemplary structure: systematic, numbered steps with clear inputs/outputs, decision points explicit."
},
"red_flags": [
"Steps not numbered or sequenced",
"No clear starting/ending point",
"Validation steps missing",
"User must guess what to do next"
]
},
{
"name": "Coverage Completeness",
"description": "Does the scaffold cover all necessary aspects (happy path, edge cases, validation, etc.)?",
"scoring": {
"1": "Major gaps. Only covers happy path, ignores edge cases/errors/validation.",
"2": "Partial coverage. Addresses main case but misses important edge cases or validation steps.",
"3": "Adequate coverage. Main cases and some edge cases covered. Basic validation included.",
"4": "Comprehensive coverage. Happy path, edge cases, error conditions, validation all included.",
"5": "Exhaustive coverage. All cases, validation at each step, robustness checks, limitations documented."
},
"red_flags": [
"TDD scaffold: No tests for edge cases or errors",
"EDA scaffold: Missing data quality checks",
"Statistical scaffold: No assumption checks",
"Any scaffold: No validation step before delivering"
]
},
{
"name": "Technical Rigor",
"description": "Is the approach technically sound with appropriate methods/tests?",
"scoring": {
"1": "Technically incorrect. Wrong methods, flawed logic, or inappropriate techniques.",
"2": "Questionable rigor. Some techniques correct but others questionable or missing justification.",
"3": "Adequate rigor. Standard techniques applied correctly. Acceptable for routine work.",
"4": "High rigor. Appropriate methods, assumptions checked, sensitivity analysis included.",
"5": "Exemplary rigor. Best practices followed, multiple validation approaches, limitations acknowledged."
},
"red_flags": [
"Causal inference without DAG or identification strategy",
"Statistical test without checking assumptions",
"ML model without train/val/test split (data leakage)",
"TDD without testing error conditions"
]
},
{
"name": "Actionability",
"description": "Can user execute scaffold without further guidance? Are examples concrete?",
"scoring": {
"1": "Not actionable. Vague advice, no concrete steps, no code examples.",
"2": "Somewhat actionable. General direction but user needs to figure out details.",
"3": "Actionable. Clear steps with code snippets. User can execute with minor adjustments.",
"4": "Highly actionable. Complete code examples, data assumptions stated, ready to adapt.",
"5": "Immediately executable. Copy-paste ready examples with inline comments, expected outputs shown."
},
"red_flags": [
"No code examples (just prose descriptions)",
"Code has placeholders without explaining what to fill in",
"No example inputs/outputs",
"Vague instructions ('check assumptions', 'validate results' without saying how)"
]
},
{
"name": "Test Quality (for TDD)",
"description": "For TDD scaffolds: Do tests cover happy path, edge cases, errors, and integration?",
"scoring": {
"1": "Only happy path tests. No edge cases, errors, or integration tests.",
"2": "Happy path + some edge cases. Error handling or integration missing.",
"3": "Happy path, edge cases, basic error tests. Integration tests may be missing.",
"4": "Comprehensive: Happy path, edge cases, error conditions, integration tests all present.",
"5": "Exemplary: Above + property-based tests, test fixtures, mocks for external dependencies."
},
"red_flags": [
"No tests for None/empty input",
"No tests for expected exceptions",
"No tests for state changes/side effects",
"No integration tests for external systems"
],
"applicable_to": ["TDD"]
},
{
"name": "Data Quality Assessment (for EDA)",
"description": "For EDA scaffolds: Are data quality checks (missing, duplicates, outliers, consistency) included?",
"scoring": {
"1": "No data quality checks. Jumps straight to analysis without inspecting data.",
"2": "Minimal checks. Maybe checks missing values but ignores duplicates, outliers, consistency.",
"3": "Basic quality checks. Missing values, duplicates, basic outliers checked.",
"4": "Thorough quality checks. Missing patterns, duplicates, outliers, type consistency, referential integrity.",
"5": "Comprehensive quality framework. All checks + distributions, cardinality, data lineage, validation rules."
},
"red_flags": [
"No check for missing values",
"No check for duplicates",
"No outlier detection",
"Assumes data is clean without validation"
],
"applicable_to": ["EDA", "Statistical Analysis", "Predictive Modeling"]
},
{
"name": "Assumption Documentation",
"description": "Are assumptions explicitly stated and justified?",
"scoring": {
"1": "No assumptions stated. User unaware of what's being assumed.",
"2": "Some assumptions implicit but not documented. User must infer them.",
"3": "Key assumptions stated but not justified or validated.",
"4": "Assumptions explicitly stated with justification. User knows what's assumed and why.",
"5": "Assumptions stated, justified, validated where possible, and sensitivity to violations analyzed."
},
"red_flags": [
"Statistical test applied without stating/checking assumptions",
"Causal claim without stating identification assumptions",
"ML model without documenting train/test split assumptions",
"Function implementation without stating preconditions"
]
},
{
"name": "Validation Steps Included",
"description": "Does scaffold include validation/quality checks before delivering results?",
"scoring": {
"1": "No validation. Results delivered without any quality checks.",
"2": "Informal validation. 'Looks good' without systematic checks.",
"3": "Basic validation. Some checks but not comprehensive or systematic.",
"4": "Systematic validation. Checklist of quality criteria, most items checked.",
"5": "Rigorous validation framework. Multiple validation approaches, robustness checks, edge cases tested."
},
"red_flags": [
"No validation step in workflow",
"No rubric or checklist to assess quality",
"No test suite execution before delivering code",
"No sensitivity analysis for statistical results"
]
},
{
"name": "Code/Analysis Quality",
"description": "Is code well-structured, readable, and following best practices?",
"scoring": {
"1": "Poor quality. Spaghetti code, no structure, hard to understand.",
"2": "Low quality. Works but hard to read, poor naming, no comments.",
"3": "Adequate quality. Readable, basic structure, some comments. Acceptable for prototypes.",
"4": "Good quality. Clean code, good naming, appropriate comments, follows style guide.",
"5": "Excellent quality. Modular, DRY, well-documented, type hints, follows SOLID principles."
},
"red_flags": [
"Magic numbers without explanation",
"Copy-pasted code (not DRY)",
"Functions doing multiple unrelated things",
"No docstrings or comments explaining complex logic"
]
},
{
"name": "Reproducibility",
"description": "Can another person reproduce the analysis/tests with provided information?",
"scoring": {
"1": "Not reproducible. Missing critical information (data, packages, random seeds).",
"2": "Partially reproducible. Some information provided but key details missing.",
"3": "Mostly reproducible. Enough information for skilled practitioner to reproduce with effort.",
"4": "Reproducible. All information provided (data access, package versions, random seeds, parameters).",
"5": "Fully reproducible. Documented environment, requirements.txt, Docker container, or notebook with all steps."
},
"red_flags": [
"No package versions specified",
"Random operations without setting seed",
"Data source not documented or inaccessible",
"No instructions for running tests/analysis"
]
}
],
"task_type_guidance": {
"TDD": {
"description": "Test-Driven Development scaffolds",
"focus_criteria": [
"Test Quality",
"Code/Analysis Quality",
"Validation Steps Included"
],
"target_score": 3.5,
"success_indicators": [
"Tests written before implementation",
"Happy path, edge cases, errors all tested",
"Tests pass and are maintainable",
"Red-Green-Refactor cycle followed"
]
},
"EDA": {
"description": "Exploratory Data Analysis scaffolds",
"focus_criteria": [
"Data Quality Assessment",
"Coverage Completeness",
"Assumption Documentation"
],
"target_score": 3.5,
"success_indicators": [
"Data quality systematically checked",
"Univariate and bivariate analysis completed",
"Insights and recommendations documented",
"Missing values, outliers, distributions analyzed"
]
},
"Statistical Analysis": {
"description": "Hypothesis testing, A/B tests, causal inference",
"focus_criteria": [
"Technical Rigor",
"Assumption Documentation",
"Validation Steps Included"
],
"target_score": 4.0,
"success_indicators": [
"Hypotheses clearly stated",
"Appropriate test selected and justified",
"Assumptions checked (normality, independence, etc.)",
"Effect sizes and confidence intervals reported",
"Sensitivity analysis performed"
]
},
"Predictive Modeling": {
"description": "ML model building and evaluation",
"focus_criteria": [
"Technical Rigor",
"Validation Steps Included",
"Reproducibility"
],
"target_score": 4.0,
"success_indicators": [
"Train/val/test split before preprocessing (no data leakage)",
"Baseline model for comparison",
"Cross-validation performed",
"Error analysis and feature importance computed",
"Model deployment checklist completed"
]
},
"Validation": {
"description": "Data/code/model quality checks",
"focus_criteria": [
"Coverage Completeness",
"Validation Steps Included",
"Technical Rigor"
],
"target_score": 4.0,
"success_indicators": [
"Schema validation (types, ranges, constraints)",
"Referential integrity checked",
"Edge cases tested",
"Monitoring/alerting strategy defined"
]
}
},
"common_failure_modes": [
{
"failure_mode": "Jumping to Implementation Without Scaffold",
"symptoms": "User writes code/analysis immediately without planning structure first.",
"consequences": "Missing edge cases, poor test coverage, incomplete analysis.",
"fix": "Force scaffold creation before implementation. Use template as checklist."
},
{
"failure_mode": "Testing Only Happy Path",
"symptoms": "TDD scaffold has tests for expected usage but none for errors/edge cases.",
"consequences": "Code breaks in production on unexpected inputs.",
"fix": "Require tests for: empty input, None, boundary values, invalid types, expected exceptions."
},
{
"failure_mode": "Skipping Data Quality Checks",
"symptoms": "EDA scaffold jumps to visualization without checking missing values, outliers, duplicates.",
"consequences": "Invalid conclusions based on dirty data.",
"fix": "Mandatory data quality section before any analysis. No exceptions."
},
{
"failure_mode": "Assumptions Not Documented",
"symptoms": "Statistical test applied without stating/checking assumptions (normality, independence, etc.).",
"consequences": "Invalid statistical inference. Wrong conclusions.",
"fix": "Explicit assumption section in scaffold. Check assumptions before applying test."
},
{
"failure_mode": "No Validation Step",
"symptoms": "Scaffold delivers results without any quality check or self-assessment.",
"consequences": "Low-quality outputs, errors not caught.",
"fix": "Mandatory validation step in workflow. Use rubric self-assessment."
},
{
"failure_mode": "Correlation Interpreted as Causation",
"symptoms": "EDA finds correlation, claims causal relationship without causal inference methods.",
"consequences": "Wrong business decisions based on spurious causality.",
"fix": "Distinguish predictive (correlation) from causal questions. Use causal inference methodology if claiming causation."
},
{
"failure_mode": "Data Leakage in ML",
"symptoms": "Preprocessing (scaling, imputation) done before train/test split.",
"consequences": "Overly optimistic model performance. Fails in production.",
"fix": "Scaffold enforces: split first, then preprocess. Fit transformers on train only."
},
{
"failure_mode": "Code Without Tests",
"symptoms": "Implementation provided but no test scaffold or test execution.",
"consequences": "Regressions not caught, bugs in production.",
"fix": "TDD scaffold mandatory for production code. Tests must pass before code review."
}
],
"scale": 5,
"minimum_average_score": 3.5,
"interpretation": {
"1.0-2.0": "Inadequate. Major gaps in structure, coverage, or rigor. Do not use. Revise scaffold.",
"2.0-3.0": "Needs improvement. Basic structure present but incomplete or lacks rigor. Acceptable for learning/practice only.",
"3.0-3.5": "Acceptable. Covers main cases with adequate rigor. Suitable for routine work or prototypes.",
"3.5-4.0": "Good. Comprehensive coverage with good rigor. Suitable for production code/analysis.",
"4.0-5.0": "Excellent. Exemplary structure, rigor, and completeness. Production-ready with best practices."
}
}


@@ -0,0 +1,272 @@
# EDA Example: Customer Churn Analysis
A complete exploratory data analysis of a telecom customer churn dataset.
## Task
Explore customer churn dataset to understand:
- What factors correlate with churn?
- Are there data quality issues?
- What features should we engineer for predictive model?
## Dataset
- **Rows**: 7,043 customers
- **Target**: `Churn` (Yes/No)
- **Features**: 20 columns (demographics, account info, usage patterns)
## EDA Scaffold Applied
### 1. Data Overview
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('telecom_churn.csv')
print(f"Shape: {df.shape}")
# Output: (7043, 21)
print(f"Columns: {df.columns.tolist()}")
# ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
# 'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
# 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
# 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
# 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']
print(df.dtypes)
# customerID object
# gender object
# SeniorCitizen int64
# tenure int64
# MonthlyCharges float64
# TotalCharges object ← Should be numeric!
# Churn object
print(df.head())
print(df.describe())
```
**Findings**:
- TotalCharges is object type (should be numeric) - needs fixing
- Churn is target variable (26.5% churn rate)
### 2. Data Quality Checks
```python
# Missing values
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
print(missing_pct[missing_pct > 0])
# No missing values marked as NaN
# But TotalCharges is object - check for empty strings
print((df['TotalCharges'] == ' ').sum())
# Output: 11 rows have space instead of number
# Fix: Convert TotalCharges to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
print(df['TotalCharges'].isnull().sum())
# Output: 11 (now properly marked as missing)
# Strategy: Drop 11 rows (< 0.2% of data)
df = df.dropna()
# Duplicates
print(f"Duplicates: {df.duplicated().sum()}")
# Output: 0
# Data consistency checks
print("Tenure vs TotalCharges consistency:")
print(df[['tenure', 'MonthlyCharges', 'TotalCharges']].head())
# tenure=1, Monthly=$29, Total=$29 ✓
# tenure=34, Monthly=$57, Total=$1889 ≈ $57*34 ✓
```
**Findings**:
- 11 rows (0.16%) with missing TotalCharges - dropped
- No duplicates
- TotalCharges ≈ MonthlyCharges × tenure (consistent)
### 3. Univariate Analysis
```python
# Target variable
print(df['Churn'].value_counts(normalize=True))
# No 73.5%
# Yes 26.5%
# Imbalanced but not severely (>20% minority class is workable)
# Numeric variables
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
for col in numeric_cols:
print(f"\n{col}:")
print(f" Mean: {df[col].mean():.2f}, Median: {df[col].median():.2f}")
print(f" Std: {df[col].std():.2f}, Range: [{df[col].min()}, {df[col].max()}]")
# Histogram
df[col].hist(bins=50, edgecolor='black')
plt.title(f'{col} Distribution')
plt.xlabel(col)
plt.show()
# Check outliers
Q1, Q3 = df[col].quantile([0.25, 0.75])
IQR = Q3 - Q1
outliers = ((df[col] < (Q1 - 1.5*IQR)) | (df[col] > (Q3 + 1.5*IQR))).sum()
print(f" Outliers: {outliers} ({outliers/len(df)*100:.1f}%)")
```
**Findings**:
- **tenure**: Right-skewed (mean=32, median=29). Many new customers (0-12 months).
- **MonthlyCharges**: Bimodal distribution (peaks at ~$20 and ~$80). Suggests customer segments.
- **TotalCharges**: Right-skewed (correlated with tenure). Few outliers (2.3%).
```python
# Categorical variables
cat_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'Contract', 'PaymentMethod']
for col in cat_cols:
print(f"\n{col}: {df[col].nunique()} unique values")
print(df[col].value_counts())
# Bar plot
df[col].value_counts().plot(kind='bar')
plt.title(f'{col} Distribution')
plt.xticks(rotation=45)
plt.show()
```
**Findings**:
- **gender**: Balanced (50/50 male/female)
- **SeniorCitizen**: 16% are senior citizens
- **Contract**: 55% month-to-month, 24% one-year, 21% two-year
- **PaymentMethod**: Electronic check most common (34%)
### 4. Bivariate Analysis (Churn vs Features)
```python
# Churn rate by categorical variables
for col in cat_cols:
churn_rate = df.groupby(col)['Churn'].apply(lambda x: (x=='Yes').mean())
print(f"\n{col} vs Churn:")
print(churn_rate.sort_values(ascending=False))
# Stacked bar chart
pd.crosstab(df[col], df['Churn'], normalize='index').plot(kind='bar', stacked=True)
plt.title(f'Churn Rate by {col}')
plt.ylabel('Proportion')
plt.show()
```
**Key Findings**:
- **Contract**: Month-to-month churn=42.7%, One-year=11.3%, Two-year=2.8% (Strong signal!)
- **SeniorCitizen**: Seniors churn=41.7%, Non-seniors=23.6%
- **PaymentMethod**: Electronic check=45.3% churn, others~15-18%
- **tenure**: Customers with tenure<12 months churn=47.5%, >60 months=7.9%
```python
# Numeric variables vs Churn
for col in numeric_cols:
plt.figure(figsize=(10, 4))
# Box plot
plt.subplot(1, 2, 1)
df.boxplot(column=col, by='Churn')
plt.title(f'{col} by Churn')
# Histogram (overlay)
plt.subplot(1, 2, 2)
df[df['Churn']=='No'][col].hist(bins=30, alpha=0.5, label='No Churn', density=True)
df[df['Churn']=='Yes'][col].hist(bins=30, alpha=0.5, label='Churn', density=True)
plt.legend()
plt.xlabel(col)
plt.title(f'{col} Distribution by Churn')
plt.show()
```
**Key Findings**:
- **tenure**: Churned customers have lower tenure (mean=18 vs 38 months)
- **MonthlyCharges**: Churned customers pay MORE ($74 vs $61/month)
- **TotalCharges**: Churned customers have lower total (correlated with tenure)
```python
# Correlation matrix
numeric_df = df[['tenure', 'MonthlyCharges', 'TotalCharges', 'SeniorCitizen']].copy()
numeric_df['Churn_binary'] = (df['Churn'] == 'Yes').astype(int)
corr = numeric_df.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()
```
**Key Findings**:
- tenure ↔ TotalCharges: 0.83 (strong positive correlation - expected)
- Churn ↔ tenure: -0.35 (negative: longer tenure → less churn)
- Churn ↔ MonthlyCharges: +0.19 (positive: higher charges → more churn)
- Churn ↔ TotalCharges: -0.20 (negative: driven by tenure)
### 5. Insights & Recommendations
```python
print("\n=== KEY FINDINGS ===")
print("1. Data Quality:")
print(" - 11 rows (<0.2%) dropped due to missing TotalCharges")
print(" - No other quality issues. Data is clean.")
print("")
print("2. Churn Patterns:")
print(" - Overall churn rate: 26.5% (slightly imbalanced)")
print(" - Strongest predictor: Contract type (month-to-month 42.7% vs two-year 2.8%)")
print(" - High-risk segment: New customers (<12mo tenure) with high monthly charges")
print(" - Low churn: Long-term customers (>60mo) on two-year contracts")
print("")
print("3. Feature Importance:")
print(" - **High signal**: Contract, tenure, PaymentMethod, SeniorCitizen")
print(" - **Medium signal**: MonthlyCharges, InternetService")
print(" - **Low signal**: gender, PhoneService (balanced across churn/no-churn)")
print("")
print("\n=== RECOMMENDED ACTIONS ===")
print("1. Feature Engineering:")
print(" - Create 'tenure_bucket' (0-12mo, 12-24mo, 24-60mo, >60mo)")
print(" - Create 'high_charges' flag (MonthlyCharges > $70)")
print(" - Interaction: tenure × Contract (captures switching cost)")
print(" - Payment risk score (Electronic check is risky)")
print("")
print("2. Model Strategy:")
print(" - Use all categorical features (one-hot encode)")
print(" - Baseline: Predict churn for month-to-month + new customers")
print(" - Advanced: Random Forest or Gradient Boosting (handle interactions)")
print(" - Validate with stratified 5-fold CV (preserve 26.5% churn rate)")
print("")
print("3. Business Insights:")
print(" - **Retention program**: Target month-to-month customers < 12mo tenure")
print(" - **Contract incentives**: Offer discounts for one/two-year contracts")
print(" - **Payment method**: Encourage auto-pay (reduce electronic check)")
print(" - **Early warning**: Monitor customers with high MonthlyCharges + short tenure")
```
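The recommended features above can be sketched directly against the cleaned `df`. A minimal, hedged version follows: the bucket edges and the $70 threshold come from the recommendations, while the exact category strings ('Month-to-month', 'Electronic check') are assumed to match the raw data.
```python
# Hedged sketch of the recommended feature engineering (assumes the cleaned df from section 2)
df['tenure_bucket'] = pd.cut(df['tenure'], bins=[0, 12, 24, 60, float('inf')],
                             labels=['0-12mo', '12-24mo', '24-60mo', '>60mo'], include_lowest=True)
df['high_charges'] = (df['MonthlyCharges'] > 70).astype(int)                      # high monthly charges flag
df['electronic_check'] = (df['PaymentMethod'] == 'Electronic check').astype(int)  # risky payment method
df['new_customer'] = (df['tenure'] < 12).astype(int)                              # short-tenure flag
# Interaction capturing switching cost: new customer on a month-to-month contract
df['new_month_to_month'] = df['new_customer'] * (df['Contract'] == 'Month-to-month').astype(int)
print(df[['tenure_bucket', 'high_charges', 'electronic_check', 'new_month_to_month']].head())
```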
### 6. Self-Assessment
Using rubric:
- **Clarity** (5/5): Systematic exploration, clear findings at each stage
- **Completeness** (5/5): Data quality, univariate, bivariate, insights all covered
- **Rigor** (5/5): Proper statistical analysis, visualizations, quantified relationships
- **Actionability** (5/5): Specific feature engineering and business recommendations
**Average**: 5.0/5 ✓
This EDA provides a solid foundation for predictive modeling and business action.
## Next Steps
1. **Feature engineering**: Implement recommended features
2. **Baseline model**: Logistic regression with top 5 features (see the sketch below)
3. **Advanced models**: Random Forest, XGBoost with feature interactions
4. **Evaluation**: F1-score, precision/recall curves, AUC-ROC
5. **Deployment**: Real-time churn scoring API
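A minimal sketch of next step 2, a baseline logistic regression on the five strongest signals identified above. The split, encoding, and scoring choices here are illustrative assumptions, not prescriptions.
```python
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Top signals from the EDA: Contract, PaymentMethod, SeniorCitizen, tenure, MonthlyCharges
features_cat = ['Contract', 'PaymentMethod']
features_num = ['tenure', 'MonthlyCharges', 'SeniorCitizen']
X = df[features_cat + features_num]
y = (df['Churn'] == 'Yes').astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

baseline = Pipeline([
    ('prep', ColumnTransformer([
        ('cat', OneHotEncoder(handle_unknown='ignore'), features_cat),
        ('num', StandardScaler(), features_num),
    ])),
    ('clf', LogisticRegression(max_iter=1000)),
])

# Stratified 5-fold CV on the training set, as recommended above
scores = cross_val_score(baseline, X_train, y_train, cv=5, scoring='f1')
print(f"Baseline CV F1: {scores.mean():.3f} ± {scores.std():.3f}")
```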


@@ -0,0 +1,226 @@
# TDD Example: User Authentication
A complete TDD example showing test-first development for an authentication function.
## Task
Build a `validate_login(username, password)` function that:
- Returns `True` for valid credentials
- Returns `False` for invalid password
- Raises `ValueError` for missing username/password
- Raises `UserNotFoundError` for nonexistent users
- Logs failed attempts
## Step 1: Write Tests FIRST
```python
# test_auth.py
import logging
import pytest
from auth import validate_login, UserNotFoundError
# HAPPY PATH
def test_valid_credentials():
"""User with correct password should authenticate"""
assert validate_login("alice@example.com", "SecurePass123!") == True
# EDGE CASES
def test_empty_username():
"""Empty username should raise ValueError"""
with pytest.raises(ValueError, match="Username required"):
validate_login("", "password")
def test_empty_password():
"""Empty password should raise ValueError"""
with pytest.raises(ValueError, match="Password required"):
validate_login("alice@example.com", "")
def test_none_credentials():
"""None values should raise ValueError"""
with pytest.raises(ValueError):
validate_login(None, None)
# ERROR CONDITIONS
def test_invalid_password():
"""Wrong password should return False"""
assert validate_login("alice@example.com", "WrongPassword") == False
def test_nonexistent_user():
"""User not in database should raise UserNotFoundError"""
with pytest.raises(UserNotFoundError):
validate_login("nobody@example.com", "anypassword")
def test_case_sensitive_password():
"""Password check should be case-sensitive"""
assert validate_login("alice@example.com", "securepass123!") == False
# STATE/SIDE EFFECTS
def test_failed_attempt_logged(caplog):
"""Failed login should be logged"""
validate_login("alice@example.com", "WrongPassword")
assert "Failed login attempt" in caplog.text
assert "alice@example.com" in caplog.text
def test_successful_login_logged(caplog):
    """Successful login should be logged"""
    caplog.set_level(logging.INFO)  # INFO is below caplog's default capture level
    validate_login("alice@example.com", "SecurePass123!")
    assert "Successful login" in caplog.text
# INTEGRATION TEST
@pytest.fixture
def mock_database():
    """Mock database with test users (hash computed with the same helper as auth.py)"""
    from auth import hash_password
    return {
        "alice@example.com": {
            "password_hash": hash_password("SecurePass123!", "random_salt_123"),
            "salt": "random_salt_123"
        }
    }
def test_database_integration(mock_database, monkeypatch):
"""Function should query database correctly"""
def mock_get_user(username):
return mock_database.get(username)
monkeypatch.setattr("auth.get_user_from_db", mock_get_user)
result = validate_login("alice@example.com", "SecurePass123!")
assert result == True
```
## Step 2: Run Tests (They Should FAIL - Red)
```bash
$ pytest test_auth.py
FAILED - ModuleNotFoundError: No module named 'auth'
```
## Step 3: Write Minimal Implementation (Green)
```python
# auth.py
import logging
import hashlib
logger = logging.getLogger(__name__)
class UserNotFoundError(Exception):
pass
def validate_login(username, password):
# Input validation
if not username:
raise ValueError("Username required")
if not password:
raise ValueError("Password required")
# Get user from database
user = get_user_from_db(username)
if user is None:
raise UserNotFoundError(f"User {username} not found")
# Hash password and compare
password_hash = hash_password(password, user['salt'])
is_valid = (password_hash == user['password_hash'])
# Log attempt
if is_valid:
logger.info(f"Successful login for {username}")
else:
logger.warning(f"Failed login attempt for {username}")
return is_valid
def get_user_from_db(username):
# Stub - implement database query
users = {
"alice@example.com": {
"password_hash": hash_password("SecurePass123!", "random_salt_123"),
"salt": "random_salt_123"
}
}
return users.get(username)
def hash_password(password, salt):
# Simplified - use bcrypt/argon2 in production
return hashlib.sha256(f"{password}{salt}".encode()).hexdigest()
```
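The `hash_password` stub above deliberately uses salted SHA-256 for brevity; production code would use a dedicated password hash such as bcrypt or argon2. A minimal sketch with the `bcrypt` package (an added dependency, not part of the example above):
```python
# Hedged sketch: bcrypt-based hashing (pip install bcrypt); bcrypt stores the salt
# inside the returned hash, so no separate salt column is needed.
import bcrypt

def hash_password(password: str) -> bytes:
    return bcrypt.hashpw(password.encode("utf-8"), bcrypt.gensalt())

def verify_password(password: str, stored_hash: bytes) -> bool:
    return bcrypt.checkpw(password.encode("utf-8"), stored_hash)
```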
## Step 4: Run Tests Again (Should PASS - Green)
```bash
$ pytest test_auth.py -v
test_valid_credentials PASSED
test_empty_username PASSED
test_empty_password PASSED
test_none_credentials PASSED
test_invalid_password PASSED
test_nonexistent_user PASSED
test_case_sensitive_password PASSED
test_failed_attempt_logged PASSED
test_successful_login_logged PASSED
test_database_integration PASSED
========== 10 passed in 0.15s ==========
```
## Step 5: Refactor (Keep Tests Green)
```python
# auth.py (refactored for readability)
class AuthenticationService:
def __init__(self, user_repo, password_hasher):
self.user_repo = user_repo
self.password_hasher = password_hasher
self.logger = logging.getLogger(__name__)
def validate_login(self, username, password):
self._validate_inputs(username, password)
user = self._get_user(username)
is_valid = self._check_password(password, user)
self._log_attempt(username, is_valid)
return is_valid
def _validate_inputs(self, username, password):
if not username:
raise ValueError("Username required")
if not password:
raise ValueError("Password required")
def _get_user(self, username):
user = self.user_repo.get_by_username(username)
if user is None:
raise UserNotFoundError(f"User {username} not found")
return user
def _check_password(self, password, user):
password_hash = self.password_hasher.hash(password, user.salt)
return password_hash == user.password_hash
def _log_attempt(self, username, is_valid):
if is_valid:
self.logger.info(f"Successful login for {username}")
else:
self.logger.warning(f"Failed login attempt for {username}")
```
Tests still pass after refactoring!
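The original tests call a module-level `validate_login`, so one way to keep them green after this refactor is a thin wrapper that delegates to the service. A sketch of that wiring (the adapter classes are illustrative assumptions, reusing the Step-3 helpers in the same module):
```python
# auth.py (module-level wrapper kept so the original function-based tests stay green)
from types import SimpleNamespace

class _FunctionUserRepo:
    """Adapts the Step-3 get_user_from_db stub to the repository interface."""
    def get_by_username(self, username):
        user = get_user_from_db(username)
        return SimpleNamespace(**user) if user is not None else None

class _Sha256Hasher:
    """Adapts the Step-3 hash_password helper to the hasher interface."""
    def hash(self, password, salt):
        return hash_password(password, salt)

_default_service = AuthenticationService(_FunctionUserRepo(), _Sha256Hasher())

def validate_login(username, password):
    """Delegate to the service so the existing function-based tests keep passing."""
    return _default_service.validate_login(username, password)
```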
## Key Takeaways
1. **Tests written FIRST** define expected behavior
2. **Minimal implementation** to make tests pass
3. **Refactor** with confidence (tests catch regressions)
4. **Comprehensive coverage**: happy path, edge cases, errors, side effects, integration
5. **Fast feedback**: Know immediately if something breaks
## Self-Assessment
Using rubric:
- **Clarity** (5/5): Requirements clearly defined by tests
- **Completeness** (5/5): All cases covered (happy, edge, error, integration)
- **Rigor** (5/5): TDD cycle followed (Red → Green → Refactor)
- **Actionability** (5/5): Tests are executable specification
**Average**: 5.0/5 ✓
This is production-ready test-first code.


@@ -0,0 +1,470 @@
# Code Data Analysis Scaffolds Methodology
Advanced techniques for causal inference, predictive modeling, property-based testing, and complex data analysis.
## Workflow
Copy this checklist and track your progress:
```
Code Data Analysis Scaffolds Progress:
- [ ] Step 1: Clarify task and objectives
- [ ] Step 2: Choose appropriate scaffold type
- [ ] Step 3: Generate scaffold structure
- [ ] Step 4: Validate scaffold completeness
- [ ] Step 5: Deliver scaffold and guide execution
```
**Step 1: Clarify task** - Assess complexity and determine if advanced techniques needed. See [1. When to Use Advanced Methods](#1-when-to-use-advanced-methods).
**Step 2: Choose scaffold** - Select from Causal Inference, Predictive Modeling, Property-Based Testing, or Advanced EDA. See specific sections below.
**Step 3: Generate structure** - Apply advanced scaffold matching task complexity. See [2. Causal Inference Methods](#2-causal-inference-methods), [3. Predictive Modeling Pipeline](#3-predictive-modeling-pipeline), [4. Property-Based Testing](#4-property-based-testing), [5. Advanced EDA Techniques](#5-advanced-eda-techniques).
**Step 4: Validate** - Check assumptions, sensitivity analysis, robustness checks using [6. Advanced Validation Patterns](#6-advanced-validation-patterns).
**Step 5: Deliver** - Present with caveats, limitations, and recommendations for further analysis.
## 1. When to Use Advanced Methods
| Task Characteristic | Standard Template | Advanced Methodology |
|---------------------|-------------------|---------------------|
| **Causal question** | "Does X correlate with Y?" | "Does X cause Y?" → Causal inference needed |
| **Sample size** | < 1000 rows | > 10K rows with complex patterns |
| **Model complexity** | Linear/logistic regression | Ensemble methods, neural nets, feature interactions |
| **Test sophistication** | Unit tests, integration tests | Property-based tests, mutation testing, fuzz testing |
| **Data complexity** | Clean, tabular data | Multi-modal, high-dimensional, unstructured data |
| **Stakes** | Low (exploratory) | High (production ML, regulatory compliance) |
## 2. Causal Inference Methods
Use when the research question is "Does X **cause** Y?", not merely "Are X and Y correlated?"
### Causal Inference Scaffold
```python
# CAUSAL INFERENCE SCAFFOLD
# 1. DRAW CAUSAL DAG (Directed Acyclic Graph)
# Explicitly model: Treatment → Outcome, Confounders → Treatment & Outcome
#
# Example:
#   Education → Income
#   Family Background → Education  and  Family Background → Income  (confounder)
#
# Treatment: Education
# Outcome: Income
# Confounder: Family Background (affects both education and income)
# 2. IDENTIFY CONFOUNDERS
confounders = ['age', 'gender', 'family_income', 'region']
# These variables affect BOTH treatment and outcome
# If not controlled, they bias causal estimate
# 3. CHECK IDENTIFICATION ASSUMPTIONS
# For causal effect to be identifiable:
# a) No unmeasured confounders (all variables in DAG observed)
# b) Treatment assignment as-if random conditional on confounders
# c) Positivity: Every unit has nonzero probability of treatment/control
# 4. CHOOSE IDENTIFICATION STRATEGY
# Option A: RCT - Random assignment eliminates confounding. Check balance on confounders.
from scipy import stats
for var in confounders:
    _, p = stats.ttest_ind(treatment_group[var], control_group[var])
    print(f"{var}: {'✓' if p > 0.05 else '✗ not'} balanced")
# Option B: Regression - Control for confounders. Assumes no unmeasured confounding.
import statsmodels.formula.api as smf
model = smf.ols('outcome ~ treatment + age + gender + family_income', data=df).fit()
treatment_effect = model.params['treatment']
# Option C: Propensity Score Matching - Match treated to similar controls on P(treatment|X).
from sklearn.linear_model import LogisticRegression; from sklearn.neighbors import NearestNeighbors
ps_model = LogisticRegression().fit(df[confounders], df['treatment'])
df['ps'] = ps_model.predict_proba(df[confounders])[:,1]
treated, control = df[df['treatment']==1], df[df['treatment']==0]
nn = NearestNeighbors(n_neighbors=1).fit(control[['ps']])
_, indices = nn.kneighbors(treated[['ps']])
treatment_effect = treated['outcome'].mean() - control.iloc[indices.flatten()]['outcome'].mean()
# Option D: IV - Need instrument Z: affects treatment, not outcome (except through treatment).
from statsmodels.sandbox.regression.gmm import IV2SLS
iv_model = IV2SLS(df['income'], df[['education'] + confounders], df[['instrument'] + confounders]).fit()
# Option E: RDD - Treatment assigned at cutoff. Compare units just above/below threshold.
df['above_cutoff'] = (df['running_var'] >= cutoff).astype(int)
# Use local linear regression around cutoff to estimate effect
# Option F: DiD - Compare treatment vs control, before vs after. Assumes parallel trends.
t_before, t_after = df[(df['group']=='T') & (df['time']=='before')]['y'].mean(), df[(df['group']=='T') & (df['time']=='after')]['y'].mean()
c_before, c_after = df[(df['group']=='C') & (df['time']=='before')]['y'].mean(), df[(df['group']=='C') & (df['time']=='after')]['y'].mean()
did_estimate = (t_after - t_before) - (c_after - c_before)
# 5. SENSITIVITY ANALYSIS
print("\n=== SENSITIVITY CHECKS ===")
print("1. Unmeasured confounding: How strong would confounder need to be to change conclusion?")
print("2. Placebo tests: Check for effect in period before treatment (should be zero)")
print("3. Falsification tests: Check for effect on outcome that shouldn't be affected")
print("4. Robustness: Try different model specifications, subsamples, bandwidths (RDD)")
# 6. REPORT CAUSAL ESTIMATE WITH UNCERTAINTY
print(f"\nCausal Effect: {treatment_effect:.3f}")
print(f"95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")
print(f"Interpretation: Treatment X causes {treatment_effect:.1%} change in outcome Y")
print(f"Assumptions: [List key identifying assumptions]")
print(f"Limitations: [Threats to validity]")
```
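The Option F difference-in-means gives a point estimate only; the same DiD design can also be fit as a regression with an interaction term, which yields standard errors. A sketch, assuming a long-format `df` with binary `treat` and `post` indicators and outcome `y`:
```python
# Hedged sketch: DiD as a regression with an interaction term
import statsmodels.formula.api as smf

did_model = smf.ols('y ~ treat + post + treat:post', data=df).fit(cov_type='HC1')
did_estimate = did_model.params['treat:post']   # the DiD effect
did_se = did_model.bse['treat:post']
print(f"DiD estimate: {did_estimate:.3f} (robust SE {did_se:.3f})")
```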
### Causal Inference Checklist
- [ ] **Causal question clearly stated**: "Does X cause Y?" not "Are X and Y related?"
- [ ] **DAG drawn**: Treatment, outcome, confounders, mediators identified
- [ ] **Identification strategy chosen**: RCT, regression, PS matching, IV, RDD, DiD
- [ ] **Assumptions checked**: No unmeasured confounding, positivity, parallel trends (DiD), etc.
- [ ] **Sensitivity analysis**: Test robustness to violations of assumptions
- [ ] **Limitations acknowledged**: Threats to internal/external validity stated
## 3. Predictive Modeling Pipeline
Use for forecasting, classification, or regression, where the goal is prediction rather than causal understanding.
### Predictive Modeling Scaffold
```python
# PREDICTIVE MODELING SCAFFOLD
# 1. DEFINE PREDICTION TASK & METRIC
task = "Predict customer churn (binary classification)"
primary_metric = "F1-score" # Balance precision/recall
secondary_metrics = ["AUC-ROC", "precision", "recall", "accuracy"]
# 2. TRAIN/VAL/TEST SPLIT (before any preprocessing!)
from sklearn.model_selection import train_test_split
# Split: 60% train, 20% validation, 20% test
train_val, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['target'])
train, val = train_test_split(train_val, test_size=0.25, random_state=42, stratify=train_val['target'])
print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")
print(f"Class balance - Train: {train['target'].mean():.2%}, Test: {test['target'].mean():.2%}")
# 3. FEATURE ENGINEERING (fit on train, transform train/val/test)
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Numeric features: impute missing, standardize
numeric_features = ['age', 'income', 'tenure']
num_imputer = SimpleImputer(strategy='median').fit(train[numeric_features])
num_scaler = StandardScaler().fit(num_imputer.transform(train[numeric_features]))
X_train_num = num_scaler.transform(num_imputer.transform(train[numeric_features]))
X_val_num = num_scaler.transform(num_imputer.transform(val[numeric_features]))
X_test_num = num_scaler.transform(num_imputer.transform(test[numeric_features]))
# Categorical features: one-hot encode
from sklearn.preprocessing import OneHotEncoder
cat_features = ['region', 'product_type']
cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(train[cat_features])  # use sparse=False on scikit-learn < 1.2
X_train_cat = cat_encoder.transform(train[cat_features])
X_val_cat = cat_encoder.transform(val[cat_features])
X_test_cat = cat_encoder.transform(test[cat_features])
# Combine features
import numpy as np
import pandas as pd  # needed for the feature-importance table below
X_train = np.hstack([X_train_num, X_train_cat])
X_val = np.hstack([X_val_num, X_val_cat])
X_test = np.hstack([X_test_num, X_test_cat])
y_train, y_val, y_test = train['target'], val['target'], test['target']
# 4. BASELINE MODEL (always start simple!)
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
baseline = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
baseline_f1 = f1_score(y_val, baseline.predict(X_val))
print(f"Baseline F1: {baseline_f1:.3f}")
# 5. MODEL SELECTION & HYPERPARAMETER TUNING
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
# Try multiple models
models = {
'Logistic Regression': LogisticRegression(max_iter=1000),
'Random Forest': RandomForestClassifier(random_state=42),
'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}
results = {}
for name, model in models.items():
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)[:,1]
results[name] = {
'F1': f1_score(y_val, y_pred),
'AUC': roc_auc_score(y_val, y_proba),
'Precision': precision_score(y_val, y_pred),
'Recall': recall_score(y_val, y_pred)
}
print(f"{name}: F1={results[name]['F1']:.3f}, AUC={results[name]['AUC']:.3f}")
# Select best model (highest F1 on validation)
best_model_name = max(results, key=lambda x: results[x]['F1'])
best_model = models[best_model_name]
print(f"\nBest model: {best_model_name}")
# Hyperparameter tuning on best model
if best_model_name == 'Random Forest':
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [10, 20, None],
'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(best_model, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best params: {grid_search.best_params_}")
# 6. CROSS-VALIDATION (check for overfitting)
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='f1')
print(f"CV F1 scores: {cv_scores}")
print(f"Mean: {cv_scores.mean():.3f}, Std: {cv_scores.std():.3f}")
# 7. FINAL EVALUATION ON TEST SET (only once!)
y_test_pred = best_model.predict(X_test)
y_test_proba = best_model.predict_proba(X_test)[:,1]
test_f1 = f1_score(y_test, y_test_pred)
test_auc = roc_auc_score(y_test, y_test_proba)
print(f"\n=== FINAL TEST PERFORMANCE ===")
print(f"F1: {test_f1:.3f}, AUC: {test_auc:.3f}")
# 8. ERROR ANALYSIS
from sklearn.metrics import confusion_matrix, classification_report
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))
# Analyze misclassifications
test_df = test.copy()
test_df['prediction'] = y_test_pred
test_df['prediction_proba'] = y_test_proba
false_positives = test_df[(test_df['target']==0) & (test_df['prediction']==1)]
false_negatives = test_df[(test_df['target']==1) & (test_df['prediction']==0)]
print(f"False Positives: {len(false_positives)}")
print(f"False Negatives: {len(false_negatives)}")
# Inspect these cases to understand failure modes
# 9. FEATURE IMPORTANCE
if hasattr(best_model, 'feature_importances_'):
feature_names = numeric_features + list(cat_encoder.get_feature_names_out(cat_features))
importances = pd.DataFrame({
'feature': feature_names,
'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)
print("\nTop 10 Features:")
print(importances.head(10))
# 10. MODEL DEPLOYMENT CHECKLIST
print("\n=== DEPLOYMENT READINESS ===")
print(f"✓ Test F1 ({test_f1:.3f}) > Baseline ({baseline_f1:.3f})")
print(f"✓ Cross-validation shows consistent performance (CV std={cv_scores.std():.3f})")
print("✓ Error analysis completed, failure modes understood")
print("✓ Feature importance computed, no surprising features")
print("□ Model serialized and saved")
print("□ Monitoring plan in place (track drift in input features, output distribution)")
print("□ Rollback plan if model underperforms in production")
```
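The manual imputer/scaler/encoder steps in section 3 can also be packaged in a `ColumnTransformer` inside a `Pipeline`, which guarantees every transformer is fit on training folds only. A sketch using the same feature lists; the classifier choice is illustrative:
```python
# Hedged sketch: the section-3 preprocessing expressed as a leak-proof pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

preprocess = ColumnTransformer([
    ('num', Pipeline([('impute', SimpleImputer(strategy='median')),
                      ('scale', StandardScaler())]), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
])

model = Pipeline([('prep', preprocess),
                  ('clf', RandomForestClassifier(random_state=42))])

# Cross-validation now refits the preprocessing inside every fold (no leakage)
scores = cross_val_score(model, train[numeric_features + cat_features], train['target'],
                         cv=5, scoring='f1')
print(f"Pipeline CV F1: {scores.mean():.3f} ± {scores.std():.3f}")
```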
### Predictive Modeling Checklist
- [ ] **Clear prediction task**: Classification, regression, time series forecasting
- [ ] **Appropriate metrics**: Match business objectives (precision vs recall tradeoff, etc.)
- [ ] **Train/val/test split**: Before any preprocessing (no data leakage)
- [ ] **Baseline model**: Simple model for comparison
- [ ] **Feature engineering**: Proper handling of missing values, scaling, encoding
- [ ] **Cross-validation**: k-fold CV to check for overfitting
- [ ] **Model selection**: Compare multiple model types
- [ ] **Hyperparameter tuning**: Grid/random search on validation set
- [ ] **Error analysis**: Understand failure modes, inspect misclassifications
- [ ] **Test set evaluation**: Final performance check (only once!)
- [ ] **Deployment readiness**: Monitoring, rollback plan, model versioning
## 4. Property-Based Testing
Use for testing complex logic, data transformations, invariants. Goes beyond example-based tests.
### Property-Based Testing Scaffold
```python
# PROPERTY-BASED TESTING SCAFFOLD
from hypothesis import given, strategies as st
import pytest
# Example: Testing a sort function
def my_sort(lst):
return sorted(lst)
# Property 1: Output length equals input length
@given(st.lists(st.integers()))
def test_sort_preserves_length(lst):
assert len(my_sort(lst)) == len(lst)
# Property 2: Output is sorted (each element <= next element)
@given(st.lists(st.integers()))
def test_sort_is_sorted(lst):
result = my_sort(lst)
for i in range(len(result) - 1):
assert result[i] <= result[i+1]
# Property 3: Output contains same elements as input (multiset equality)
@given(st.lists(st.integers()))
def test_sort_preserves_elements(lst):
result = my_sort(lst)
assert sorted(lst) == sorted(result) # Canonical form comparison
# Property 4: Idempotence (sorting twice = sorting once)
@given(st.lists(st.integers()))
def test_sort_is_idempotent(lst):
result = my_sort(lst)
assert my_sort(result) == result
# Property 5: Empty input → empty output
def test_sort_empty_list():
assert my_sort([]) == []
# Property 6: Single element → unchanged
@given(st.integers())
def test_sort_single_element(x):
assert my_sort([x]) == [x]
```
### Property-Based Testing Strategies
**For data transformations:**
- Idempotence: `f(f(x)) == f(x)`
- Round-trip: `decode(encode(x)) == x` (see the sketch after this list)
- Commutativity: `f(g(x)) == g(f(x))`
- Invariants: Properties that never change (e.g., sum after transformation)
**For numeric functions:**
- Boundary conditions: Zero, negative, very large numbers
- Inverse relationships: `f(f_inverse(x)) ≈ x`
- Known identities: `sin²(x) + cos²(x) = 1`
**For string/list operations:**
- Length preservation or predictable change
- Character/element preservation
- Order properties (sorted, reversed)
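A concrete instance of the round-trip property listed above, using JSON serialization as the transformation (illustrative; any encode/decode pair works the same way):
```python
# Hedged sketch: round-trip property for a JSON encode/decode pair
import json
from hypothesis import given, strategies as st

@given(st.dictionaries(keys=st.text(), values=st.integers()))
def test_json_round_trip(d):
    assert json.loads(json.dumps(d)) == d  # decode(encode(x)) == x
```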
## 5. Advanced EDA Techniques
For high-dimensional, multi-modal, or complex data.
### Dimensionality Reduction
```python
# PCA: Linear dimensionality reduction
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
print(f"Explained variance: {pca.explained_variance_ratio_}")
# t-SNE: Non-linear, good for visualization
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)
plt.scatter(X_tsne[:,0], X_tsne[:,1], c=y, cmap='viridis'); plt.show()
# UMAP: Faster alternative to t-SNE, preserves global structure
# pip install umap-learn
import umap
reducer = umap.UMAP(n_components=2, random_state=42)
X_umap = reducer.fit_transform(X_scaled)
```
### Cluster Analysis
```python
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
# Elbow method: Find optimal K
inertias = []
for k in range(2, 11):
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_scaled)
inertias.append(kmeans.inertia_)
plt.plot(range(2, 11), inertias); plt.xlabel('K'); plt.ylabel('Inertia'); plt.show()
# Silhouette score: Measure cluster quality
for k in range(2, 11):
kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
score = silhouette_score(X_scaled, kmeans.labels_)
print(f"K={k}: Silhouette={score:.3f}")
# DBSCAN: Density-based clustering (finds arbitrary shapes)
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(X_scaled)
print(f"Clusters found: {len(set(clusters)) - (1 if -1 in clusters else 0)}")
print(f"Noise points: {(clusters == -1).sum()}")
```
## 6. Advanced Validation Patterns
### Mutation Testing
Tests the quality of your tests by introducing bugs and checking if tests catch them.
```python
# Install: pip install mutmut
# Run: mutmut run --paths-to-mutate=src/
# Check: mutmut results
# Survivors (mutations not caught) indicate weak tests
```
### Fuzz Testing
Generate random/malformed inputs to find edge cases.
```python
from hypothesis import given, strategies as st
@given(st.text())
def test_function_doesnt_crash_on_any_string(s):
result = my_function(s) # Should never raise exception
assert result is not None
```
### Data Validation Framework (Great Expectations)
```python
import great_expectations as gx
# Define expectations
expectation_suite = gx.ExpectationSuite(name="my_data_suite")
expectation_suite.add_expectation(gx.expectations.ExpectColumnToExist(column="user_id"))
expectation_suite.add_expectation(gx.expectations.ExpectColumnValuesToNotBeNull(column="user_id"))
expectation_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column="age", min_value=0, max_value=120))
# Validate data against a batch (the exact run API depends on your Great Expectations
# version; `context` and `batch_request` are assumed to come from your GX project setup)
results = context.run_validation(batch_request, expectation_suite)  # illustrative call
print(results["success"])  # True if all expectations met
```
## 7. When to Use Each Method
| Research Goal | Method | Key Consideration |
|---------------|--------|-------------------|
| Causal effect estimation | RCT, IV, RDD, DiD | Identify confounders, check assumptions |
| Prediction/forecasting | Supervised ML | Avoid data leakage, validate out-of-sample |
| Pattern discovery | Clustering, PCA, t-SNE | Dimensionality reduction first if high-D |
| Complex logic testing | Property-based testing | Define invariants that must hold |
| Data quality | Great Expectations | Automate checks in pipelines |


@@ -0,0 +1,391 @@
# Code Data Analysis Scaffolds Template
## Workflow
Copy this checklist and track your progress:
```
Code Data Analysis Scaffolds Progress:
- [ ] Step 1: Clarify task and objectives
- [ ] Step 2: Choose appropriate scaffold type
- [ ] Step 3: Generate scaffold structure
- [ ] Step 4: Validate scaffold completeness
- [ ] Step 5: Deliver scaffold and guide execution
```
**Step 1: Clarify task** - Ask context questions to understand task type, constraints, expected outcomes. See [Context Questions](#context-questions).
**Step 2: Choose scaffold** - Select TDD, EDA, Statistical Analysis, or Validation based on task. See [Scaffold Selection Guide](#scaffold-selection-guide).
**Step 3: Generate structure** - Use appropriate scaffold template. See [TDD Scaffold](#tdd-scaffold), [EDA Scaffold](#eda-scaffold), [Statistical Analysis Scaffold](#statistical-analysis-scaffold), or [Validation Scaffold](#validation-scaffold).
**Step 4: Validate completeness** - Check scaffold covers requirements, includes validation steps, makes assumptions explicit. See [Quality Checklist](#quality-checklist).
**Step 5: Deliver and guide** - Present scaffold, highlight next steps, surface any gaps discovered. Execute if user wants help.
## Context Questions
**For all tasks:**
- What are you trying to accomplish? (Specific outcome expected)
- What's the context? (Dataset characteristics, codebase state, existing work)
- Any constraints? (Time, tools, data limitations, performance requirements)
- What does success look like? (Acceptance criteria, quality bar)
**For TDD tasks:**
- What functionality needs tests? (Feature, bug fix, refactor)
- Existing test coverage? (None, partial, comprehensive)
- Test framework preference? (pytest, jest, junit, etc.)
- Integration vs unit tests? (Scope of testing)
**For EDA tasks:**
- What's the dataset? (Size, format, source)
- What questions are you trying to answer? (Exploratory vs. hypothesis-driven)
- Existing knowledge about data? (Schema, distributions, known issues)
- End goal? (Feature engineering, quality assessment, insights)
**For Statistical/Modeling tasks:**
- What's the research question? (Descriptive, predictive, causal)
- Available data? (Sample size, variables, treatment/control)
- Causal or predictive goal? (Understanding why vs. forecasting what)
- Significance level / acceptable error rate?
## Scaffold Selection Guide
| User Says | Task Type | Scaffold to Use |
|-----------|-----------|-----------------|
| "Write tests for..." | TDD | [TDD Scaffold](#tdd-scaffold) |
| "Explore this dataset..." | EDA | [EDA Scaffold](#eda-scaffold) |
| "Analyze the effect of..." / "Does X cause Y?" | Causal Inference | See methodology.md |
| "Predict..." / "Classify..." / "Forecast..." | Predictive Modeling | See methodology.md |
| "Design an A/B test..." / "Compare groups..." | Statistical Analysis | [Statistical Analysis Scaffold](#statistical-analysis-scaffold) |
| "Validate..." / "Check quality..." | Validation | [Validation Scaffold](#validation-scaffold) |
## TDD Scaffold
Use when writing new code, refactoring, or fixing bugs. **Write tests FIRST, then implement.**
### Quick Template
```python
# Test file: test_[module].py
import pytest
from [module] import [function_to_test]
# 1. HAPPY PATH TESTS (expected usage)
def test_[function]_with_valid_input():
"""Test normal, expected behavior"""
result = [function](valid_input)
assert result == expected_output
assert result.property == expected_value
# 2. EDGE CASE TESTS (boundary conditions)
def test_[function]_with_empty_input():
"""Test with empty/minimal input"""
result = [function]([])
assert result == expected_for_empty
def test_[function]_with_maximum_input():
"""Test with large/maximum input"""
result = [function](large_input)
assert result is not None
# 3. ERROR CONDITION TESTS (invalid input, expected failures)
def test_[function]_with_invalid_input():
"""Test proper error handling"""
with pytest.raises(ValueError):
[function](invalid_input)
def test_[function]_with_none_input():
"""Test None handling"""
with pytest.raises(TypeError):
[function](None)
# 4. STATE TESTS (if function modifies state)
def test_[function]_modifies_state_correctly():
"""Test side effects are correct"""
obj = Object()
obj.[function](param)
assert obj.state == expected_state
# 5. INTEGRATION TESTS (if interacting with external systems)
@pytest.fixture
def mock_external_service():
"""Mock external dependencies"""
return Mock(spec=ExternalService)
def test_[function]_with_external_service(mock_external_service):
"""Test integration points"""
result = [function](mock_external_service)
mock_external_service.method.assert_called_once()
assert result == expected_from_integration
```
### Test Data Setup
```python
# conftest.py or test fixtures
@pytest.fixture
def sample_data():
"""Reusable test data"""
return {
"valid": [...],
"edge_case": [...],
"invalid": [...]
}
@pytest.fixture(scope="session")
def database_session():
"""Database for integration tests"""
db = create_test_db()
yield db
db.cleanup()
```
### TDD Cycle
1. **Red**: Write failing test (defines what success looks like)
2. **Green**: Write minimal code to make test pass
3. **Refactor**: Improve code while keeping tests green
4. **Repeat**: Next test case
## EDA Scaffold
Use when exploring a new dataset. Follow a systematic plan to understand data quality and patterns.
### Quick Template
```python
# 1. DATA OVERVIEW
# Load and inspect
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_[format]('data.csv')
# Basic info
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(df.dtypes)
print(df.head())
print(df.info())
print(df.describe())
# 2. DATA QUALITY CHECKS
# Missing values
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
print(missing_pct[missing_pct > 0])
# Duplicates
print(f"Duplicates: {df.duplicated().sum()}")
# Data types consistency
print("Check: Are numeric columns actually numeric?")
print("Check: Are dates parsed correctly?")
print("Check: Are categorical variables encoded properly?")
# 3. UNIVARIATE ANALYSIS
# Numeric: mean, median, std, range, distribution plots, outliers (IQR method)
for col in df.select_dtypes(include=[np.number]).columns:
print(f"{col}: mean={df[col].mean():.2f}, median={df[col].median():.2f}, std={df[col].std():.2f}")
df[col].hist(bins=50); plt.title(f'{col} Distribution'); plt.show()
Q1, Q3 = df[col].quantile([0.25, 0.75])
outliers = ((df[col] < (Q1 - 1.5*(Q3-Q1))) | (df[col] > (Q3 + 1.5*(Q3-Q1)))).sum()
print(f" Outliers: {outliers} ({outliers/len(df)*100:.1f}%)")
# Categorical: value counts, unique values, bar plots
for col in df.select_dtypes(include=['object', 'category']).columns:
print(f"{col}: {df[col].nunique()} unique, most common={df[col].mode()[0]}")
df[col].value_counts().head(10).plot(kind='bar'); plt.show()
# 4. BIVARIATE ANALYSIS
# Correlation heatmap, pairplots, categorical vs numeric boxplots
sns.heatmap(df.select_dtypes(include=[np.number]).corr(), annot=True, cmap='coolwarm')
sns.pairplot(df[['var1', 'var2', 'var3', 'target']], hue='target'); plt.show()
# For each categorical-numeric pair, create boxplots to see distributions
# 5. INSIGHTS & NEXT STEPS
print("\n=== KEY FINDINGS ===")
print("1. Data quality: [summary]")
print("2. Distributions: [any skewness, outliers]")
print("3. Correlations: [strong relationships found]")
print("4. Missing patterns: [systematic missingness?]")
print("\n=== RECOMMENDED ACTIONS ===")
print("1. Handle missing data: [imputation strategy]")
print("2. Address outliers: [cap, remove, transform]")
print("3. Feature engineering: [ideas based on EDA]")
print("4. Data transformations: [log, standardize, encode]")
```
### EDA Checklist
- [ ] Load data and check shape/dtypes
- [ ] Assess missing values (how much, which variables, patterns?)
- [ ] Check for duplicates
- [ ] Validate data types (numeric, categorical, dates)
- [ ] Univariate analysis (distributions, outliers, summary stats)
- [ ] Bivariate analysis (correlations, relationships with target)
- [ ] Identify data quality issues
- [ ] Document insights and recommended next steps
## Statistical Analysis Scaffold
Use for hypothesis testing, A/B tests, comparing groups.
### Quick Template
```python
# STATISTICAL ANALYSIS SCAFFOLD
# 1. DEFINE RESEARCH QUESTION
question = "Does treatment X improve outcome Y?"
# 2. STATE HYPOTHESES
H0 = "Treatment X has no effect on outcome Y (null hypothesis)"
H1 = "Treatment X improves outcome Y (alternative hypothesis)"
# 3. SET SIGNIFICANCE LEVEL
alpha = 0.05 # 5% significance level (Type I error rate)
power = 0.80 # 80% power (1 - Type II error rate)
# 4. CHECK ASSUMPTIONS (t-test: independence, normality, equal variance)
from scipy import stats
_, p_norm = stats.shapiro(treatment_group) # Normality test
_, p_var = stats.levene(treatment_group, control_group) # Equal variance test
print(f"Normality: p={p_norm:.3f} {'' if p_norm > 0.05 else '✗ use non-parametric'}")
print(f"Equal variance: p={p_var:.3f} {'' if p_var > 0.05 else '✗ use Welch t-test'}")
# 5. PERFORM STATISTICAL TEST
# Choose appropriate test based on data type and assumptions
# For continuous outcome, 2 groups:
statistic, p_value = stats.ttest_ind(treatment_group, control_group)
print(f"t-statistic: {statistic:.3f}, p-value: {p_value:.4f}")
# For categorical outcome, 2 groups:
from scipy.stats import chi2_contingency
contingency_table = pd.crosstab(df['group'], df['outcome'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-square: {chi2:.3f}, p-value: {p_value:.4f}")
# 6. INTERPRET RESULTS & EFFECT SIZE
if p_value < alpha:
    # pooled SD (simple average of variances; reasonable when group sizes are similar)
    pooled_std = ((treatment_group.std()**2 + control_group.std()**2) / 2) ** 0.5
    cohen_d = (treatment_group.mean() - control_group.mean()) / pooled_std
    effect = "Small" if abs(cohen_d) < 0.5 else "Medium" if abs(cohen_d) < 0.8 else "Large"
    print(f"REJECT H0 (p={p_value:.4f}). Effect size (Cohen's d)={cohen_d:.3f} ({effect})")
else:
    print(f"FAIL TO REJECT H0 (p={p_value:.4f}). Insufficient evidence for effect.")
# 7. CONFIDENCE INTERVAL & SENSITIVITY
ci_95 = stats.t.interval(0.95, len(treatment_group)-1, loc=treatment_group.mean(), scale=stats.sem(treatment_group))
print(f"95% CI: [{ci_95[0]:.2f}, {ci_95[1]:.2f}]")
print("Sensitivity: Check without outliers, with non-parametric test, with confounders")
```
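The `alpha`/`power` settings above imply a minimum sample size; a minimal sketch of the corresponding power calculation with statsmodels (the 0.3 effect size is an assumed planning value):
```python
# Hedged sketch: required sample size per group for a two-sample t-test
from statsmodels.stats.power import TTestIndPower

n_per_group = TTestIndPower().solve_power(effect_size=0.3, alpha=0.05, power=0.80)
print(f"Required n per group: {n_per_group:.0f}")
```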
### Statistical Test Selection
| Data Type | # Groups | Test |
|-----------|----------|------|
| Continuous | 2 | t-test (or Welch's if unequal variance) |
| Continuous | 3+ | ANOVA (or Kruskal-Wallis if non-normal) |
| Categorical | 2 | Chi-square or Fisher's exact |
| Ordinal | 2 | Mann-Whitney U |
| Paired/Repeated | 2 | Paired t-test or Wilcoxon signed-rank |
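A minimal sketch of the two most common fallbacks from the table, using the same `treatment_group`/`control_group` arrays as the scaffold above:
```python
from scipy import stats

# Welch's t-test: continuous outcome, 2 groups, unequal variances
t_stat, p_welch = stats.ttest_ind(treatment_group, control_group, equal_var=False)

# Mann-Whitney U: ordinal or non-normal continuous outcome, 2 groups
u_stat, p_mwu = stats.mannwhitneyu(treatment_group, control_group, alternative='two-sided')

print(f"Welch t-test: p={p_welch:.4f}; Mann-Whitney U: p={p_mwu:.4f}")
```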
## Validation Scaffold
Use for validating data quality, code quality, or model quality before shipping.
### Data Validation Template
```python
# DATA VALIDATION CHECKLIST
# 1. SCHEMA VALIDATION
expected_columns = ['id', 'timestamp', 'value', 'category']
assert set(df.columns) == set(expected_columns), "Column mismatch"
expected_dtypes = {'id': 'int64', 'timestamp': 'datetime64[ns]', 'value': 'float64', 'category': 'object'}
for col, dtype in expected_dtypes.items():
assert df[col].dtype == dtype, f"{col} type mismatch: expected {dtype}, got {df[col].dtype}"
# 2. RANGE VALIDATION
assert df['value'].min() >= 0, "Negative values found (should be >= 0)"
assert df['value'].max() <= 100, "Values exceed maximum (should be <= 100)"
# 3. UNIQUENESS VALIDATION
assert df['id'].is_unique, "Duplicate IDs found"
# 4. COMPLETENESS VALIDATION
required_fields = ['id', 'value']
for field in required_fields:
missing_pct = df[field].isnull().mean() * 100
assert missing_pct == 0, f"{field} has {missing_pct:.1f}% missing (required field)"
# 5. CONSISTENCY VALIDATION
assert (df['start_date'] <= df['end_date']).all(), "start_date after end_date found"
# 6. REFERENTIAL INTEGRITY
valid_categories = ['A', 'B', 'C']
assert df['category'].isin(valid_categories).all(), "Invalid categories found"
print("✓ All data validations passed")
```
### Code Validation Checklist
- [ ] **Unit tests**: All functions have tests covering happy path, edge cases, errors
- [ ] **Integration tests**: APIs, database interactions tested end-to-end
- [ ] **Test coverage**: ≥80% coverage for critical paths
- [ ] **Error handling**: All exceptions caught and handled gracefully
- [ ] **Input validation**: All user inputs validated before processing (see the sketch after this checklist)
- [ ] **Logging**: Key operations logged for debugging
- [ ] **Documentation**: Functions have docstrings, README updated
- [ ] **Performance**: No obvious performance bottlenecks (profiled if needed)
- [ ] **Security**: No hardcoded secrets, SQL injection protected, XSS prevented
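A minimal sketch of the input-validation item, using a hypothetical `create_user` helper (names and rules are illustrative):
```python
# Hedged sketch: validate user input before any processing (hypothetical helper)
import re

def create_user(email: str, age: int) -> dict:
    if not isinstance(email, str) or not re.fullmatch(r"[^@\s]+@[^@\s]+\.[^@\s]+", email):
        raise ValueError(f"Invalid email: {email!r}")
    if not isinstance(age, int) or not (0 <= age <= 120):
        raise ValueError(f"Age out of range: {age!r}")
    return {"email": email.lower(), "age": age}
```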
### Model Validation Checklist
- [ ] **Train/val/test split**: Data split before any preprocessing (no data leakage)
- [ ] **Baseline model**: Simple baseline implemented for comparison
- [ ] **Cross-validation**: k-fold CV performed (k≥5)
- [ ] **Metrics**: Appropriate metrics chosen (accuracy, precision/recall, AUC, RMSE, etc.)
- [ ] **Overfitting check**: Training vs validation performance compared
- [ ] **Error analysis**: Failure modes analyzed, edge cases identified
- [ ] **Fairness**: Model checked for bias across sensitive groups
- [ ] **Interpretability**: Feature importance or SHAP values computed
- [ ] **Robustness**: Model tested with perturbed inputs
- [ ] **Monitoring**: Drift detection and performance tracking in place
## Quality Checklist
Before delivering, verify:
**Scaffold Structure:**
- [ ] Clear step-by-step process defined
- [ ] Each step has concrete actions (not vague advice)
- [ ] Validation checkpoints included
- [ ] Expected outputs specified
**Completeness:**
- [ ] Covers all requirements from user's task
- [ ] Includes example code/pseudocode where helpful
- [ ] Anticipates edge cases and error conditions
- [ ] Provides decision guidance (when to use which approach)
**Clarity:**
- [ ] Assumptions stated explicitly
- [ ] Technical terms defined or illustrated
- [ ] Success criteria clear
- [ ] Next steps obvious
**Actionability:**
- [ ] User can execute scaffold without further guidance
- [ ] Code snippets are runnable (or nearly runnable)
- [ ] Gaps surfaced early (missing data, unclear requirements)
- [ ] Includes validation/quality checks
**Rubric Score:**
- [ ] Self-assessed with rubric ≥ 3.5 average