Initial commit
{
|
||||
"criteria": [
|
||||
{
|
||||
"name": "Scaffold Structure Clarity",
|
||||
"description": "Is the scaffold structure clear, systematic, and easy to follow?",
|
||||
"scoring": {
|
||||
"1": "No clear structure. Random collection of steps/checks without logical flow.",
|
||||
"2": "Basic structure but steps are vague or out of order. User confused about what to do next.",
|
||||
"3": "Clear structure with defined steps. User can follow but may need clarification on some steps.",
|
||||
"4": "Well-organized structure with clear steps, checkpoints, and expected outputs at each stage.",
|
||||
"5": "Exemplary structure: systematic, numbered steps with clear inputs/outputs, decision points explicit."
|
||||
},
|
||||
"red_flags": [
|
||||
"Steps not numbered or sequenced",
|
||||
"No clear starting/ending point",
|
||||
"Validation steps missing",
|
||||
"User must guess what to do next"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Coverage Completeness",
|
||||
"description": "Does the scaffold cover all necessary aspects (happy path, edge cases, validation, etc.)?",
|
||||
"scoring": {
|
||||
"1": "Major gaps. Only covers happy path, ignores edge cases/errors/validation.",
|
||||
"2": "Partial coverage. Addresses main case but misses important edge cases or validation steps.",
|
||||
"3": "Adequate coverage. Main cases and some edge cases covered. Basic validation included.",
|
||||
"4": "Comprehensive coverage. Happy path, edge cases, error conditions, validation all included.",
|
||||
"5": "Exhaustive coverage. All cases, validation at each step, robustness checks, limitations documented."
|
||||
},
|
||||
"red_flags": [
|
||||
"TDD scaffold: No tests for edge cases or errors",
|
||||
"EDA scaffold: Missing data quality checks",
|
||||
"Statistical scaffold: No assumption checks",
|
||||
"Any scaffold: No validation step before delivering"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Technical Rigor",
|
||||
"description": "Is the approach technically sound with appropriate methods/tests?",
|
||||
"scoring": {
|
||||
"1": "Technically incorrect. Wrong methods, flawed logic, or inappropriate techniques.",
|
||||
"2": "Questionable rigor. Some techniques correct but others questionable or missing justification.",
|
||||
"3": "Adequate rigor. Standard techniques applied correctly. Acceptable for routine work.",
|
||||
"4": "High rigor. Appropriate methods, assumptions checked, sensitivity analysis included.",
|
||||
"5": "Exemplary rigor. Best practices followed, multiple validation approaches, limitations acknowledged."
|
||||
},
|
||||
"red_flags": [
|
||||
"Causal inference without DAG or identification strategy",
|
||||
"Statistical test without checking assumptions",
|
||||
"ML model without train/val/test split (data leakage)",
|
||||
"TDD without testing error conditions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Actionability",
|
||||
"description": "Can user execute scaffold without further guidance? Are examples concrete?",
|
||||
"scoring": {
|
||||
"1": "Not actionable. Vague advice, no concrete steps, no code examples.",
|
||||
"2": "Somewhat actionable. General direction but user needs to figure out details.",
|
||||
"3": "Actionable. Clear steps with code snippets. User can execute with minor adjustments.",
|
||||
"4": "Highly actionable. Complete code examples, data assumptions stated, ready to adapt.",
|
||||
"5": "Immediately executable. Copy-paste ready examples with inline comments, expected outputs shown."
|
||||
},
|
||||
"red_flags": [
|
||||
"No code examples (just prose descriptions)",
|
||||
"Code has placeholders without explaining what to fill in",
|
||||
"No example inputs/outputs",
|
||||
"Vague instructions ('check assumptions', 'validate results' without saying how)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Test Quality (for TDD)",
|
||||
"description": "For TDD scaffolds: Do tests cover happy path, edge cases, errors, and integration?",
|
||||
"scoring": {
|
||||
"1": "Only happy path tests. No edge cases, errors, or integration tests.",
|
||||
"2": "Happy path + some edge cases. Error handling or integration missing.",
|
||||
"3": "Happy path, edge cases, basic error tests. Integration tests may be missing.",
|
||||
"4": "Comprehensive: Happy path, edge cases, error conditions, integration tests all present.",
|
||||
"5": "Exemplary: Above + property-based tests, test fixtures, mocks for external dependencies."
|
||||
},
|
||||
"red_flags": [
|
||||
"No tests for None/empty input",
|
||||
"No tests for expected exceptions",
|
||||
"No tests for state changes/side effects",
|
||||
"No integration tests for external systems"
|
||||
],
|
||||
"applicable_to": ["TDD"]
|
||||
},
|
||||
{
|
||||
"name": "Data Quality Assessment (for EDA)",
|
||||
"description": "For EDA scaffolds: Are data quality checks (missing, duplicates, outliers, consistency) included?",
|
||||
"scoring": {
|
||||
"1": "No data quality checks. Jumps straight to analysis without inspecting data.",
|
||||
"2": "Minimal checks. Maybe checks missing values but ignores duplicates, outliers, consistency.",
|
||||
"3": "Basic quality checks. Missing values, duplicates, basic outliers checked.",
|
||||
"4": "Thorough quality checks. Missing patterns, duplicates, outliers, type consistency, referential integrity.",
|
||||
"5": "Comprehensive quality framework. All checks + distributions, cardinality, data lineage, validation rules."
|
||||
},
|
||||
"red_flags": [
|
||||
"No check for missing values",
|
||||
"No check for duplicates",
|
||||
"No outlier detection",
|
||||
"Assumes data is clean without validation"
|
||||
],
|
||||
"applicable_to": ["EDA", "Statistical Analysis", "Predictive Modeling"]
|
||||
},
|
||||
{
|
||||
"name": "Assumption Documentation",
|
||||
"description": "Are assumptions explicitly stated and justified?",
|
||||
"scoring": {
|
||||
"1": "No assumptions stated. User unaware of what's being assumed.",
|
||||
"2": "Some assumptions implicit but not documented. User must infer them.",
|
||||
"3": "Key assumptions stated but not justified or validated.",
|
||||
"4": "Assumptions explicitly stated with justification. User knows what's assumed and why.",
|
||||
"5": "Assumptions stated, justified, validated where possible, and sensitivity to violations analyzed."
|
||||
},
|
||||
"red_flags": [
|
||||
"Statistical test applied without stating/checking assumptions",
|
||||
"Causal claim without stating identification assumptions",
|
||||
"ML model without documenting train/test split assumptions",
|
||||
"Function implementation without stating preconditions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Validation Steps Included",
|
||||
"description": "Does scaffold include validation/quality checks before delivering results?",
|
||||
"scoring": {
|
||||
"1": "No validation. Results delivered without any quality checks.",
|
||||
"2": "Informal validation. 'Looks good' without systematic checks.",
|
||||
"3": "Basic validation. Some checks but not comprehensive or systematic.",
|
||||
"4": "Systematic validation. Checklist of quality criteria, most items checked.",
|
||||
"5": "Rigorous validation framework. Multiple validation approaches, robustness checks, edge cases tested."
|
||||
},
|
||||
"red_flags": [
|
||||
"No validation step in workflow",
|
||||
"No rubric or checklist to assess quality",
|
||||
"No test suite execution before delivering code",
|
||||
"No sensitivity analysis for statistical results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Code/Analysis Quality",
|
||||
"description": "Is code well-structured, readable, and following best practices?",
|
||||
"scoring": {
|
||||
"1": "Poor quality. Spaghetti code, no structure, hard to understand.",
|
||||
"2": "Low quality. Works but hard to read, poor naming, no comments.",
|
||||
"3": "Adequate quality. Readable, basic structure, some comments. Acceptable for prototypes.",
|
||||
"4": "Good quality. Clean code, good naming, appropriate comments, follows style guide.",
|
||||
"5": "Excellent quality. Modular, DRY, well-documented, type hints, follows SOLID principles."
|
||||
},
|
||||
"red_flags": [
|
||||
"Magic numbers without explanation",
|
||||
"Copy-pasted code (not DRY)",
|
||||
"Functions doing multiple unrelated things",
|
||||
"No docstrings or comments explaining complex logic"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Reproducibility",
|
||||
"description": "Can another person reproduce the analysis/tests with provided information?",
|
||||
"scoring": {
|
||||
"1": "Not reproducible. Missing critical information (data, packages, random seeds).",
|
||||
"2": "Partially reproducible. Some information provided but key details missing.",
|
||||
"3": "Mostly reproducible. Enough information for skilled practitioner to reproduce with effort.",
|
||||
"4": "Reproducible. All information provided (data access, package versions, random seeds, parameters).",
|
||||
"5": "Fully reproducible. Documented environment, requirements.txt, Docker container, or notebook with all steps."
|
||||
},
|
||||
"red_flags": [
|
||||
"No package versions specified",
|
||||
"Random operations without setting seed",
|
||||
"Data source not documented or inaccessible",
|
||||
"No instructions for running tests/analysis"
|
||||
]
|
||||
}
|
||||
],
|
||||
"task_type_guidance": {
|
||||
"TDD": {
|
||||
"description": "Test-Driven Development scaffolds",
|
||||
"focus_criteria": [
|
||||
"Test Quality",
|
||||
"Code/Analysis Quality",
|
||||
"Validation Steps Included"
|
||||
],
|
||||
"target_score": 3.5,
|
||||
"success_indicators": [
|
||||
"Tests written before implementation",
|
||||
"Happy path, edge cases, errors all tested",
|
||||
"Tests pass and are maintainable",
|
||||
"Red-Green-Refactor cycle followed"
|
||||
]
|
||||
},
|
||||
"EDA": {
|
||||
"description": "Exploratory Data Analysis scaffolds",
|
||||
"focus_criteria": [
|
||||
"Data Quality Assessment",
|
||||
"Coverage Completeness",
|
||||
"Assumption Documentation"
|
||||
],
|
||||
"target_score": 3.5,
|
||||
"success_indicators": [
|
||||
"Data quality systematically checked",
|
||||
"Univariate and bivariate analysis completed",
|
||||
"Insights and recommendations documented",
|
||||
"Missing values, outliers, distributions analyzed"
|
||||
]
|
||||
},
|
||||
"Statistical Analysis": {
|
||||
"description": "Hypothesis testing, A/B tests, causal inference",
|
||||
"focus_criteria": [
|
||||
"Technical Rigor",
|
||||
"Assumption Documentation",
|
||||
"Validation Steps Included"
|
||||
],
|
||||
"target_score": 4.0,
|
||||
"success_indicators": [
|
||||
"Hypotheses clearly stated",
|
||||
"Appropriate test selected and justified",
|
||||
"Assumptions checked (normality, independence, etc.)",
|
||||
"Effect sizes and confidence intervals reported",
|
||||
"Sensitivity analysis performed"
|
||||
]
|
||||
},
|
||||
"Predictive Modeling": {
|
||||
"description": "ML model building and evaluation",
|
||||
"focus_criteria": [
|
||||
"Technical Rigor",
|
||||
"Validation Steps Included",
|
||||
"Reproducibility"
|
||||
],
|
||||
"target_score": 4.0,
|
||||
"success_indicators": [
|
||||
"Train/val/test split before preprocessing (no data leakage)",
|
||||
"Baseline model for comparison",
|
||||
"Cross-validation performed",
|
||||
"Error analysis and feature importance computed",
|
||||
"Model deployment checklist completed"
|
||||
]
|
||||
},
|
||||
"Validation": {
|
||||
"description": "Data/code/model quality checks",
|
||||
"focus_criteria": [
|
||||
"Coverage Completeness",
|
||||
"Validation Steps Included",
|
||||
"Technical Rigor"
|
||||
],
|
||||
"target_score": 4.0,
|
||||
"success_indicators": [
|
||||
"Schema validation (types, ranges, constraints)",
|
||||
"Referential integrity checked",
|
||||
"Edge cases tested",
|
||||
"Monitoring/alerting strategy defined"
|
||||
]
|
||||
}
|
||||
},
|
||||
"common_failure_modes": [
|
||||
{
|
||||
"failure_mode": "Jumping to Implementation Without Scaffold",
|
||||
"symptoms": "User writes code/analysis immediately without planning structure first.",
|
||||
"consequences": "Missing edge cases, poor test coverage, incomplete analysis.",
|
||||
"fix": "Force scaffold creation before implementation. Use template as checklist."
|
||||
},
|
||||
{
|
||||
"failure_mode": "Testing Only Happy Path",
|
||||
"symptoms": "TDD scaffold has tests for expected usage but none for errors/edge cases.",
|
||||
"consequences": "Code breaks in production on unexpected inputs.",
|
||||
"fix": "Require tests for: empty input, None, boundary values, invalid types, expected exceptions."
|
||||
},
|
||||
{
|
||||
"failure_mode": "Skipping Data Quality Checks",
|
||||
"symptoms": "EDA scaffold jumps to visualization without checking missing values, outliers, duplicates.",
|
||||
"consequences": "Invalid conclusions based on dirty data.",
|
||||
"fix": "Mandatory data quality section before any analysis. No exceptions."
|
||||
},
|
||||
{
|
||||
"failure_mode": "Assumptions Not Documented",
|
||||
"symptoms": "Statistical test applied without stating/checking assumptions (normality, independence, etc.).",
|
||||
"consequences": "Invalid statistical inference. Wrong conclusions.",
|
||||
"fix": "Explicit assumption section in scaffold. Check assumptions before applying test."
|
||||
},
|
||||
{
|
||||
"failure_mode": "No Validation Step",
|
||||
"symptoms": "Scaffold delivers results without any quality check or self-assessment.",
|
||||
"consequences": "Low-quality outputs, errors not caught.",
|
||||
"fix": "Mandatory validation step in workflow. Use rubric self-assessment."
|
||||
},
|
||||
{
|
||||
"failure_mode": "Correlation Interpreted as Causation",
|
||||
"symptoms": "EDA finds correlation, claims causal relationship without causal inference methods.",
|
||||
"consequences": "Wrong business decisions based on spurious causality.",
|
||||
"fix": "Distinguish predictive (correlation) from causal questions. Use causal inference methodology if claiming causation."
|
||||
},
|
||||
{
|
||||
"failure_mode": "Data Leakage in ML",
|
||||
"symptoms": "Preprocessing (scaling, imputation) done before train/test split.",
|
||||
"consequences": "Overly optimistic model performance. Fails in production.",
|
||||
"fix": "Scaffold enforces: split first, then preprocess. Fit transformers on train only."
|
||||
},
|
||||
{
|
||||
"failure_mode": "Code Without Tests",
|
||||
"symptoms": "Implementation provided but no test scaffold or test execution.",
|
||||
"consequences": "Regressions not caught, bugs in production.",
|
||||
"fix": "TDD scaffold mandatory for production code. Tests must pass before code review."
|
||||
}
|
||||
],
|
||||
"scale": 5,
|
||||
"minimum_average_score": 3.5,
|
||||
"interpretation": {
|
||||
"1.0-2.0": "Inadequate. Major gaps in structure, coverage, or rigor. Do not use. Revise scaffold.",
|
||||
"2.0-3.0": "Needs improvement. Basic structure present but incomplete or lacks rigor. Acceptable for learning/practice only.",
|
||||
"3.0-3.5": "Acceptable. Covers main cases with adequate rigor. Suitable for routine work or prototypes.",
|
||||
"3.5-4.0": "Good. Comprehensive coverage with good rigor. Suitable for production code/analysis.",
|
||||
"4.0-5.0": "Excellent. Exemplary structure, rigor, and completeness. Production-ready with best practices."
|
||||
}
|
||||
}
# EDA Example: Customer Churn Analysis
|
||||
|
||||
Complete exploratory data analysis for telecom customer churn dataset.
|
||||
|
||||
## Task
|
||||
|
||||
Explore customer churn dataset to understand:
|
||||
- What factors correlate with churn?
|
||||
- Are there data quality issues?
|
||||
- What features should we engineer for predictive model?
|
||||
|
||||
## Dataset
|
||||
|
||||
- **Rows**: 7,043 customers
|
||||
- **Target**: `Churn` (Yes/No)
|
||||
- **Features**: 20 columns (demographics, account info, usage patterns)
|
||||
|
||||
## EDA Scaffold Applied
|
||||
|
||||
### 1. Data Overview
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
|
||||
df = pd.read_csv('telecom_churn.csv')
|
||||
|
||||
print(f"Shape: {df.shape}")
|
||||
# Output: (7043, 21)
|
||||
|
||||
print(f"Columns: {df.columns.tolist()}")
|
||||
# ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
|
||||
# 'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
|
||||
# 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
|
||||
# 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
|
||||
# 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']
|
||||
|
||||
print(df.dtypes)
|
||||
# customerID object
|
||||
# gender object
|
||||
# SeniorCitizen int64
|
||||
# tenure int64
|
||||
# MonthlyCharges float64
|
||||
# TotalCharges object ← Should be numeric!
|
||||
# Churn object
|
||||
|
||||
print(df.head())
|
||||
print(df.describe())
|
||||
```
|
||||
|
||||
**Findings**:
|
||||
- TotalCharges is object type (should be numeric) - needs fixing
|
||||
- Churn is target variable (26.5% churn rate)
|
||||
|
||||
### 2. Data Quality Checks
|
||||
|
||||
```python
|
||||
# Missing values
|
||||
missing = df.isnull().sum()
|
||||
missing_pct = (missing / len(df)) * 100
|
||||
print(missing_pct[missing_pct > 0])
|
||||
# No missing values marked as NaN
|
||||
|
||||
# But TotalCharges is object - check for empty strings
|
||||
print((df['TotalCharges'] == ' ').sum())
|
||||
# Output: 11 rows have space instead of number
|
||||
|
||||
# Fix: Convert TotalCharges to numeric
|
||||
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
|
||||
print(df['TotalCharges'].isnull().sum())
|
||||
# Output: 11 (now properly marked as missing)
|
||||
|
||||
# Strategy: Drop 11 rows (< 0.2% of data)
|
||||
df = df.dropna()
|
||||
|
||||
# Duplicates
|
||||
print(f"Duplicates: {df.duplicated().sum()}")
|
||||
# Output: 0
|
||||
|
||||
# Data consistency checks
|
||||
print("Tenure vs TotalCharges consistency:")
|
||||
print(df[['tenure', 'MonthlyCharges', 'TotalCharges']].head())
|
||||
# tenure=1, Monthly=$29, Total=$29 ✓
|
||||
# tenure=34, Monthly=$57, Total=$1889 ≈ $57*34 ✓
|
||||
```
|
||||
|
||||
**Findings**:
|
||||
- 11 rows (0.16%) with missing TotalCharges - dropped
|
||||
- No duplicates
|
||||
- TotalCharges ≈ MonthlyCharges × tenure (consistent)
|
||||
|
||||
### 3. Univariate Analysis
|
||||
|
||||
```python
|
||||
# Target variable
|
||||
print(df['Churn'].value_counts(normalize=True))
|
||||
# No 73.5%
|
||||
# Yes 26.5%
|
||||
|
||||
# Imbalanced but not severely (>20% minority class is workable)
|
||||
|
||||
# Numeric variables
|
||||
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
|
||||
for col in numeric_cols:
|
||||
print(f"\n{col}:")
|
||||
print(f" Mean: {df[col].mean():.2f}, Median: {df[col].median():.2f}")
|
||||
print(f" Std: {df[col].std():.2f}, Range: [{df[col].min()}, {df[col].max()}]")
|
||||
|
||||
# Histogram
|
||||
df[col].hist(bins=50, edgecolor='black')
|
||||
plt.title(f'{col} Distribution')
|
||||
plt.xlabel(col)
|
||||
plt.show()
|
||||
|
||||
# Check outliers
|
||||
Q1, Q3 = df[col].quantile([0.25, 0.75])
|
||||
IQR = Q3 - Q1
|
||||
outliers = ((df[col] < (Q1 - 1.5*IQR)) | (df[col] > (Q3 + 1.5*IQR))).sum()
|
||||
print(f" Outliers: {outliers} ({outliers/len(df)*100:.1f}%)")
|
||||
```
|
||||
|
||||
**Findings**:
|
||||
- **tenure**: Right-skewed (mean=32, median=29). Many new customers (0-12 months).
|
||||
- **MonthlyCharges**: Bimodal distribution (peaks at ~$20 and ~$80). Suggests customer segments.
|
||||
- **TotalCharges**: Right-skewed (correlated with tenure). Few outliers (2.3%).
|
||||
|
||||
```python
|
||||
# Categorical variables
|
||||
cat_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'Contract', 'PaymentMethod']
|
||||
for col in cat_cols:
|
||||
print(f"\n{col}: {df[col].nunique()} unique values")
|
||||
print(df[col].value_counts())
|
||||
|
||||
# Bar plot
|
||||
df[col].value_counts().plot(kind='bar')
|
||||
plt.title(f'{col} Distribution')
|
||||
plt.xticks(rotation=45)
|
||||
plt.show()
|
||||
```
|
||||
|
||||
**Findings**:
|
||||
- **gender**: Balanced (50/50 male/female)
|
||||
- **SeniorCitizen**: 16% are senior citizens
|
||||
- **Contract**: 55% month-to-month, 24% one-year, 21% two-year
|
||||
- **PaymentMethod**: Electronic check most common (34%)
|
||||
|
||||
### 4. Bivariate Analysis (Churn vs Features)
|
||||
|
||||
```python
|
||||
# Churn rate by categorical variables
|
||||
for col in cat_cols:
|
||||
churn_rate = df.groupby(col)['Churn'].apply(lambda x: (x=='Yes').mean())
|
||||
print(f"\n{col} vs Churn:")
|
||||
print(churn_rate.sort_values(ascending=False))
|
||||
|
||||
# Stacked bar chart
|
||||
pd.crosstab(df[col], df['Churn'], normalize='index').plot(kind='bar', stacked=True)
|
||||
plt.title(f'Churn Rate by {col}')
|
||||
plt.ylabel('Proportion')
|
||||
plt.show()
|
||||
```
|
||||
|
||||
**Key Findings**:
|
||||
- **Contract**: Month-to-month churn=42.7%, One-year=11.3%, Two-year=2.8% (Strong signal!)
|
||||
- **SeniorCitizen**: Seniors churn=41.7%, Non-seniors=23.6%
|
||||
- **PaymentMethod**: Electronic check=45.3% churn, others~15-18%
|
||||
- **tenure**: Customers with tenure<12 months churn=47.5%, >60 months=7.9%
|
||||
|
||||
```python
|
||||
# Numeric variables vs Churn
|
||||
for col in numeric_cols:
|
||||
plt.figure(figsize=(10, 4))
|
||||
|
||||
# Box plot
|
||||
plt.subplot(1, 2, 1)
|
||||
df.boxplot(column=col, by='Churn', ax=plt.gca())  # draw into the current subplot instead of a new figure
|
||||
plt.title(f'{col} by Churn')
|
||||
|
||||
# Histogram (overlay)
|
||||
plt.subplot(1, 2, 2)
|
||||
df[df['Churn']=='No'][col].hist(bins=30, alpha=0.5, label='No Churn', density=True)
|
||||
df[df['Churn']=='Yes'][col].hist(bins=30, alpha=0.5, label='Churn', density=True)
|
||||
plt.legend()
|
||||
plt.xlabel(col)
|
||||
plt.title(f'{col} Distribution by Churn')
|
||||
plt.show()
|
||||
```
|
||||
|
||||
**Key Findings**:
|
||||
- **tenure**: Churned customers have lower tenure (mean=18 vs 38 months)
|
||||
- **MonthlyCharges**: Churned customers pay MORE ($74 vs $61/month)
|
||||
- **TotalCharges**: Churned customers have lower total (correlated with tenure)
|
||||
|
||||
```python
|
||||
# Correlation matrix
|
||||
numeric_df = df[['tenure', 'MonthlyCharges', 'TotalCharges', 'SeniorCitizen']].copy()
|
||||
numeric_df['Churn_binary'] = (df['Churn'] == 'Yes').astype(int)
|
||||
|
||||
corr = numeric_df.corr()
|
||||
plt.figure(figsize=(8, 6))
|
||||
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
|
||||
plt.title('Correlation Matrix')
|
||||
plt.show()
|
||||
```
|
||||
|
||||
**Key Findings**:
|
||||
- tenure ↔ TotalCharges: 0.83 (strong positive correlation - expected)
|
||||
- Churn ↔ tenure: -0.35 (negative: longer tenure → less churn)
|
||||
- Churn ↔ MonthlyCharges: +0.19 (positive: higher charges → more churn)
|
||||
- Churn ↔ TotalCharges: -0.20 (negative: driven by tenure)
|
||||
|
||||
### 5. Insights & Recommendations
|
||||
|
||||
```python
|
||||
print("\n=== KEY FINDINGS ===")
|
||||
print("1. Data Quality:")
|
||||
print(" - 11 rows (<0.2%) dropped due to missing TotalCharges")
|
||||
print(" - No other quality issues. Data is clean.")
|
||||
print("")
|
||||
print("2. Churn Patterns:")
|
||||
print(" - Overall churn rate: 26.5% (slightly imbalanced)")
|
||||
print(" - Strongest predictor: Contract type (month-to-month 42.7% vs two-year 2.8%)")
|
||||
print(" - High-risk segment: New customers (<12mo tenure) with high monthly charges")
|
||||
print(" - Low churn: Long-term customers (>60mo) on two-year contracts")
|
||||
print("")
|
||||
print("3. Feature Importance:")
|
||||
print(" - **High signal**: Contract, tenure, PaymentMethod, SeniorCitizen")
|
||||
print(" - **Medium signal**: MonthlyCharges, InternetService")
|
||||
print(" - **Low signal**: gender, PhoneService (balanced across churn/no-churn)")
|
||||
print("")
|
||||
print("\n=== RECOMMENDED ACTIONS ===")
|
||||
print("1. Feature Engineering:")
|
||||
print(" - Create 'tenure_bucket' (0-12mo, 12-24mo, 24-60mo, >60mo)")
|
||||
print(" - Create 'high_charges' flag (MonthlyCharges > $70)")
|
||||
print(" - Interaction: tenure × Contract (captures switching cost)")
|
||||
print(" - Payment risk score (Electronic check is risky)")
|
||||
print("")
|
||||
print("2. Model Strategy:")
|
||||
print(" - Use all categorical features (one-hot encode)")
|
||||
print(" - Baseline: Predict churn for month-to-month + new customers")
|
||||
print(" - Advanced: Random Forest or Gradient Boosting (handle interactions)")
|
||||
print(" - Validate with stratified 5-fold CV (preserve 26.5% churn rate)")
|
||||
print("")
|
||||
print("3. Business Insights:")
|
||||
print(" - **Retention program**: Target month-to-month customers < 12mo tenure")
|
||||
print(" - **Contract incentives**: Offer discounts for one/two-year contracts")
|
||||
print(" - **Payment method**: Encourage auto-pay (reduce electronic check)")
|
||||
print(" - **Early warning**: Monitor customers with high MonthlyCharges + short tenure")
|
||||
```
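The feature engineering recommendations above can be sketched directly against the cleaned `df`; the bucket edges and category strings below are assumptions taken from the findings and should be checked against the actual data:

```python
# Hedged sketch of the recommended engineered features.
df['tenure_bucket'] = pd.cut(df['tenure'], bins=[0, 12, 24, 60, df['tenure'].max()],
                             labels=['0-12mo', '12-24mo', '24-60mo', '>60mo'],
                             include_lowest=True)
df['high_charges'] = (df['MonthlyCharges'] > 70).astype(int)
# Interaction capturing "new customer on a flexible contract" (tenure x Contract)
df['new_month_to_month'] = ((df['tenure'] < 12) &
                            (df['Contract'] == 'Month-to-month')).astype(int)
# Simple payment risk flag (electronic check showed the highest churn rate)
df['risky_payment'] = (df['PaymentMethod'] == 'Electronic check').astype(int)
```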
|
||||
|
||||
### 6. Self-Assessment
|
||||
|
||||
Using rubric:
|
||||
|
||||
- **Clarity** (5/5): Systematic exploration, clear findings at each stage
|
||||
- **Completeness** (5/5): Data quality, univariate, bivariate, insights all covered
|
||||
- **Rigor** (5/5): Proper statistical analysis, visualizations, quantified relationships
|
||||
- **Actionability** (5/5): Specific feature engineering and business recommendations
|
||||
|
||||
**Average**: 5.0/5 ✓
|
||||
|
||||
This EDA provides solid foundation for predictive modeling and business action.
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. **Feature engineering**: Implement recommended features
|
||||
2. **Baseline model**: Logistic regression with top 5 features (see the sketch after this list)
|
||||
3. **Advanced models**: Random Forest, XGBoost with feature interactions
|
||||
4. **Evaluation**: F1-score, precision/recall curves, AUC-ROC
|
||||
5. **Deployment**: Real-time churn scoring API
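A minimal sketch of steps 2 and 4, assuming the cleaned `df` from this analysis (the feature list and scoring choices are illustrative, not a tuned model):

```python
# Baseline logistic regression with stratified 5-fold CV.
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

y = (df['Churn'] == 'Yes').astype(int)
X = df[['Contract', 'PaymentMethod', 'SeniorCitizen', 'tenure', 'MonthlyCharges']]

preprocess = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['Contract', 'PaymentMethod']),
    ('num', StandardScaler(), ['SeniorCitizen', 'tenure', 'MonthlyCharges']),
])
baseline = make_pipeline(preprocess, LogisticRegression(max_iter=1000))

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # preserves the ~26.5% churn rate per fold
scores = cross_val_score(baseline, X, y, cv=cv, scoring='f1')
print(f"Baseline F1: {scores.mean():.3f} ± {scores.std():.3f}")
```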
# TDD Example: User Authentication
|
||||
|
||||
Complete TDD example showing test-first development for authentication function.
|
||||
|
||||
## Task
|
||||
|
||||
Build a `validate_login(username, password)` function that:
|
||||
- Returns `True` for valid credentials
|
||||
- Returns `False` for invalid password
|
||||
- Raises `ValueError` for missing username/password
|
||||
- Raises `UserNotFoundError` for nonexistent users
|
||||
- Logs failed attempts
|
||||
|
||||
## Step 1: Write Tests FIRST
|
||||
|
||||
```python
|
||||
# test_auth.py
|
||||
import logging
import pytest
from auth import validate_login, UserNotFoundError
|
||||
|
||||
# HAPPY PATH
|
||||
def test_valid_credentials():
|
||||
"""User with correct password should authenticate"""
|
||||
assert validate_login("alice@example.com", "SecurePass123!") == True
|
||||
|
||||
# EDGE CASES
|
||||
def test_empty_username():
|
||||
"""Empty username should raise ValueError"""
|
||||
with pytest.raises(ValueError, match="Username required"):
|
||||
validate_login("", "password")
|
||||
|
||||
def test_empty_password():
|
||||
"""Empty password should raise ValueError"""
|
||||
with pytest.raises(ValueError, match="Password required"):
|
||||
validate_login("alice@example.com", "")
|
||||
|
||||
def test_none_credentials():
|
||||
"""None values should raise ValueError"""
|
||||
with pytest.raises(ValueError):
|
||||
validate_login(None, None)
|
||||
|
||||
# ERROR CONDITIONS
|
||||
def test_invalid_password():
|
||||
"""Wrong password should return False"""
|
||||
assert validate_login("alice@example.com", "WrongPassword") == False
|
||||
|
||||
def test_nonexistent_user():
|
||||
"""User not in database should raise UserNotFoundError"""
|
||||
with pytest.raises(UserNotFoundError):
|
||||
validate_login("nobody@example.com", "anypassword")
|
||||
|
||||
def test_case_sensitive_password():
|
||||
"""Password check should be case-sensitive"""
|
||||
assert validate_login("alice@example.com", "securepass123!") == False
|
||||
|
||||
# STATE/SIDE EFFECTS
|
||||
def test_failed_attempt_logged(caplog):
|
||||
"""Failed login should be logged"""
|
||||
validate_login("alice@example.com", "WrongPassword")
|
||||
assert "Failed login attempt" in caplog.text
|
||||
assert "alice@example.com" in caplog.text
|
||||
|
||||
def test_successful_login_logged(caplog):
    """Successful login should be logged"""
    caplog.set_level(logging.INFO)  # INFO records are not captured at caplog's default WARNING level
    validate_login("alice@example.com", "SecurePass123!")
    assert "Successful login" in caplog.text
|
||||
|
||||
# INTEGRATION TEST
|
||||
@pytest.fixture
|
||||
def mock_database():
|
||||
"""Mock database with test users"""
|
||||
return {
|
||||
"alice@example.com": {
|
||||
"password_hash": "hashed_SecurePass123!",
|
||||
"salt": "random_salt_123"
|
||||
}
|
||||
}
|
||||
|
||||
def test_database_integration(mock_database, monkeypatch):
|
||||
"""Function should query database correctly"""
|
||||
def mock_get_user(username):
|
||||
return mock_database.get(username)
|
||||
|
||||
monkeypatch.setattr("auth.get_user_from_db", mock_get_user)
|
||||
result = validate_login("alice@example.com", "SecurePass123!")
|
||||
assert result == True
|
||||
```
|
||||
|
||||
## Step 2: Run Tests (They Should FAIL - Red)
|
||||
|
||||
```bash
|
||||
$ pytest test_auth.py
|
||||
FAILED - ModuleNotFoundError: No module named 'auth'
|
||||
```
|
||||
|
||||
## Step 3: Write Minimal Implementation (Green)
|
||||
|
||||
```python
|
||||
# auth.py
|
||||
import logging
|
||||
import hashlib
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class UserNotFoundError(Exception):
|
||||
pass
|
||||
|
||||
def validate_login(username, password):
|
||||
# Input validation
|
||||
if not username:
|
||||
raise ValueError("Username required")
|
||||
if not password:
|
||||
raise ValueError("Password required")
|
||||
|
||||
# Get user from database
|
||||
user = get_user_from_db(username)
|
||||
if user is None:
|
||||
raise UserNotFoundError(f"User {username} not found")
|
||||
|
||||
# Hash password and compare
|
||||
password_hash = hash_password(password, user['salt'])
|
||||
is_valid = (password_hash == user['password_hash'])
|
||||
|
||||
# Log attempt
|
||||
if is_valid:
|
||||
logger.info(f"Successful login for {username}")
|
||||
else:
|
||||
logger.warning(f"Failed login attempt for {username}")
|
||||
|
||||
return is_valid
|
||||
|
||||
def get_user_from_db(username):
|
||||
# Stub - implement database query
|
||||
users = {
|
||||
"alice@example.com": {
|
||||
"password_hash": hash_password("SecurePass123!", "random_salt_123"),
|
||||
"salt": "random_salt_123"
|
||||
}
|
||||
}
|
||||
return users.get(username)
|
||||
|
||||
def hash_password(password, salt):
|
||||
# Simplified - use bcrypt/argon2 in production
|
||||
return hashlib.sha256(f"{password}{salt}".encode()).hexdigest()
|
||||
```
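The `hash_password` stub above is deliberately simplified; a hedged sketch of what the production variant might look like with the `bcrypt` package (function names here are illustrative, not part of the module above):

```python
# Hypothetical production-grade hashing with bcrypt (pip install bcrypt).
# bcrypt embeds its own salt in the stored hash, so no separate salt column is needed.
import bcrypt

def hash_password_bcrypt(password: str) -> bytes:
    return bcrypt.hashpw(password.encode(), bcrypt.gensalt())

def verify_password_bcrypt(password: str, stored_hash: bytes) -> bool:
    return bcrypt.checkpw(password.encode(), stored_hash)
```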
|
||||
|
||||
## Step 4: Run Tests Again (Should PASS - Green)
|
||||
|
||||
```bash
|
||||
$ pytest test_auth.py -v
|
||||
test_valid_credentials PASSED
|
||||
test_empty_username PASSED
|
||||
test_empty_password PASSED
|
||||
test_none_credentials PASSED
|
||||
test_invalid_password PASSED
|
||||
test_nonexistent_user PASSED
|
||||
test_case_sensitive_password PASSED
|
||||
test_failed_attempt_logged PASSED
|
||||
test_successful_login_logged PASSED
|
||||
test_database_integration PASSED
|
||||
|
||||
========== 10 passed in 0.15s ==========
|
||||
```
|
||||
|
||||
## Step 5: Refactor (Keep Tests Green)
|
||||
|
||||
```python
|
||||
# auth.py (refactored for readability)
|
||||
class AuthenticationService:
|
||||
def __init__(self, user_repo, password_hasher):
|
||||
self.user_repo = user_repo
|
||||
self.password_hasher = password_hasher
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
def validate_login(self, username, password):
|
||||
self._validate_inputs(username, password)
|
||||
user = self._get_user(username)
|
||||
is_valid = self._check_password(password, user)
|
||||
self._log_attempt(username, is_valid)
|
||||
return is_valid
|
||||
|
||||
def _validate_inputs(self, username, password):
|
||||
if not username:
|
||||
raise ValueError("Username required")
|
||||
if not password:
|
||||
raise ValueError("Password required")
|
||||
|
||||
def _get_user(self, username):
|
||||
user = self.user_repo.get_by_username(username)
|
||||
if user is None:
|
||||
raise UserNotFoundError(f"User {username} not found")
|
||||
return user
|
||||
|
||||
def _check_password(self, password, user):
|
||||
password_hash = self.password_hasher.hash(password, user.salt)
|
||||
return password_hash == user.password_hash
|
||||
|
||||
def _log_attempt(self, username, is_valid):
|
||||
if is_valid:
|
||||
self.logger.info(f"Successful login for {username}")
|
||||
else:
|
||||
self.logger.warning(f"Failed login attempt for {username}")
|
||||
```
|
||||
|
||||
Tests still pass after refactoring!
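They keep calling `auth.validate_login`, so the module must still expose that function after the refactor. One way to do this, sketched with hypothetical adapter classes over the earlier stubs (none of this is in the original code), is a thin module-level wrapper around a default service instance:

```python
# Hypothetical glue so auth.validate_login(...) still exists after the refactor.
from types import SimpleNamespace

class InMemoryUserRepository:
    """Adapter over the earlier in-memory users dict."""
    def __init__(self):
        self._users = {
            "alice@example.com": SimpleNamespace(
                password_hash=hash_password("SecurePass123!", "random_salt_123"),
                salt="random_salt_123",
            )
        }

    def get_by_username(self, username):
        return self._users.get(username)

class Sha256PasswordHasher:
    """Adapter over the existing hash_password() helper."""
    def hash(self, password, salt):
        return hash_password(password, salt)

_default_service = AuthenticationService(InMemoryUserRepository(), Sha256PasswordHasher())

def validate_login(username, password):
    """Module-level wrapper kept so the original tests stay unchanged."""
    return _default_service.validate_login(username, password)
```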
|
||||
|
||||
## Key Takeaways
|
||||
|
||||
1. **Tests written FIRST** define expected behavior
|
||||
2. **Minimal implementation** to make tests pass
|
||||
3. **Refactor** with confidence (tests catch regressions)
|
||||
4. **Comprehensive coverage**: happy path, edge cases, errors, side effects, integration
|
||||
5. **Fast feedback**: Know immediately if something breaks
|
||||
|
||||
## Self-Assessment
|
||||
|
||||
Using rubric:
|
||||
|
||||
- **Clarity** (5/5): Requirements clearly defined by tests
|
||||
- **Completeness** (5/5): All cases covered (happy, edge, error, integration)
|
||||
- **Rigor** (5/5): TDD cycle followed (Red → Green → Refactor)
|
||||
- **Actionability** (5/5): Tests are executable specification
|
||||
|
||||
**Average**: 5.0/5 ✓
|
||||
|
||||
This is production-ready test-first code.
New file: skills/code-data-analysis-scaffolds/resources/methodology.md (470 lines)
# Code Data Analysis Scaffolds Methodology
|
||||
|
||||
Advanced techniques for causal inference, predictive modeling, property-based testing, and complex data analysis.
|
||||
|
||||
## Workflow
|
||||
|
||||
Copy this checklist and track your progress:
|
||||
|
||||
```
|
||||
Code Data Analysis Scaffolds Progress:
|
||||
- [ ] Step 1: Clarify task and objectives
|
||||
- [ ] Step 2: Choose appropriate scaffold type
|
||||
- [ ] Step 3: Generate scaffold structure
|
||||
- [ ] Step 4: Validate scaffold completeness
|
||||
- [ ] Step 5: Deliver scaffold and guide execution
|
||||
```
|
||||
|
||||
**Step 1: Clarify task** - Assess complexity and determine if advanced techniques needed. See [1. When to Use Advanced Methods](#1-when-to-use-advanced-methods).
|
||||
|
||||
**Step 2: Choose scaffold** - Select from Causal Inference, Predictive Modeling, Property-Based Testing, or Advanced EDA. See specific sections below.
|
||||
|
||||
**Step 3: Generate structure** - Apply advanced scaffold matching task complexity. See [2. Causal Inference Methods](#2-causal-inference-methods), [3. Predictive Modeling Pipeline](#3-predictive-modeling-pipeline), [4. Property-Based Testing](#4-property-based-testing), [5. Advanced EDA Techniques](#5-advanced-eda-techniques).
|
||||
|
||||
**Step 4: Validate** - Check assumptions, sensitivity analysis, robustness checks using [6. Advanced Validation Patterns](#6-advanced-validation-patterns).
|
||||
|
||||
**Step 5: Deliver** - Present with caveats, limitations, and recommendations for further analysis.
|
||||
|
||||
## 1. When to Use Advanced Methods
|
||||
|
||||
| Task Characteristic | Standard Template | Advanced Methodology |
|
||||
|---------------------|-------------------|---------------------|
|
||||
| **Causal question** | "Does X correlate with Y?" | "Does X cause Y?" → Causal inference needed |
|
||||
| **Sample size** | < 1000 rows | > 10K rows with complex patterns |
|
||||
| **Model complexity** | Linear/logistic regression | Ensemble methods, neural nets, feature interactions |
|
||||
| **Test sophistication** | Unit tests, integration tests | Property-based tests, mutation testing, fuzz testing |
|
||||
| **Data complexity** | Clean, tabular data | Multi-modal, high-dimensional, unstructured data |
|
||||
| **Stakes** | Low (exploratory) | High (production ML, regulatory compliance) |
|
||||
|
||||
## 2. Causal Inference Methods
|
||||
|
||||
Use when research question is "Does X **cause** Y?" not just "Are X and Y correlated?"
|
||||
|
||||
### Causal Inference Scaffold
|
||||
|
||||
```python
|
||||
# CAUSAL INFERENCE SCAFFOLD
|
||||
|
||||
# 1. DRAW CAUSAL DAG (Directed Acyclic Graph)
|
||||
# Explicitly model: Treatment → Outcome, Confounders → Treatment & Outcome
|
||||
#
|
||||
# Example:
|
||||
# Education → Income
|
||||
# ↑ ↑
|
||||
# Family Background
|
||||
#
|
||||
# Treatment: Education
|
||||
# Outcome: Income
|
||||
# Confounder: Family Background (affects both education and income)
|
||||
|
||||
# 2. IDENTIFY CONFOUNDERS
|
||||
confounders = ['age', 'gender', 'family_income', 'region']
|
||||
# These variables affect BOTH treatment and outcome
|
||||
# If not controlled, they bias causal estimate
|
||||
|
||||
# 3. CHECK IDENTIFICATION ASSUMPTIONS
|
||||
# For causal effect to be identifiable:
|
||||
# a) No unmeasured confounders (all variables in DAG observed)
|
||||
# b) Treatment assignment as-if random conditional on confounders
|
||||
# c) Positivity: Every unit has nonzero probability of treatment/control
|
||||
|
||||
# 4. CHOOSE IDENTIFICATION STRATEGY
|
||||
|
||||
# Option A: RCT - Random assignment eliminates confounding. Check balance on confounders.
|
||||
from scipy import stats
|
||||
for var in confounders:
|
||||
_, p = stats.ttest_ind(treatment_group[var], control_group[var])
|
||||
print(f"{var}: {'✓' if p > 0.05 else '✗'} balanced")
|
||||
|
||||
# Option B: Regression - Control for confounders. Assumes no unmeasured confounding.
|
||||
import statsmodels.formula.api as smf
|
||||
model = smf.ols('outcome ~ treatment + age + gender + family_income', data=df).fit()
|
||||
treatment_effect = model.params['treatment']
|
||||
|
||||
# Option C: Propensity Score Matching - Match treated to similar controls on P(treatment|X).
|
||||
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
|
||||
ps_model = LogisticRegression().fit(df[confounders], df['treatment'])
|
||||
df['ps'] = ps_model.predict_proba(df[confounders])[:,1]
|
||||
treated, control = df[df['treatment']==1], df[df['treatment']==0]
|
||||
nn = NearestNeighbors(n_neighbors=1).fit(control[['ps']])
|
||||
_, indices = nn.kneighbors(treated[['ps']])
|
||||
treatment_effect = treated['outcome'].mean() - control.iloc[indices.flatten()]['outcome'].mean()
|
||||
|
||||
# Option D: IV - Need instrument Z: affects treatment, not outcome (except through treatment).
|
||||
from statsmodels.sandbox.regression.gmm import IV2SLS
|
||||
iv_model = IV2SLS(df['income'], df[['education'] + confounders], df[['instrument'] + confounders]).fit()
|
||||
|
||||
# Option E: RDD - Treatment assigned at cutoff. Compare units just above/below threshold.
|
||||
df['above_cutoff'] = (df['running_var'] >= cutoff).astype(int)
|
||||
# Use local linear regression around cutoff to estimate effect
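# Hedged sketch (not part of the original scaffold): local linear RDD estimate.
# Assumes 'running_var' and 'outcome' columns and a chosen bandwidth; smf was imported for Option B.
bandwidth = 5.0  # assumption - pick via cross-validation or a rule of thumb (e.g. Imbens-Kalyanaraman)
local = df[abs(df['running_var'] - cutoff) <= bandwidth].copy()
local['centered'] = local['running_var'] - cutoff
rdd_fit = smf.ols('outcome ~ above_cutoff + centered + above_cutoff:centered', data=local).fit()
treatment_effect = rdd_fit.params['above_cutoff']  # jump in outcome at the cutoff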
|
||||
|
||||
# Option F: DiD - Compare treatment vs control, before vs after. Assumes parallel trends.
|
||||
t_before, t_after = df[(df['group']=='T') & (df['time']=='before')]['y'].mean(), df[(df['group']=='T') & (df['time']=='after')]['y'].mean()
|
||||
c_before, c_after = df[(df['group']=='C') & (df['time']=='before')]['y'].mean(), df[(df['group']=='C') & (df['time']=='after')]['y'].mean()
|
||||
did_estimate = (t_after - t_before) - (c_after - c_before)
|
||||
|
||||
# 5. SENSITIVITY ANALYSIS
|
||||
print("\n=== SENSITIVITY CHECKS ===")
|
||||
print("1. Unmeasured confounding: How strong would confounder need to be to change conclusion?")
|
||||
print("2. Placebo tests: Check for effect in period before treatment (should be zero)")
|
||||
print("3. Falsification tests: Check for effect on outcome that shouldn't be affected")
|
||||
print("4. Robustness: Try different model specifications, subsamples, bandwidths (RDD)")
|
||||
|
||||
# 6. REPORT CAUSAL ESTIMATE WITH UNCERTAINTY
|
||||
print(f"\nCausal Effect: {treatment_effect:.3f}")
|
||||
print(f"95% CI: [{ci_lower:.3f}, {ci_upper:.3f}]")
|
||||
print(f"Interpretation: Treatment X causes {treatment_effect:.1%} change in outcome Y")
|
||||
print(f"Assumptions: [List key identifying assumptions]")
|
||||
print(f"Limitations: [Threats to validity]")
|
||||
```
|
||||
|
||||
### Causal Inference Checklist
|
||||
|
||||
- [ ] **Causal question clearly stated**: "Does X cause Y?" not "Are X and Y related?"
|
||||
- [ ] **DAG drawn**: Treatment, outcome, confounders, mediators identified
|
||||
- [ ] **Identification strategy chosen**: RCT, regression, PS matching, IV, RDD, DiD
|
||||
- [ ] **Assumptions checked**: No unmeasured confounding, positivity, parallel trends (DiD), etc.
|
||||
- [ ] **Sensitivity analysis**: Test robustness to violations of assumptions (see the placebo-test sketch after this checklist)
|
||||
- [ ] **Limitations acknowledged**: Threats to internal/external validity stated
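As a concrete form of the sensitivity-analysis item, a placebo (falsification) test re-estimates the "effect" on an outcome measured before treatment, where it should be near zero. A minimal sketch, reusing the scaffold's `df` and `confounders` and assuming a pre-treatment column `outcome_pre`:

```python
import statsmodels.formula.api as smf

# Placebo / falsification check: the estimated "effect" on a pre-treatment outcome should be ~0.
placebo = smf.ols('outcome_pre ~ treatment + ' + ' + '.join(confounders), data=df).fit()
print(f"Placebo effect: {placebo.params['treatment']:.3f} (p={placebo.pvalues['treatment']:.3f})")
```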
|
||||
|
||||
## 3. Predictive Modeling Pipeline
|
||||
|
||||
Use for forecasting, classification, regression - when goal is prediction not causal understanding.
|
||||
|
||||
### Predictive Modeling Scaffold
|
||||
|
||||
```python
|
||||
# PREDICTIVE MODELING SCAFFOLD
|
||||
|
||||
# 1. DEFINE PREDICTION TASK & METRIC
|
||||
task = "Predict customer churn (binary classification)"
|
||||
primary_metric = "F1-score" # Balance precision/recall
|
||||
secondary_metrics = ["AUC-ROC", "precision", "recall", "accuracy"]
|
||||
|
||||
# 2. TRAIN/VAL/TEST SPLIT (before any preprocessing!)
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
# Split: 60% train, 20% validation, 20% test
|
||||
train_val, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['target'])
|
||||
train, val = train_test_split(train_val, test_size=0.25, random_state=42, stratify=train_val['target'])
|
||||
|
||||
print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")
|
||||
print(f"Class balance - Train: {train['target'].mean():.2%}, Test: {test['target'].mean():.2%}")
|
||||
|
||||
# 3. FEATURE ENGINEERING (fit on train, transform train/val/test)
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.impute import SimpleImputer
|
||||
|
||||
# Numeric features: impute missing, standardize
|
||||
numeric_features = ['age', 'income', 'tenure']
|
||||
num_imputer = SimpleImputer(strategy='median').fit(train[numeric_features])
|
||||
num_scaler = StandardScaler().fit(num_imputer.transform(train[numeric_features]))
|
||||
|
||||
X_train_num = num_scaler.transform(num_imputer.transform(train[numeric_features]))
|
||||
X_val_num = num_scaler.transform(num_imputer.transform(val[numeric_features]))
|
||||
X_test_num = num_scaler.transform(num_imputer.transform(test[numeric_features]))
|
||||
|
||||
# Categorical features: one-hot encode
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
cat_features = ['region', 'product_type']
|
||||
cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(train[cat_features])  # use sparse=False on scikit-learn < 1.2
|
||||
|
||||
X_train_cat = cat_encoder.transform(train[cat_features])
|
||||
X_val_cat = cat_encoder.transform(val[cat_features])
|
||||
X_test_cat = cat_encoder.transform(test[cat_features])
|
||||
|
||||
# Combine features
|
||||
import numpy as np
|
||||
X_train = np.hstack([X_train_num, X_train_cat])
|
||||
X_val = np.hstack([X_val_num, X_val_cat])
|
||||
X_test = np.hstack([X_test_num, X_test_cat])
|
||||
y_train, y_val, y_test = train['target'], val['target'], test['target']
|
||||
|
||||
# 4. BASELINE MODEL (always start simple!)
|
||||
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score  # needed here, before the fuller metrics import below

baseline = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
baseline_f1 = f1_score(y_val, baseline.predict(X_val))
|
||||
print(f"Baseline F1: {baseline_f1:.3f}")
|
||||
|
||||
# 5. MODEL SELECTION & HYPERPARAMETER TUNING
|
||||
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
|
||||
|
||||
# Try multiple models
|
||||
models = {
|
||||
'Logistic Regression': LogisticRegression(max_iter=1000),
|
||||
'Random Forest': RandomForestClassifier(random_state=42),
|
||||
'Gradient Boosting': GradientBoostingClassifier(random_state=42)
|
||||
}
|
||||
|
||||
results = {}
|
||||
for name, model in models.items():
|
||||
model.fit(X_train, y_train)
|
||||
y_pred = model.predict(X_val)
|
||||
y_proba = model.predict_proba(X_val)[:,1]
|
||||
|
||||
results[name] = {
|
||||
'F1': f1_score(y_val, y_pred),
|
||||
'AUC': roc_auc_score(y_val, y_proba),
|
||||
'Precision': precision_score(y_val, y_pred),
|
||||
'Recall': recall_score(y_val, y_pred)
|
||||
}
|
||||
print(f"{name}: F1={results[name]['F1']:.3f}, AUC={results[name]['AUC']:.3f}")
|
||||
|
||||
# Select best model (highest F1 on validation)
|
||||
best_model_name = max(results, key=lambda x: results[x]['F1'])
|
||||
best_model = models[best_model_name]
|
||||
print(f"\nBest model: {best_model_name}")
|
||||
|
||||
# Hyperparameter tuning on best model
|
||||
if best_model_name == 'Random Forest':
|
||||
param_grid = {
|
||||
'n_estimators': [100, 200, 300],
|
||||
'max_depth': [10, 20, None],
|
||||
'min_samples_split': [2, 5, 10]
|
||||
}
|
||||
grid_search = GridSearchCV(best_model, param_grid, cv=5, scoring='f1', n_jobs=-1)
|
||||
grid_search.fit(X_train, y_train)
|
||||
best_model = grid_search.best_estimator_
|
||||
print(f"Best params: {grid_search.best_params_}")
|
||||
|
||||
# 6. CROSS-VALIDATION (check for overfitting)
|
||||
from sklearn.model_selection import cross_val_score
|
||||
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='f1')
|
||||
print(f"CV F1 scores: {cv_scores}")
|
||||
print(f"Mean: {cv_scores.mean():.3f}, Std: {cv_scores.std():.3f}")
|
||||
|
||||
# 7. FINAL EVALUATION ON TEST SET (only once!)
|
||||
y_test_pred = best_model.predict(X_test)
|
||||
y_test_proba = best_model.predict_proba(X_test)[:,1]
|
||||
|
||||
test_f1 = f1_score(y_test, y_test_pred)
|
||||
test_auc = roc_auc_score(y_test, y_test_proba)
|
||||
print(f"\n=== FINAL TEST PERFORMANCE ===")
|
||||
print(f"F1: {test_f1:.3f}, AUC: {test_auc:.3f}")
|
||||
|
||||
# 8. ERROR ANALYSIS
|
||||
from sklearn.metrics import confusion_matrix, classification_report
|
||||
print("\nConfusion Matrix:")
|
||||
print(confusion_matrix(y_test, y_test_pred))
|
||||
print("\nClassification Report:")
|
||||
print(classification_report(y_test, y_test_pred))
|
||||
|
||||
# Analyze misclassifications
|
||||
test_df = test.copy()
|
||||
test_df['prediction'] = y_test_pred
|
||||
test_df['prediction_proba'] = y_test_proba
|
||||
false_positives = test_df[(test_df['target']==0) & (test_df['prediction']==1)]
|
||||
false_negatives = test_df[(test_df['target']==1) & (test_df['prediction']==0)]
|
||||
print(f"False Positives: {len(false_positives)}")
|
||||
print(f"False Negatives: {len(false_negatives)}")
|
||||
# Inspect these cases to understand failure modes
|
||||
|
||||
# 9. FEATURE IMPORTANCE
import pandas as pd  # needed for the importance table below

if hasattr(best_model, 'feature_importances_'):
    feature_names = numeric_features + list(cat_encoder.get_feature_names_out(cat_features))
    importances = pd.DataFrame({
        'feature': feature_names,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    print("\nTop 10 Features:")
    print(importances.head(10))
|
||||
|
||||
# 10. MODEL DEPLOYMENT CHECKLIST
|
||||
print("\n=== DEPLOYMENT READINESS ===")
|
||||
print(f"✓ Test F1 ({test_f1:.3f}) > Baseline ({baseline_f1:.3f})")
|
||||
print(f"✓ Cross-validation shows consistent performance (CV std={cv_scores.std():.3f})")
|
||||
print("✓ Error analysis completed, failure modes understood")
|
||||
print("✓ Feature importance computed, no surprising features")
|
||||
print("□ Model serialized and saved")
|
||||
print("□ Monitoring plan in place (track drift in input features, output distribution)")
|
||||
print("□ Rollback plan if model underperforms in production")
|
||||
```
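For the unchecked "Model serialized and saved" item in the checklist above, a minimal sketch with joblib (the file name is illustrative):

```python
import joblib

# Persist the tuned model together with the fitted preprocessing objects it depends on.
joblib.dump(
    {'model': best_model, 'imputer': num_imputer, 'scaler': num_scaler, 'encoder': cat_encoder},
    'churn_model_v1.joblib',
)

# Later, in the serving code:
artifacts = joblib.load('churn_model_v1.joblib')
loaded_model = artifacts['model']
```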
|
||||
|
||||
### Predictive Modeling Checklist
|
||||
|
||||
- [ ] **Clear prediction task**: Classification, regression, time series forecasting
|
||||
- [ ] **Appropriate metrics**: Match business objectives (precision vs recall tradeoff, etc.)
|
||||
- [ ] **Train/val/test split**: Before any preprocessing (no data leakage)
|
||||
- [ ] **Baseline model**: Simple model for comparison
|
||||
- [ ] **Feature engineering**: Proper handling of missing values, scaling, encoding
|
||||
- [ ] **Cross-validation**: k-fold CV to check for overfitting
|
||||
- [ ] **Model selection**: Compare multiple model types
|
||||
- [ ] **Hyperparameter tuning**: Grid/random search on validation set
|
||||
- [ ] **Error analysis**: Understand failure modes, inspect misclassifications
|
||||
- [ ] **Test set evaluation**: Final performance check (only once!)
|
||||
- [ ] **Deployment readiness**: Monitoring, rollback plan, model versioning
|
||||
|
||||
## 4. Property-Based Testing
|
||||
|
||||
Use for testing complex logic, data transformations, invariants. Goes beyond example-based tests.
|
||||
|
||||
### Property-Based Testing Scaffold
|
||||
|
||||
```python
|
||||
# PROPERTY-BASED TESTING SCAFFOLD
|
||||
from hypothesis import given, strategies as st
|
||||
import pytest
|
||||
|
||||
# Example: Testing a sort function
|
||||
def my_sort(lst):
|
||||
return sorted(lst)
|
||||
|
||||
# Property 1: Output length equals input length
|
||||
@given(st.lists(st.integers()))
|
||||
def test_sort_preserves_length(lst):
|
||||
assert len(my_sort(lst)) == len(lst)
|
||||
|
||||
# Property 2: Output is sorted (each element <= next element)
|
||||
@given(st.lists(st.integers()))
|
||||
def test_sort_is_sorted(lst):
|
||||
result = my_sort(lst)
|
||||
for i in range(len(result) - 1):
|
||||
assert result[i] <= result[i+1]
|
||||
|
||||
# Property 3: Output contains same elements as input (multiset equality)
|
||||
@given(st.lists(st.integers()))
|
||||
def test_sort_preserves_elements(lst):
|
||||
result = my_sort(lst)
|
||||
assert sorted(lst) == sorted(result) # Canonical form comparison
|
||||
|
||||
# Property 4: Idempotence (sorting twice = sorting once)
|
||||
@given(st.lists(st.integers()))
|
||||
def test_sort_is_idempotent(lst):
|
||||
result = my_sort(lst)
|
||||
assert my_sort(result) == result
|
||||
|
||||
# Property 5: Empty input → empty output
|
||||
def test_sort_empty_list():
|
||||
assert my_sort([]) == []
|
||||
|
||||
# Property 6: Single element → unchanged
|
||||
@given(st.integers())
|
||||
def test_sort_single_element(x):
|
||||
assert my_sort([x]) == [x]
|
||||
```
|
||||
|
||||
### Property-Based Testing Strategies
|
||||
|
||||
**For data transformations:**
|
||||
- Idempotence: `f(f(x)) == f(x)`
|
||||
- Round-trip: `decode(encode(x)) == x` (see the sketch after these lists)
|
||||
- Commutativity: `f(g(x)) == g(f(x))`
|
||||
- Invariants: Properties that never change (e.g., sum after transformation)
|
||||
|
||||
**For numeric functions:**
|
||||
- Boundary conditions: Zero, negative, very large numbers
|
||||
- Inverse relationships: `f(f_inverse(x)) ≈ x`
|
||||
- Known identities: `sin²(x) + cos²(x) = 1`
|
||||
|
||||
**For string/list operations:**
|
||||
- Length preservation or predictable change
|
||||
- Character/element preservation
|
||||
- Order properties (sorted, reversed)
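As a worked example of the round-trip property above, a hedged sketch using Hypothesis against Python's built-in `json` module:

```python
import json
from hypothesis import given, strategies as st

# JSON-representable values: primitives, plus nested lists and string-keyed dicts.
json_values = st.recursive(
    st.none() | st.booleans() | st.integers() | st.floats(allow_nan=False) | st.text(),
    lambda children: st.lists(children) | st.dictionaries(st.text(), children),
    max_leaves=10,
)

@given(json_values)
def test_json_round_trip(value):
    # Round-trip property: decode(encode(x)) == x
    assert json.loads(json.dumps(value)) == value
```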
|
||||
|
||||
## 5. Advanced EDA Techniques
|
||||
|
||||
For high-dimensional, multi-modal, or complex data.
|
||||
|
||||
### Dimensionality Reduction
|
||||
|
||||
```python
|
||||
# PCA: Linear dimensionality reduction (assumes a scaled feature matrix X_scaled and labels y)
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
|
||||
pca = PCA(n_components=2)
|
||||
X_pca = pca.fit_transform(X_scaled)
|
||||
print(f"Explained variance: {pca.explained_variance_ratio_}")
|
||||
|
||||
# t-SNE: Non-linear, good for visualization
|
||||
from sklearn.manifold import TSNE
|
||||
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
|
||||
X_tsne = tsne.fit_transform(X_scaled)
|
||||
plt.scatter(X_tsne[:,0], X_tsne[:,1], c=y, cmap='viridis'); plt.show()
|
||||
|
||||
# UMAP: Faster alternative to t-SNE, preserves global structure
|
||||
# pip install umap-learn
|
||||
import umap
|
||||
reducer = umap.UMAP(n_components=2, random_state=42)
|
||||
X_umap = reducer.fit_transform(X_scaled)
|
||||
```
|
||||
|
||||
### Cluster Analysis
|
||||
|
||||
```python
|
||||
from sklearn.cluster import KMeans, DBSCAN
|
||||
from sklearn.metrics import silhouette_score
|
||||
|
||||
# Elbow method: Find optimal K
|
||||
inertias = []
|
||||
for k in range(2, 11):
|
||||
kmeans = KMeans(n_clusters=k, random_state=42)
|
||||
kmeans.fit(X_scaled)
|
||||
inertias.append(kmeans.inertia_)
|
||||
plt.plot(range(2, 11), inertias); plt.xlabel('K'); plt.ylabel('Inertia'); plt.show()
|
||||
|
||||
# Silhouette score: Measure cluster quality
|
||||
for k in range(2, 11):
|
||||
kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
|
||||
score = silhouette_score(X_scaled, kmeans.labels_)
|
||||
print(f"K={k}: Silhouette={score:.3f}")
|
||||
|
||||
# DBSCAN: Density-based clustering (finds arbitrary shapes)
|
||||
dbscan = DBSCAN(eps=0.5, min_samples=5)
|
||||
clusters = dbscan.fit_predict(X_scaled)
|
||||
print(f"Clusters found: {len(set(clusters)) - (1 if -1 in clusters else 0)}")
|
||||
print(f"Noise points: {(clusters == -1).sum()}")
|
||||
```
|
||||
|
||||
## 6. Advanced Validation Patterns
|
||||
|
||||
### Mutation Testing
|
||||
|
||||
Tests the quality of your tests by introducing bugs and checking if tests catch them.
|
||||
|
||||
```python
|
||||
# Install: pip install mutmut
|
||||
# Run: mutmut run --paths-to-mutate=src/
|
||||
# Check: mutmut results
|
||||
# Survivors (mutations not caught) indicate weak tests
|
||||
```
|
||||
|
||||
### Fuzz Testing
|
||||
|
||||
Generate random/malformed inputs to find edge cases.
|
||||
|
||||
```python
|
||||
from hypothesis import given, strategies as st
|
||||
|
||||
@given(st.text())
|
||||
def test_function_doesnt_crash_on_any_string(s):
|
||||
result = my_function(s) # Should never raise exception
|
||||
assert result is not None
|
||||
```
|
||||
|
||||
### Data Validation Framework (Great Expectations)
|
||||
|
||||
```python
|
||||
import great_expectations as gx
|
||||
|
||||
# Define expectations
|
||||
expectation_suite = gx.ExpectationSuite(name="my_data_suite")
|
||||
expectation_suite.add_expectation(gx.expectations.ExpectColumnToExist(column="user_id"))
|
||||
expectation_suite.add_expectation(gx.expectations.ExpectColumnValuesToNotBeNull(column="user_id"))
|
||||
expectation_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column="age", min_value=0, max_value=120))
|
||||
|
||||
# Validate data (assumes a Data Context and batch_request configured per the GX docs)
results = context.run_validation(batch_request, expectation_suite)
|
||||
print(results["success"]) # True if all expectations met
|
||||
```
|
||||
|
||||
## 7. When to Use Each Method
|
||||
|
||||
| Research Goal | Method | Key Consideration |
|
||||
|---------------|--------|-------------------|
|
||||
| Causal effect estimation | RCT, IV, RDD, DiD | Identify confounders, check assumptions |
|
||||
| Prediction/forecasting | Supervised ML | Avoid data leakage, validate out-of-sample |
|
||||
| Pattern discovery | Clustering, PCA, t-SNE | Dimensionality reduction first if high-D |
|
||||
| Complex logic testing | Property-based testing | Define invariants that must hold |
|
||||
| Data quality | Great Expectations | Automate checks in pipelines |
New file: skills/code-data-analysis-scaffolds/resources/template.md (391 lines)
# Code Data Analysis Scaffolds Template
|
||||
|
||||
## Workflow
|
||||
|
||||
Copy this checklist and track your progress:
|
||||
|
||||
```
|
||||
Code Data Analysis Scaffolds Progress:
|
||||
- [ ] Step 1: Clarify task and objectives
|
||||
- [ ] Step 2: Choose appropriate scaffold type
|
||||
- [ ] Step 3: Generate scaffold structure
|
||||
- [ ] Step 4: Validate scaffold completeness
|
||||
- [ ] Step 5: Deliver scaffold and guide execution
|
||||
```
|
||||
|
||||
**Step 1: Clarify task** - Ask context questions to understand task type, constraints, and expected outcomes. See [Context Questions](#context-questions).

**Step 2: Choose scaffold** - Select TDD, EDA, Statistical Analysis, or Validation based on the task. See [Scaffold Selection Guide](#scaffold-selection-guide).

**Step 3: Generate structure** - Use the appropriate scaffold template. See [TDD Scaffold](#tdd-scaffold), [EDA Scaffold](#eda-scaffold), [Statistical Analysis Scaffold](#statistical-analysis-scaffold), or [Validation Scaffold](#validation-scaffold).

**Step 4: Validate completeness** - Check that the scaffold covers requirements, includes validation steps, and makes assumptions explicit. See [Quality Checklist](#quality-checklist).

**Step 5: Deliver and guide** - Present the scaffold, highlight next steps, and surface any gaps discovered. Execute if the user wants help.

## Context Questions

**For all tasks:**
- What are you trying to accomplish? (Specific outcome expected)
- What's the context? (Dataset characteristics, codebase state, existing work)
- Any constraints? (Time, tools, data limitations, performance requirements)
- What does success look like? (Acceptance criteria, quality bar)

**For TDD tasks:**
- What functionality needs tests? (Feature, bug fix, refactor)
- Existing test coverage? (None, partial, comprehensive)
- Test framework preference? (pytest, jest, junit, etc.)
- Integration vs. unit tests? (Scope of testing)

**For EDA tasks:**
- What's the dataset? (Size, format, source)
- What questions are you trying to answer? (Exploratory vs. hypothesis-driven)
- Existing knowledge about the data? (Schema, distributions, known issues)
- End goal? (Feature engineering, quality assessment, insights)

**For Statistical/Modeling tasks:**
- What's the research question? (Descriptive, predictive, causal)
- Available data? (Sample size, variables, treatment/control)
- Causal or predictive goal? (Understanding why vs. forecasting what)
- Significance level / acceptable error rate?

## Scaffold Selection Guide

| User Says | Task Type | Scaffold to Use |
|-----------|-----------|-----------------|
| "Write tests for..." | TDD | [TDD Scaffold](#tdd-scaffold) |
| "Explore this dataset..." | EDA | [EDA Scaffold](#eda-scaffold) |
| "Analyze the effect of..." / "Does X cause Y?" | Causal Inference | See methodology.md |
| "Predict..." / "Classify..." / "Forecast..." | Predictive Modeling | See methodology.md |
| "Design an A/B test..." / "Compare groups..." | Statistical Analysis | [Statistical Analysis Scaffold](#statistical-analysis-scaffold) |
| "Validate..." / "Check quality..." | Validation | [Validation Scaffold](#validation-scaffold) |

## TDD Scaffold

Use when writing new code, refactoring, or fixing bugs. **Write tests FIRST, then implement.**

### Quick Template

```python
# Test file: test_[module].py
import pytest
from unittest.mock import Mock  # needed for the integration-test fixture below
from [module] import [function_to_test]

# 1. HAPPY PATH TESTS (expected usage)
def test_[function]_with_valid_input():
    """Test normal, expected behavior"""
    result = [function](valid_input)
    assert result == expected_output
    assert result.property == expected_value

# 2. EDGE CASE TESTS (boundary conditions)
def test_[function]_with_empty_input():
    """Test with empty/minimal input"""
    result = [function]([])
    assert result == expected_for_empty

def test_[function]_with_maximum_input():
    """Test with large/maximum input"""
    result = [function](large_input)
    assert result is not None

# 3. ERROR CONDITION TESTS (invalid input, expected failures)
def test_[function]_with_invalid_input():
    """Test proper error handling"""
    with pytest.raises(ValueError):
        [function](invalid_input)

def test_[function]_with_none_input():
    """Test None handling"""
    with pytest.raises(TypeError):
        [function](None)

# 4. STATE TESTS (if the function modifies state)
def test_[function]_modifies_state_correctly():
    """Test side effects are correct"""
    obj = Object()
    obj.[function](param)
    assert obj.state == expected_state

# 5. INTEGRATION TESTS (if interacting with external systems)
@pytest.fixture
def mock_external_service():
    """Mock external dependencies"""
    return Mock(spec=ExternalService)

def test_[function]_with_external_service(mock_external_service):
    """Test integration points"""
    result = [function](mock_external_service)
    mock_external_service.method.assert_called_once()
    assert result == expected_from_integration
```

### Test Data Setup

```python
# conftest.py or test fixtures
import pytest

@pytest.fixture
def sample_data():
    """Reusable test data"""
    return {
        "valid": [...],
        "edge_case": [...],
        "invalid": [...]
    }

@pytest.fixture(scope="session")
def database_session():
    """Database for integration tests"""
    db = create_test_db()
    yield db
    db.cleanup()
```

### TDD Cycle

1. **Red**: Write failing test (defines what success looks like)
2. **Green**: Write minimal code to make test pass
3. **Refactor**: Improve code while keeping tests green
4. **Repeat**: Next test case

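As a concrete illustration, here is a minimal sketch of one pass through the cycle, using a hypothetical `slugify` helper (the function name and behavior are illustrative assumptions, not part of this template):

```python
# Red: write the test first; it fails because slugify() does not exist yet.
def test_slugify_lowercases_and_hyphenates():
    assert slugify("Hello World") == "hello-world"

# Green: the smallest implementation that makes the test pass.
def slugify(text: str) -> str:
    return text.lower().replace(" ", "-")

# Refactor: e.g. collapse repeated whitespace while the test stays green:
#     return "-".join(text.lower().split())
# Repeat: add the next failing test (punctuation, empty input, ...) and go again.
```
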
## EDA Scaffold

Use when exploring a new dataset. Follow a systematic plan to understand data quality and patterns.

### Quick Template

```python
# 1. DATA OVERVIEW
# Load and inspect
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_[format]('data.csv')  # replace with read_csv, read_parquet, etc.

# Basic info
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(df.dtypes)
print(df.head())
print(df.info())
print(df.describe())

# 2. DATA QUALITY CHECKS
# Missing values
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
print(missing_pct[missing_pct > 0])

# Duplicates
print(f"Duplicates: {df.duplicated().sum()}")

# Data type consistency
print("Check: Are numeric columns actually numeric?")
print("Check: Are dates parsed correctly?")
print("Check: Are categorical variables encoded properly?")

# 3. UNIVARIATE ANALYSIS
# Numeric: mean, median, std, range, distribution plots, outliers (IQR method)
for col in df.select_dtypes(include=[np.number]).columns:
    print(f"{col}: mean={df[col].mean():.2f}, median={df[col].median():.2f}, std={df[col].std():.2f}")
    df[col].hist(bins=50); plt.title(f'{col} Distribution'); plt.show()
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    outliers = ((df[col] < (Q1 - 1.5*(Q3-Q1))) | (df[col] > (Q3 + 1.5*(Q3-Q1)))).sum()
    print(f"  Outliers: {outliers} ({outliers/len(df)*100:.1f}%)")

# Categorical: value counts, unique values, bar plots
for col in df.select_dtypes(include=['object', 'category']).columns:
    print(f"{col}: {df[col].nunique()} unique, most common={df[col].mode()[0]}")
    df[col].value_counts().head(10).plot(kind='bar'); plt.show()

# 4. BIVARIATE ANALYSIS
# Correlation heatmap, pairplots, categorical vs. numeric boxplots
sns.heatmap(df.select_dtypes(include=[np.number]).corr(), annot=True, cmap='coolwarm'); plt.show()
sns.pairplot(df[['var1', 'var2', 'var3', 'target']], hue='target'); plt.show()
# For each categorical-numeric pair, create boxplots to compare distributions

# 5. INSIGHTS & NEXT STEPS
print("\n=== KEY FINDINGS ===")
print("1. Data quality: [summary]")
print("2. Distributions: [any skewness, outliers]")
print("3. Correlations: [strong relationships found]")
print("4. Missing patterns: [systematic missingness?]")
print("\n=== RECOMMENDED ACTIONS ===")
print("1. Handle missing data: [imputation strategy]")
print("2. Address outliers: [cap, remove, transform]")
print("3. Feature engineering: [ideas based on EDA]")
print("4. Data transformations: [log, standardize, encode]")
```

### EDA Checklist

- [ ] Load data and check shape/dtypes
- [ ] Assess missing values (how much, which variables, patterns?)
- [ ] Check for duplicates
- [ ] Validate data types (numeric, categorical, dates)
- [ ] Univariate analysis (distributions, outliers, summary stats)
- [ ] Bivariate analysis (correlations, relationships with target)
- [ ] Identify data quality issues
- [ ] Document insights and recommended next steps

## Statistical Analysis Scaffold

Use for hypothesis testing, A/B tests, comparing groups.

### Quick Template

```python
# STATISTICAL ANALYSIS SCAFFOLD

# 1. DEFINE RESEARCH QUESTION
question = "Does treatment X improve outcome Y?"

# 2. STATE HYPOTHESES
H0 = "Treatment X has no effect on outcome Y (null hypothesis)"
H1 = "Treatment X improves outcome Y (alternative hypothesis)"

# 3. SET SIGNIFICANCE LEVEL
alpha = 0.05  # 5% significance level (Type I error rate)
power = 0.80  # 80% power (1 - Type II error rate)

# 4. CHECK ASSUMPTIONS (t-test: independence, normality, equal variance)
from scipy import stats
_, p_norm = stats.shapiro(treatment_group)                # Normality test
_, p_var = stats.levene(treatment_group, control_group)   # Equal variance test
print(f"Normality: p={p_norm:.3f} {'✓' if p_norm > 0.05 else '✗ use non-parametric'}")
print(f"Equal variance: p={p_var:.3f} {'✓' if p_var > 0.05 else '✗ use Welch t-test'}")

# 5. PERFORM STATISTICAL TEST
# Choose the appropriate test based on data type and assumptions

# For continuous outcome, 2 groups:
statistic, p_value = stats.ttest_ind(treatment_group, control_group)
print(f"t-statistic: {statistic:.3f}, p-value: {p_value:.4f}")

# For categorical outcome, 2 groups:
import pandas as pd
from scipy.stats import chi2_contingency
contingency_table = pd.crosstab(df['group'], df['outcome'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-square: {chi2:.3f}, p-value: {p_value:.4f}")

# 6. INTERPRET RESULTS & EFFECT SIZE
if p_value < alpha:
    n1, n2 = len(treatment_group), len(control_group)
    pooled_std = (((n1 - 1) * treatment_group.std()**2 + (n2 - 1) * control_group.std()**2) / (n1 + n2 - 2)) ** 0.5
    cohen_d = (treatment_group.mean() - control_group.mean()) / pooled_std
    effect = "Small" if abs(cohen_d) < 0.5 else "Medium" if abs(cohen_d) < 0.8 else "Large"
    print(f"REJECT H0 (p={p_value:.4f}). Effect size (Cohen's d)={cohen_d:.3f} ({effect})")
else:
    print(f"FAIL TO REJECT H0 (p={p_value:.4f}). Insufficient evidence for effect.")

# 7. CONFIDENCE INTERVAL & SENSITIVITY
ci_95 = stats.t.interval(0.95, len(treatment_group)-1, loc=treatment_group.mean(), scale=stats.sem(treatment_group))
print(f"95% CI: [{ci_95[0]:.2f}, {ci_95[1]:.2f}]")
print("Sensitivity: Check without outliers, with non-parametric test, with confounders")
```

### Statistical Test Selection

| Data Type | # Groups | Test |
|-----------|----------|------|
| Continuous | 2 | t-test (or Welch's if unequal variance) |
| Continuous | 3+ | ANOVA (or Kruskal-Wallis if non-normal) |
| Categorical | 2 | Chi-square or Fisher's exact |
| Ordinal | 2 | Mann-Whitney U |
| Paired/Repeated | 2 | Paired t-test or Wilcoxon signed-rank |

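The selection logic for two independent groups can also be scripted. Below is a minimal sketch (the group arrays `a`, `b` and the alpha threshold are assumptions); it is a convenience, not a substitute for checking assumptions yourself.

```python
from scipy import stats

def compare_two_groups(a, b, alpha=0.05):
    """Pick t-test, Welch's t-test, or Mann-Whitney U based on assumption checks."""
    _, p_norm_a = stats.shapiro(a)
    _, p_norm_b = stats.shapiro(b)
    if min(p_norm_a, p_norm_b) < alpha:
        # Non-normal data: fall back to the rank-based Mann-Whitney U test
        return "mann-whitney", stats.mannwhitneyu(a, b)
    _, p_var = stats.levene(a, b)
    equal_var = p_var >= alpha
    # Equal variances -> Student's t-test; otherwise Welch's t-test
    return ("t-test" if equal_var else "welch"), stats.ttest_ind(a, b, equal_var=equal_var)
```
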
## Validation Scaffold

Use for validating data quality, code quality, or model quality before shipping.

### Data Validation Template

```python
# DATA VALIDATION CHECKLIST

# 1. SCHEMA VALIDATION
expected_columns = ['id', 'timestamp', 'value', 'category']
assert set(df.columns) == set(expected_columns), "Column mismatch"

expected_dtypes = {'id': 'int64', 'timestamp': 'datetime64[ns]', 'value': 'float64', 'category': 'object'}
for col, dtype in expected_dtypes.items():
    assert df[col].dtype == dtype, f"{col} type mismatch: expected {dtype}, got {df[col].dtype}"

# 2. RANGE VALIDATION
assert df['value'].min() >= 0, "Negative values found (should be >= 0)"
assert df['value'].max() <= 100, "Values exceed maximum (should be <= 100)"

# 3. UNIQUENESS VALIDATION
assert df['id'].is_unique, "Duplicate IDs found"

# 4. COMPLETENESS VALIDATION
required_fields = ['id', 'value']
for field in required_fields:
    missing_pct = df[field].isnull().mean() * 100
    assert missing_pct == 0, f"{field} has {missing_pct:.1f}% missing (required field)"

# 5. CONSISTENCY VALIDATION
assert (df['start_date'] <= df['end_date']).all(), "start_date after end_date found"

# 6. REFERENTIAL INTEGRITY
valid_categories = ['A', 'B', 'C']
assert df['category'].isin(valid_categories).all(), "Invalid categories found"

print("✓ All data validations passed")
```

### Code Validation Checklist

- [ ] **Unit tests**: All functions have tests covering happy path, edge cases, errors
- [ ] **Integration tests**: APIs, database interactions tested end-to-end
- [ ] **Test coverage**: ≥80% coverage for critical paths (see the coverage sketch after this list)
- [ ] **Error handling**: All exceptions caught and handled gracefully
- [ ] **Input validation**: All user inputs validated before processing
- [ ] **Logging**: Key operations logged for debugging
- [ ] **Documentation**: Functions have docstrings, README updated
- [ ] **Performance**: No obvious performance bottlenecks (profiled if needed)
- [ ] **Security**: No hardcoded secrets, SQL injection protected, XSS prevented

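A minimal sketch of enforcing the coverage item, assuming the pytest-cov plugin and a `src/` layout (commands shown as comments, matching the mutmut example above):

```python
# Install: pip install pytest-cov
# Run:     pytest --cov=src --cov-report=term-missing --cov-fail-under=80
# The run fails when total coverage drops below 80%, so the check can sit in CI.
```
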
### Model Validation Checklist

- [ ] **Train/val/test split**: Data split before any preprocessing (no data leakage)
- [ ] **Baseline model**: Simple baseline implemented for comparison (see the sketch after this list)
- [ ] **Cross-validation**: k-fold CV performed (k≥5)
- [ ] **Metrics**: Appropriate metrics chosen (accuracy, precision/recall, AUC, RMSE, etc.)
- [ ] **Overfitting check**: Training vs. validation performance compared
- [ ] **Error analysis**: Failure modes analyzed, edge cases identified
- [ ] **Fairness**: Model checked for bias across sensitive groups
- [ ] **Interpretability**: Feature importance or SHAP values computed
- [ ] **Robustness**: Model tested with perturbed inputs
- [ ] **Monitoring**: Drift detection and performance tracking in place

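A minimal sketch of the first few items (split, baseline, cross-validation, overfitting check), assuming a pandas DataFrame `df` with feature columns `X_cols` and a binary `target` column; all names here are illustrative assumptions:

```python
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

# Split before any fitting/preprocessing to avoid leakage
X_train, X_test, y_train, y_test = train_test_split(
    df[X_cols], df['target'], test_size=0.2, random_state=42, stratify=df['target'])

# Baseline: predict the majority class; any real model should beat this
baseline = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
print(f"Baseline accuracy: {baseline.score(X_test, y_test):.3f}")

# Candidate model with 5-fold cross-validation on the training split only
model = LogisticRegression(max_iter=1000)
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print(f"CV accuracy: {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}")

# Overfitting check: compare train vs. held-out test performance
model.fit(X_train, y_train)
print(f"Train: {model.score(X_train, y_train):.3f}, Test: {model.score(X_test, y_test):.3f}")
```
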
## Quality Checklist

Before delivering, verify:

**Scaffold Structure:**
- [ ] Clear step-by-step process defined
- [ ] Each step has concrete actions (not vague advice)
- [ ] Validation checkpoints included
- [ ] Expected outputs specified

**Completeness:**
- [ ] Covers all requirements from the user's task
- [ ] Includes example code/pseudocode where helpful
- [ ] Anticipates edge cases and error conditions
- [ ] Provides decision guidance (when to use which approach)

**Clarity:**
- [ ] Assumptions stated explicitly
- [ ] Technical terms defined or illustrated
- [ ] Success criteria clear
- [ ] Next steps obvious

**Actionability:**
- [ ] User can execute scaffold without further guidance
- [ ] Code snippets are runnable (or nearly runnable)
- [ ] Gaps surfaced early (missing data, unclear requirements)
- [ ] Includes validation/quality checks

**Rubric Score:**
- [ ] Self-assessed with rubric ≥ 3.5 average