Initial commit

Zhongwei Li
2025-11-30 08:38:26 +08:00
commit 41d9f6b189
304 changed files with 98322 additions and 0 deletions

@@ -0,0 +1,314 @@
{
"criteria": [
{
"name": "Scaffold Structure Clarity",
"description": "Is the scaffold structure clear, systematic, and easy to follow?",
"scoring": {
"1": "No clear structure. Random collection of steps/checks without logical flow.",
"2": "Basic structure but steps are vague or out of order. User confused about what to do next.",
"3": "Clear structure with defined steps. User can follow but may need clarification on some steps.",
"4": "Well-organized structure with clear steps, checkpoints, and expected outputs at each stage.",
"5": "Exemplary structure: systematic, numbered steps with clear inputs/outputs, decision points explicit."
},
"red_flags": [
"Steps not numbered or sequenced",
"No clear starting/ending point",
"Validation steps missing",
"User must guess what to do next"
]
},
{
"name": "Coverage Completeness",
"description": "Does the scaffold cover all necessary aspects (happy path, edge cases, validation, etc.)?",
"scoring": {
"1": "Major gaps. Only covers happy path, ignores edge cases/errors/validation.",
"2": "Partial coverage. Addresses main case but misses important edge cases or validation steps.",
"3": "Adequate coverage. Main cases and some edge cases covered. Basic validation included.",
"4": "Comprehensive coverage. Happy path, edge cases, error conditions, validation all included.",
"5": "Exhaustive coverage. All cases, validation at each step, robustness checks, limitations documented."
},
"red_flags": [
"TDD scaffold: No tests for edge cases or errors",
"EDA scaffold: Missing data quality checks",
"Statistical scaffold: No assumption checks",
"Any scaffold: No validation step before delivering"
]
},
{
"name": "Technical Rigor",
"description": "Is the approach technically sound with appropriate methods/tests?",
"scoring": {
"1": "Technically incorrect. Wrong methods, flawed logic, or inappropriate techniques.",
"2": "Questionable rigor. Some techniques correct but others questionable or missing justification.",
"3": "Adequate rigor. Standard techniques applied correctly. Acceptable for routine work.",
"4": "High rigor. Appropriate methods, assumptions checked, sensitivity analysis included.",
"5": "Exemplary rigor. Best practices followed, multiple validation approaches, limitations acknowledged."
},
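"example_sketch": {
"note": "Illustrative sketch of one red flag below: split before preprocessing so no test-set information leaks into the scaler. X, y, and the generated data are placeholders, not part of the rubric.",
"language": "python",
"code": [
"import numpy as np",
"from sklearn.model_selection import train_test_split",
"from sklearn.preprocessing import StandardScaler",
"",
"rng = np.random.default_rng(0)                 # placeholder data for illustration",
"X = rng.normal(size=(100, 3))",
"y = (X[:, 0] > 0).astype(int)",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)",
"scaler = StandardScaler().fit(X_train)         # fit on the training set only",
"X_train_s = scaler.transform(X_train)",
"X_test_s = scaler.transform(X_test)            # reuse training statistics: no leakage"
]
},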
"red_flags": [
"Causal inference without DAG or identification strategy",
"Statistical test without checking assumptions",
"ML model without train/val/test split (data leakage)",
"TDD without testing error conditions"
]
},
{
"name": "Actionability",
"description": "Can user execute scaffold without further guidance? Are examples concrete?",
"scoring": {
"1": "Not actionable. Vague advice, no concrete steps, no code examples.",
"2": "Somewhat actionable. General direction but user needs to figure out details.",
"3": "Actionable. Clear steps with code snippets. User can execute with minor adjustments.",
"4": "Highly actionable. Complete code examples, data assumptions stated, ready to adapt.",
"5": "Immediately executable. Copy-paste ready examples with inline comments, expected outputs shown."
},
"red_flags": [
"No code examples (just prose descriptions)",
"Code has placeholders without explaining what to fill in",
"No example inputs/outputs",
"Vague instructions ('check assumptions', 'validate results' without saying how)"
]
},
{
"name": "Test Quality (for TDD)",
"description": "For TDD scaffolds: Do tests cover happy path, edge cases, errors, and integration?",
"scoring": {
"1": "Only happy path tests. No edge cases, errors, or integration tests.",
"2": "Happy path + some edge cases. Error handling or integration missing.",
"3": "Happy path, edge cases, basic error tests. Integration tests may be missing.",
"4": "Comprehensive: Happy path, edge cases, error conditions, integration tests all present.",
"5": "Exemplary: Above + property-based tests, test fixtures, mocks for external dependencies."
},
"red_flags": [
"No tests for None/empty input",
"No tests for expected exceptions",
"No tests for state changes/side effects",
"No integration tests for external systems"
],
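"example_sketch": {
"note": "Minimal pytest sketch; parse_order and the orders module are hypothetical names, used only to show happy-path, empty-input, None-input, and expected-exception tests side by side.",
"language": "python",
"code": [
"import pytest",
"",
"from orders import parse_order  # hypothetical module under test",
"",
"def test_happy_path():",
"    assert parse_order({'id': 1, 'qty': 2})['qty'] == 2",
"",
"def test_empty_input_raises():",
"    with pytest.raises(ValueError):",
"        parse_order({})",
"",
"def test_none_input_raises():",
"    with pytest.raises(TypeError):",
"        parse_order(None)"
]
},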
"applicable_to": ["TDD"]
},
{
"name": "Data Quality Assessment (for EDA)",
"description": "For EDA scaffolds: Are data quality checks (missing, duplicates, outliers, consistency) included?",
"scoring": {
"1": "No data quality checks. Jumps straight to analysis without inspecting data.",
"2": "Minimal checks. Maybe checks missing values but ignores duplicates, outliers, consistency.",
"3": "Basic quality checks. Missing values, duplicates, basic outliers checked.",
"4": "Thorough quality checks. Missing patterns, duplicates, outliers, type consistency, referential integrity.",
"5": "Comprehensive quality framework. All checks + distributions, cardinality, data lineage, validation rules."
},
"red_flags": [
"No check for missing values",
"No check for duplicates",
"No outlier detection",
"Assumes data is clean without validation"
],
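"example_sketch": {
"note": "Minimal pandas sketch of the checks named in the red flags above; data.csv and the crude z-score outlier rule are assumptions for illustration only.",
"language": "python",
"code": [
"import pandas as pd",
"",
"df = pd.read_csv('data.csv')            # assumed input file",
"print(df.isna().sum())                  # missing values per column",
"print(df.duplicated().sum())            # duplicate rows",
"print(df.dtypes)                        # type consistency",
"num = df.select_dtypes('number')",
"z = (num - num.mean()) / num.std()",
"print((z.abs() > 3).sum())              # crude z-score outlier count per column"
]
},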
"applicable_to": ["EDA", "Statistical Analysis", "Predictive Modeling"]
},
{
"name": "Assumption Documentation",
"description": "Are assumptions explicitly stated and justified?",
"scoring": {
"1": "No assumptions stated. User unaware of what's being assumed.",
"2": "Some assumptions implicit but not documented. User must infer them.",
"3": "Key assumptions stated but not justified or validated.",
"4": "Assumptions explicitly stated with justification. User knows what's assumed and why.",
"5": "Assumptions stated, justified, validated where possible, and sensitivity to violations analyzed."
},
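"example_sketch": {
"note": "Minimal SciPy sketch: state and check normality and equal-variance assumptions before choosing a two-sample test; group_a and group_b are synthetic placeholders.",
"language": "python",
"code": [
"import numpy as np",
"from scipy import stats",
"",
"rng = np.random.default_rng(1)                   # placeholder data for illustration",
"group_a, group_b = rng.normal(size=80), rng.normal(0.3, 1, 80)",
"normal = stats.shapiro(group_a).pvalue > 0.05 and stats.shapiro(group_b).pvalue > 0.05",
"equal_var = stats.levene(group_a, group_b).pvalue > 0.05",
"if normal:",
"    print(stats.ttest_ind(group_a, group_b, equal_var=equal_var))",
"else:",
"    print(stats.mannwhitneyu(group_a, group_b))  # non-parametric fallback"
]
},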
"red_flags": [
"Statistical test applied without stating/checking assumptions",
"Causal claim without stating identification assumptions",
"ML model without documenting train/test split assumptions",
"Function implementation without stating preconditions"
]
},
{
"name": "Validation Steps Included",
"description": "Does scaffold include validation/quality checks before delivering results?",
"scoring": {
"1": "No validation. Results delivered without any quality checks.",
"2": "Informal validation. 'Looks good' without systematic checks.",
"3": "Basic validation. Some checks but not comprehensive or systematic.",
"4": "Systematic validation. Checklist of quality criteria, most items checked.",
"5": "Rigorous validation framework. Multiple validation approaches, robustness checks, edge cases tested."
},
"red_flags": [
"No validation step in workflow",
"No rubric or checklist to assess quality",
"No test suite execution before delivering code",
"No sensitivity analysis for statistical results"
]
},
{
"name": "Code/Analysis Quality",
"description": "Is code well-structured, readable, and following best practices?",
"scoring": {
"1": "Poor quality. Spaghetti code, no structure, hard to understand.",
"2": "Low quality. Works but hard to read, poor naming, no comments.",
"3": "Adequate quality. Readable, basic structure, some comments. Acceptable for prototypes.",
"4": "Good quality. Clean code, good naming, appropriate comments, follows style guide.",
"5": "Excellent quality. Modular, DRY, well-documented, type hints, follows SOLID principles."
},
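"example_sketch": {
"note": "Illustrative sketch of the style the top scores describe: a named constant instead of a magic number, type hints, a docstring, and a precondition check; VAT_RATE and gross_price are made-up names.",
"language": "python",
"code": [
"VAT_RATE = 0.21  # named constant; the rate itself is an assumed example value",
"",
"def gross_price(net_price: float, vat_rate: float = VAT_RATE) -> float:",
"    '''Return the gross price for a non-negative net price at the given VAT rate.'''",
"    if net_price < 0:",
"        raise ValueError('net_price must be non-negative')",
"    return net_price * (1 + vat_rate)"
]
},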
"red_flags": [
"Magic numbers without explanation",
"Copy-pasted code (not DRY)",
"Functions doing multiple unrelated things",
"No docstrings or comments explaining complex logic"
]
},
{
"name": "Reproducibility",
"description": "Can another person reproduce the analysis/tests with provided information?",
"scoring": {
"1": "Not reproducible. Missing critical information (data, packages, random seeds).",
"2": "Partially reproducible. Some information provided but key details missing.",
"3": "Mostly reproducible. Enough information for skilled practitioner to reproduce with effort.",
"4": "Reproducible. All information provided (data access, package versions, random seeds, parameters).",
"5": "Fully reproducible. Documented environment, requirements.txt, Docker container, or notebook with all steps."
},
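"example_sketch": {
"note": "Minimal sketch of reproducibility hygiene: fixed seeds and recorded package versions, paired with pinned versions in a requirements.txt or lock file.",
"language": "python",
"code": [
"import random",
"",
"import numpy as np",
"import pandas as pd",
"",
"SEED = 42",
"random.seed(SEED)",
"rng = np.random.default_rng(SEED)   # pass rng (or SEED) to every random step",
"print('numpy', np.__version__, 'pandas', pd.__version__)"
]
},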
"red_flags": [
"No package versions specified",
"Random operations without setting seed",
"Data source not documented or inaccessible",
"No instructions for running tests/analysis"
]
}
],
"task_type_guidance": {
"TDD": {
"description": "Test-Driven Development scaffolds",
"focus_criteria": [
"Test Quality",
"Code/Analysis Quality",
"Validation Steps Included"
],
"target_score": 3.5,
"success_indicators": [
"Tests written before implementation",
"Happy path, edge cases, errors all tested",
"Tests pass and are maintainable",
"Red-Green-Refactor cycle followed"
]
},
"EDA": {
"description": "Exploratory Data Analysis scaffolds",
"focus_criteria": [
"Data Quality Assessment",
"Coverage Completeness",
"Assumption Documentation"
],
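"example_sketch": {
"note": "Minimal pandas sketch of a univariate and bivariate pass; data.csv, segment, and revenue are hypothetical names.",
"language": "python",
"code": [
"import pandas as pd",
"",
"df = pd.read_csv('data.csv')                      # assumed input",
"print(df.describe(include='all'))                 # univariate summary",
"print(df['segment'].value_counts(dropna=False))   # assumed categorical column",
"print(df.corr(numeric_only=True))                 # bivariate: numeric correlations",
"print(df.groupby('segment')['revenue'].mean())    # assumed numeric column by group"
]
},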
"target_score": 3.5,
"success_indicators": [
"Data quality systematically checked",
"Univariate and bivariate analysis completed",
"Insights and recommendations documented",
"Missing values, outliers, distributions analyzed"
]
},
"Statistical Analysis": {
"description": "Hypothesis testing, A/B tests, causal inference",
"focus_criteria": [
"Technical Rigor",
"Assumption Documentation",
"Validation Steps Included"
],
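"example_sketch": {
"note": "Minimal SciPy sketch reporting an effect size and a confidence interval alongside the test statistic; group_a and group_b are synthetic placeholders, and the confidence_interval call assumes SciPy 1.10 or newer.",
"language": "python",
"code": [
"import numpy as np",
"from scipy import stats",
"",
"rng = np.random.default_rng(2)                   # placeholder data for illustration",
"group_a, group_b = rng.normal(10, 2, 200), rng.normal(10.5, 2, 200)",
"res = stats.ttest_ind(group_a, group_b, equal_var=False)   # Welch t-test",
"print(res.statistic, res.pvalue)",
"print(res.confidence_interval(confidence_level=0.95))      # CI of the mean difference",
"pooled_sd = np.sqrt((group_a.var(ddof=1) + group_b.var(ddof=1)) / 2)",
"print('Cohen d:', (group_a.mean() - group_b.mean()) / pooled_sd)"
]
},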
"target_score": 4.0,
"success_indicators": [
"Hypotheses clearly stated",
"Appropriate test selected and justified",
"Assumptions checked (normality, independence, etc.)",
"Effect sizes and confidence intervals reported",
"Sensitivity analysis performed"
]
},
"Predictive Modeling": {
"description": "ML model building and evaluation",
"focus_criteria": [
"Technical Rigor",
"Validation Steps Included",
"Reproducibility"
],
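"example_sketch": {
"note": "Minimal scikit-learn sketch: a dummy baseline, a pipeline whose scaler is fit only on training folds (no leakage), and cross-validation on the training set; the generated dataset is a placeholder.",
"language": "python",
"code": [
"from sklearn.datasets import make_classification",
"from sklearn.dummy import DummyClassifier",
"from sklearn.linear_model import LogisticRegression",
"from sklearn.model_selection import cross_val_score, train_test_split",
"from sklearn.pipeline import make_pipeline",
"from sklearn.preprocessing import StandardScaler",
"",
"X, y = make_classification(n_samples=500, random_state=0)  # placeholder data",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)",
"baseline = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)",
"model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))",
"print('cv accuracy:', cross_val_score(model, X_train, y_train, cv=5).mean())",
"print('baseline:', baseline.score(X_test, y_test))",
"print('model:', model.fit(X_train, y_train).score(X_test, y_test))"
]
},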
"target_score": 4.0,
"success_indicators": [
"Train/val/test split before preprocessing (no data leakage)",
"Baseline model for comparison",
"Cross-validation performed",
"Error analysis and feature importance computed",
"Model deployment checklist completed"
]
},
"Validation": {
"description": "Data/code/model quality checks",
"focus_criteria": [
"Coverage Completeness",
"Validation Steps Included",
"Technical Rigor"
],
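"example_sketch": {
"note": "Minimal pandas sketch of schema and referential-integrity checks; orders.csv, customers.csv, and the column names are hypothetical.",
"language": "python",
"code": [
"import pandas as pd",
"",
"orders = pd.read_csv('orders.csv')               # assumed inputs",
"customers = pd.read_csv('customers.csv')",
"assert orders['order_id'].is_unique              # key constraint",
"assert orders['qty'].between(1, 1000).all()      # range constraint",
"assert orders['status'].isin(['open', 'shipped', 'cancelled']).all()  # allowed values",
"assert orders['customer_id'].notna().all()       # not-null constraint",
"assert orders['customer_id'].isin(customers['customer_id']).all()     # referential integrity"
]
},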
"target_score": 4.0,
"success_indicators": [
"Schema validation (types, ranges, constraints)",
"Referential integrity checked",
"Edge cases tested",
"Monitoring/alerting strategy defined"
]
}
},
"common_failure_modes": [
{
"failure_mode": "Jumping to Implementation Without Scaffold",
"symptoms": "User writes code/analysis immediately without planning structure first.",
"consequences": "Missing edge cases, poor test coverage, incomplete analysis.",
"fix": "Force scaffold creation before implementation. Use template as checklist."
},
{
"failure_mode": "Testing Only Happy Path",
"symptoms": "TDD scaffold has tests for expected usage but none for errors/edge cases.",
"consequences": "Code breaks in production on unexpected inputs.",
"fix": "Require tests for: empty input, None, boundary values, invalid types, expected exceptions."
},
{
"failure_mode": "Skipping Data Quality Checks",
"symptoms": "EDA scaffold jumps to visualization without checking missing values, outliers, duplicates.",
"consequences": "Invalid conclusions based on dirty data.",
"fix": "Mandatory data quality section before any analysis. No exceptions."
},
{
"failure_mode": "Assumptions Not Documented",
"symptoms": "Statistical test applied without stating/checking assumptions (normality, independence, etc.).",
"consequences": "Invalid statistical inference. Wrong conclusions.",
"fix": "Explicit assumption section in scaffold. Check assumptions before applying test."
},
{
"failure_mode": "No Validation Step",
"symptoms": "Scaffold delivers results without any quality check or self-assessment.",
"consequences": "Low-quality outputs, errors not caught.",
"fix": "Mandatory validation step in workflow. Use rubric self-assessment."
},
{
"failure_mode": "Correlation Interpreted as Causation",
"symptoms": "EDA finds correlation, claims causal relationship without causal inference methods.",
"consequences": "Wrong business decisions based on spurious causality.",
"fix": "Distinguish predictive (correlation) from causal questions. Use causal inference methodology if claiming causation."
},
{
"failure_mode": "Data Leakage in ML",
"symptoms": "Preprocessing (scaling, imputation) done before train/test split.",
"consequences": "Overly optimistic model performance. Fails in production.",
"fix": "Scaffold enforces: split first, then preprocess. Fit transformers on train only."
},
{
"failure_mode": "Code Without Tests",
"symptoms": "Implementation provided but no test scaffold or test execution.",
"consequences": "Regressions not caught, bugs in production.",
"fix": "TDD scaffold mandatory for production code. Tests must pass before code review."
}
],
"scale": 5,
"minimum_average_score": 3.5,
"interpretation": {
"1.0-2.0": "Inadequate. Major gaps in structure, coverage, or rigor. Do not use. Revise scaffold.",
"2.0-3.0": "Needs improvement. Basic structure present but incomplete or lacks rigor. Acceptable for learning/practice only.",
"3.0-3.5": "Acceptable. Covers main cases with adequate rigor. Suitable for routine work or prototypes.",
"3.5-4.0": "Good. Comprehensive coverage with good rigor. Suitable for production code/analysis.",
"4.0-5.0": "Excellent. Exemplary structure, rigor, and completeness. Production-ready with best practices."
}
}