Initial commit

2025-11-30 08:30:14 +08:00
commit 1dd5bee3b4
335 changed files with 147360 additions and 0 deletions
--- a/skills/clinical-decision-support/scripts/validate_cds_document.py
+++ b/skills/clinical-decision-support/scripts/validate_cds_document.py
@@ -0,0 +1,335 @@
+#!/usr/bin/env python3
+"""
+Validate Clinical Decision Support Documents for Quality and Completeness
+
+Checks for:
+- Evidence citations for all recommendations
+- Statistical reporting completeness
+- Biomarker nomenclature consistency
+- Required sections present
+- HIPAA de-identification
+- GRADE recommendation format
+
+Dependencies: None (pure Python)
+"""
+
+import re
+import argparse
+from pathlib import Path
+from collections import defaultdict
+
+
+class CDSValidator:
+    """Validator for clinical decision support documents."""
+    
+    def __init__(self, filepath):
+        self.filepath = filepath
+        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+            self.content = f.read()
+        
+        self.errors = []
+        self.warnings = []
+        self.info = []
+    
+    def validate_all(self):
+        """Run all validation checks."""
+        
+        print(f"Validating: {self.filepath}")
+        print("="*70)
+        
+        self.check_required_sections()
+        self.check_evidence_citations()
+        self.check_recommendation_grading()
+        self.check_statistical_reporting()
+        self.check_hipaa_identifiers()
+        self.check_biomarker_nomenclature()
+        
+        return self.generate_report()
+    
+    def check_required_sections(self):
+        """Check if required sections are present."""
+        
+        # Cohort analysis required sections
+        cohort_sections = [
+            'cohort characteristics',
+            'biomarker',
+            'outcomes',
+            'statistical analysis',
+            'clinical implications',
+            'references'
+        ]
+        
+        # Treatment recommendation required sections
+        rec_sections = [
+            'evidence',
+            'recommendation',
+            'monitoring',
+            'references'
+        ]
+        
+        content_lower = self.content.lower()
+        
+        # Check which document type
+        is_cohort = 'cohort' in content_lower
+        is_recommendation = 'recommendation' in content_lower
+        
+        if is_cohort:
+            missing = [sec for sec in cohort_sections if sec not in content_lower]
+            if missing:
+                self.warnings.append(f"Cohort analysis may be missing sections: {', '.join(missing)}")
+            else:
+                self.info.append("All cohort analysis sections present")
+        
+        if is_recommendation:
+            missing = [sec for sec in rec_sections if sec not in content_lower]
+            if missing:
+                self.errors.append(f"Recommendation document missing required sections: {', '.join(missing)}")
+            else:
+                self.info.append("All recommendation sections present")
+    
+    def check_evidence_citations(self):
+        """Check that recommendations have citations."""
+        
+        # Find recommendation statements
+        rec_pattern = r'(recommend|should|prefer|suggest|consider)(.*?)(?:\n\n|\Z)'
+        recommendations = re.findall(rec_pattern, self.content, re.IGNORECASE | re.DOTALL)
+        
+        # Find citations  
+        citation_patterns = [
+            r'\[\d+\]',  # Numbered citations [1]
+            r'\(.*?\d{4}\)',  # Author year (Smith 2020)
+            r'et al\.',  # Et al citations
+            r'NCCN|ASCO|ESMO',  # Guideline references
+        ]
+        
+        uncited_recommendations = []
+        
+        for i, (_, rec_text) in enumerate(recommendations):
+            has_citation = any(re.search(pattern, rec_text) for pattern in citation_patterns)
+            
+            if not has_citation:
+                snippet = rec_text[:60].strip() + '...'
+                uncited_recommendations.append(snippet)
+        
+        if uncited_recommendations:
+            self.warnings.append(f"Found {len(uncited_recommendations)} recommendations without citations")
+            for rec in uncited_recommendations[:3]:  # Show first 3
+                self.warnings.append(f"  - {rec}")
+        else:
+            self.info.append(f"All {len(recommendations)} recommendations have citations")
+    
+    def check_recommendation_grading(self):
+        """Check for GRADE-style recommendation strength."""
+        
+        # Look for GRADE notation (1A, 1B, 2A, 2B, 2C)
+        grade_pattern = r'GRADE\s*[12][A-C]|Grade\s*[12][A-C]|\(?\s*[12][A-C]\s*\)?'
+        grades = re.findall(grade_pattern, self.content, re.IGNORECASE)
+        
+        # Look for strong/conditional language
+        strong_pattern = r'(strong|we recommend|should)'
+        conditional_pattern = r'(conditional|weak|we suggest|may consider|could consider)'
+        
+        strong_count = len(re.findall(strong_pattern, self.content, re.IGNORECASE))
+        conditional_count = len(re.findall(conditional_pattern, self.content, re.IGNORECASE))
+        
+        if grades:
+            self.info.append(f"Found {len(grades)} GRADE-style recommendations")
+        else:
+            self.warnings.append("No GRADE-style recommendation grading found (1A, 1B, 2A, etc.)")
+        
+        if strong_count > 0 or conditional_count > 0:
+            self.info.append(f"Recommendation language: {strong_count} strong, {conditional_count} conditional")
+        else:
+            self.warnings.append("No clear recommendation strength language (strong/conditional) found")
+    
+    def check_statistical_reporting(self):
+        """Check for proper statistical reporting."""
+        
+        # Check for p-values
+        p_values = re.findall(r'p\s*[=<>]\s*[\d.]+', self.content, re.IGNORECASE)
+        
+        # Check for confidence intervals
+        ci_pattern = r'95%\s*CI|confidence interval'
+        cis = re.findall(ci_pattern, self.content, re.IGNORECASE)
+        
+        # Check for hazard ratios
+        hr_pattern = r'HR\s*[=:]\s*[\d.]+'
+        hrs = re.findall(hr_pattern, self.content)
+        
+        # Check for sample sizes
+        n_pattern = r'n\s*=\s*\d+'
+        sample_sizes = re.findall(n_pattern, self.content, re.IGNORECASE)
+        
+        if not p_values:
+            self.warnings.append("No p-values found - statistical significance not reported")
+        else:
+            self.info.append(f"Found {len(p_values)} p-values")
+        
+        if hrs and not cis:
+            self.warnings.append("Hazard ratios reported without confidence intervals")
+        
+        if not sample_sizes:
+            self.warnings.append("Sample sizes (n=X) not clearly reported")
+        
+        # Check for common statistical errors
+        if 'p=0.00' in self.content or 'p = 0.00' in self.content:
+            self.warnings.append("Found p=0.00 (should report as p<0.001 instead)")
+    
+    def check_hipaa_identifiers(self):
+        """Check for potential HIPAA identifiers."""
+        
+        # 18 HIPAA identifiers (simplified check for common ones)
+        identifiers = {
+            'Names': r'Dr\.\s+[A-Z][a-z]+|Patient:\s*[A-Z][a-z]+',
+            'Specific dates': r'\d{1,2}/\d{1,2}/\d{4}',  # MM/DD/YYYY
+            'Phone numbers': r'\d{3}[-.]?\d{3}[-.]?\d{4}',
+            'Email addresses': r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
+            'SSN': r'\d{3}-\d{2}-\d{4}',
+            'MRN': r'MRN\s*:?\s*\d+',
+        }
+        
+        found_identifiers = []
+        
+        for identifier_type, pattern in identifiers.items():
+            matches = re.findall(pattern, self.content)
+            if matches:
+                found_identifiers.append(f"{identifier_type}: {len(matches)} instance(s)")
+        
+        if found_identifiers:
+            self.errors.append("Potential HIPAA identifiers detected:")
+            for identifier in found_identifiers:
+                self.errors.append(f"  - {identifier}")
+            self.errors.append("  ** Ensure proper de-identification before distribution **")
+        else:
+            self.info.append("No obvious HIPAA identifiers detected (basic check only)")
+    
+    def check_biomarker_nomenclature(self):
+        """Check for consistent biomarker nomenclature."""
+        
+        # Common biomarker naming issues
+        issues = []
+        
+        # Check for gene names (should be italicized in LaTeX)
+        gene_names = ['EGFR', 'ALK', 'ROS1', 'BRAF', 'KRAS', 'HER2', 'TP53', 'BRCA1', 'BRCA2']
+        for gene in gene_names:
+            # Check if gene appears but not in italics (\textit{} or \emph{})
+            if gene in self.content:
+                if f'\\textit{{{gene}}}' not in self.content and f'\\emph{{{gene}}}' not in self.content:
+                    if '.tex' in self.filepath.suffix:
+                        issues.append(f"{gene} should be italicized in LaTeX (\\textit{{{gene}}})")
+        
+        # Check for protein vs gene naming
+        # HER2 (protein) vs ERBB2 (gene) - both valid
+        # Check for mutation nomenclature (HGVS format)
+        hgvs_pattern = r'p\.[A-Z]\d+[A-Z]'  # e.g., p.L858R
+        hgvs_mutations = re.findall(hgvs_pattern, self.content)
+        
+        if hgvs_mutations:
+            self.info.append(f"Found {len(hgvs_mutations)} HGVS protein nomenclature (e.g., p.L858R)")
+        
+        # Warn about non-standard mutation format
+        if 'EGFR mutation' in self.content and 'exon' not in self.content.lower():
+            self.warnings.append("EGFR mutation mentioned - specify exon/variant (e.g., exon 19 deletion)")
+        
+        if issues:
+            self.warnings.extend(issues)
+    
+    def generate_report(self):
+        """Generate validation report."""
+        
+        print("\n" + "="*70)
+        print("VALIDATION REPORT")
+        print("="*70)
+        
+        if self.errors:
+            print(f"\n❌ ERRORS ({len(self.errors)}):")
+            for error in self.errors:
+                print(f"  {error}")
+        
+        if self.warnings:
+            print(f"\n⚠️  WARNINGS ({len(self.warnings)}):")
+            for warning in self.warnings:
+                print(f"  {warning}")
+        
+        if self.info:
+            print(f"\n✓ PASSED CHECKS ({len(self.info)}):")
+            for info in self.info:
+                print(f"  {info}")
+        
+        # Overall status
+        print("\n" + "="*70)
+        if self.errors:
+            print("STATUS: ❌ VALIDATION FAILED - Address errors before distribution")
+            return False
+        elif self.warnings:
+            print("STATUS: ⚠️  VALIDATION PASSED WITH WARNINGS - Review recommended")
+            return True
+        else:
+            print("STATUS: ✓ VALIDATION PASSED - Document meets quality standards")
+            return True
+    
+    def save_report(self, output_file):
+        """Save validation report to file."""
+        
+        with open(output_file, 'w') as f:
+            f.write("CLINICAL DECISION SUPPORT DOCUMENT VALIDATION REPORT\n")
+            f.write("="*70 + "\n")
+            f.write(f"Document: {self.filepath}\n")
+            f.write(f"Validated: {Path.cwd()}\n\n")
+            
+            if self.errors:
+                f.write(f"ERRORS ({len(self.errors)}):\n")
+                for error in self.errors:
+                    f.write(f"  - {error}\n")
+                f.write("\n")
+            
+            if self.warnings:
+                f.write(f"WARNINGS ({len(self.warnings)}):\n")
+                for warning in self.warnings:
+                    f.write(f"  - {warning}\n")
+                f.write("\n")
+            
+            if self.info:
+                f.write(f"PASSED CHECKS ({len(self.info)}):\n")
+                for info in self.info:
+                    f.write(f"  - {info}\n")
+        
+        print(f"\nValidation report saved to: {output_file}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Validate clinical decision support documents')
+    parser.add_argument('input_file', type=str, help='Document to validate (.tex, .md, .txt)')
+    parser.add_argument('-o', '--output', type=str, default=None,
+                       help='Save validation report to file')
+    parser.add_argument('--strict', action='store_true',
+                       help='Treat warnings as errors')
+    
+    args = parser.parse_args()
+    
+    # Validate
+    validator = CDSValidator(args.input_file)
+    passed = validator.validate_all()
+    
+    # Save report if requested
+    if args.output:
+        validator.save_report(args.output)
+    
+    # Exit code
+    if args.strict and (validator.errors or validator.warnings):
+        exit(1)
+    elif validator.errors:
+        exit(1)
+    else:
+        exit(0)
+
+
+if __name__ == '__main__':
+    main()
+
+
+# Example usage:
+# python validate_cds_document.py cohort_analysis.tex
+# python validate_cds_document.py treatment_recommendations.tex -o validation_report.txt
+# python validate_cds_document.py document.tex --strict  # Warnings cause failure
+