#!/usr/bin/env python3
"""
Validate Clinical Decision Support Documents for Quality and Completeness

Checks for:
- Evidence citations for all recommendations
- Statistical reporting completeness
- Biomarker nomenclature consistency
- Required sections present
- HIPAA de-identification
- GRADE recommendation format

Dependencies: None (pure Python)
"""

import re
import argparse
from pathlib import Path
from collections import defaultdict


class CDSValidator:
    """Validator for clinical decision support documents.

    The document is read once at construction time. Each ``check_*`` method
    scans ``self.content`` with plain substring/regex heuristics and appends
    human-readable messages to ``self.errors``, ``self.warnings`` or
    ``self.info``.
    """

    def __init__(self, filepath):
        """Read the document at *filepath* and reset the result lists.

        Args:
            filepath: Path of the document to validate (``str`` or ``Path``).

        Raises:
            OSError: If the file cannot be opened.
        """
        self.filepath = filepath
        # Undecodable bytes are dropped rather than aborting the validation.
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            self.content = f.read()

        self.errors = []
        self.warnings = []
        self.info = []

    def validate_all(self):
        """Run all validation checks.

        Returns:
            The value of :meth:`generate_report`: ``False`` when any error
            was recorded, ``True`` otherwise.
        """
        print(f"Validating: {self.filepath}")
        print("="*70)

        self.check_required_sections()
        self.check_evidence_citations()
        self.check_recommendation_grading()
        self.check_statistical_reporting()
        self.check_hipaa_identifiers()
        self.check_biomarker_nomenclature()

        return self.generate_report()

    def check_required_sections(self):
        """Check if required sections are present.

        The document type is guessed from the text: any occurrence of
        "cohort" triggers the cohort checks and any occurrence of
        "recommendation" triggers the recommendation checks (both may apply).
        Section presence is a case-insensitive substring test, not a heading
        parse.
        """
        # Cohort analysis required sections
        cohort_sections = [
            'cohort characteristics',
            'biomarker',
            'outcomes',
            'statistical analysis',
            'clinical implications',
            'references',
        ]

        # Treatment recommendation required sections
        rec_sections = [
            'evidence',
            'recommendation',
            'monitoring',
            'references',
        ]

        content_lower = self.content.lower()

        # Check which document type
        is_cohort = 'cohort' in content_lower
        is_recommendation = 'recommendation' in content_lower

        if is_cohort:
            missing = [sec for sec in cohort_sections if sec not in content_lower]
            if missing:
                self.warnings.append(f"Cohort analysis may be missing sections: {', '.join(missing)}")
            else:
                self.info.append("All cohort analysis sections present")

        if is_recommendation:
            missing = [sec for sec in rec_sections if sec not in content_lower]
            if missing:
                self.errors.append(f"Recommendation document missing required sections: {', '.join(missing)}")
            else:
                self.info.append("All recommendation sections present")

    def check_evidence_citations(self):
        """Check that recommendations have citations.

        A "recommendation" is the text that follows one of the trigger words
        (recommend/should/prefer/suggest/consider) up to the next blank line
        or the end of the document. Only that trailing text is searched for a
        citation marker.
        """
        # Find recommendation statements
        rec_pattern = r'(recommend|should|prefer|suggest|consider)(.*?)(?:\n\n|\Z)'
        recommendations = re.findall(rec_pattern, self.content, re.IGNORECASE | re.DOTALL)

        # Find citations
        citation_patterns = [
            r'\[\d+\]',           # Numbered citations [1]
            r'\(.*?\d{4}\)',      # Author year (Smith 2020)
            r'et al\.',           # Et al citations
            r'NCCN|ASCO|ESMO',    # Guideline references
        ]

        uncited_recommendations = []
        for _, rec_text in recommendations:
            has_citation = any(re.search(pattern, rec_text) for pattern in citation_patterns)
            if not has_citation:
                snippet = rec_text[:60].strip() + '...'
                uncited_recommendations.append(snippet)

        if uncited_recommendations:
            self.warnings.append(f"Found {len(uncited_recommendations)} recommendations without citations")
            for rec in uncited_recommendations[:3]:  # Show first 3
                self.warnings.append(f" - {rec}")
        else:
            self.info.append(f"All {len(recommendations)} recommendations have citations")

    def check_recommendation_grading(self):
        """Check for GRADE-style recommendation strength.

        Records whether any grade token (1A, 1B, 1C, 2A, 2B, 2C) is present,
        and how often strong vs. conditional wording occurs.
        """
        # Look for GRADE notation (1A, 1B, 2A, 2B, 2C).
        # Word boundaries keep the bare form from matching inside larger
        # tokens such as "HbA1c" or "CYP2C19".
        grade_pattern = r'\bGRADE\s*[12][A-C]\b|\(\s*[12][A-C]\s*\)|\b[12][A-C]\b'
        grades = re.findall(grade_pattern, self.content, re.IGNORECASE)

        # Look for strong/conditional language
        strong_pattern = r'(strong|we recommend|should)'
        conditional_pattern = r'(conditional|weak|we suggest|may consider|could consider)'

        strong_count = len(re.findall(strong_pattern, self.content, re.IGNORECASE))
        conditional_count = len(re.findall(conditional_pattern, self.content, re.IGNORECASE))

        if grades:
            self.info.append(f"Found {len(grades)} GRADE-style recommendations")
        else:
            self.warnings.append("No GRADE-style recommendation grading found (1A, 1B, 2A, etc.)")

        if strong_count > 0 or conditional_count > 0:
            self.info.append(f"Recommendation language: {strong_count} strong, {conditional_count} conditional")
        else:
            self.warnings.append("No clear recommendation strength language (strong/conditional) found")

    def check_statistical_reporting(self):
        """Check for proper statistical reporting.

        Looks for p-values, confidence intervals, hazard ratios and sample
        sizes, and warns when a p-value is written as exactly zero.
        """
        # Check for p-values
        p_values = re.findall(r'p\s*[=<>]\s*[\d.]+', self.content, re.IGNORECASE)

        # Check for confidence intervals
        ci_pattern = r'95%\s*CI|confidence interval'
        cis = re.findall(ci_pattern, self.content, re.IGNORECASE)

        # Check for hazard ratios
        hr_pattern = r'HR\s*[=:]\s*[\d.]+'
        hrs = re.findall(hr_pattern, self.content)

        # Check for sample sizes
        n_pattern = r'n\s*=\s*\d+'
        sample_sizes = re.findall(n_pattern, self.content, re.IGNORECASE)

        if not p_values:
            self.warnings.append("No p-values found - statistical significance not reported")
        else:
            self.info.append(f"Found {len(p_values)} p-values")

        if hrs and not cis:
            self.warnings.append("Hazard ratios reported without confidence intervals")

        if not sample_sizes:
            self.warnings.append("Sample sizes (n=X) not clearly reported")

        # Check for common statistical errors.
        # The lookahead rejects a following digit so that legitimate values
        # such as p=0.004 are not reported as "p=0.00".
        if re.search(r'\bp\s*=\s*0\.00+(?!\d)', self.content, re.IGNORECASE):
            self.warnings.append("Found p=0.00 (should report as p<0.001 instead)")

    def check_hipaa_identifiers(self):
        """Check for potential HIPAA identifiers.

        This is a pattern-based screen for a handful of identifier types; any
        hit is recorded as an error together with a per-type count.
        """
        # 18 HIPAA identifiers (simplified check for common ones)
        identifiers = {
            'Names': r'Dr\.\s+[A-Z][a-z]+|Patient:\s*[A-Z][a-z]+',
            'Specific dates': r'\d{1,2}/\d{1,2}/\d{4}',  # MM/DD/YYYY
            'Phone numbers': r'\d{3}[-.]?\d{3}[-.]?\d{4}',
            'Email addresses': r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            'SSN': r'\d{3}-\d{2}-\d{4}',
            'MRN': r'MRN\s*:?\s*\d+',
        }

        found_identifiers = []
        for identifier_type, pattern in identifiers.items():
            matches = re.findall(pattern, self.content)
            if matches:
                found_identifiers.append(f"{identifier_type}: {len(matches)} instance(s)")

        if found_identifiers:
            self.errors.append("Potential HIPAA identifiers detected:")
            for identifier in found_identifiers:
                self.errors.append(f" - {identifier}")
            self.errors.append(" ** Ensure proper de-identification before distribution **")
        else:
            self.info.append("No obvious HIPAA identifiers detected (basic check only)")

    def check_biomarker_nomenclature(self):
        """Check for consistent biomarker nomenclature.

        For ``.tex`` inputs, warns about listed gene symbols that never
        appear wrapped in ``\\textit{}`` or ``\\emph{}``. Also counts
        HGVS-style protein changes and warns when "EGFR mutation" is used
        without any mention of an exon.
        """
        # Common biomarker naming issues
        issues = []

        # ``filepath`` may be a plain string (the CLI passes one), so it is
        # wrapped in Path before asking for the suffix.
        is_latex = Path(self.filepath).suffix.lower() == '.tex'

        # Check for gene names (should be italicized in LaTeX)
        gene_names = ['EGFR', 'ALK', 'ROS1', 'BRAF', 'KRAS', 'HER2', 'TP53', 'BRCA1', 'BRCA2']

        if is_latex:
            for gene in gene_names:
                if gene not in self.content:
                    continue
                # Check if gene appears but not in italics (\textit{} or \emph{})
                italic_forms = (f'\\textit{{{gene}}}', f'\\emph{{{gene}}}')
                if not any(form in self.content for form in italic_forms):
                    issues.append(f"{gene} should be italicized in LaTeX (\\textit{{{gene}}})")

        # Check for protein vs gene naming
        # HER2 (protein) vs ERBB2 (gene) - both valid

        # Check for mutation nomenclature (HGVS format)
        hgvs_pattern = r'p\.[A-Z]\d+[A-Z]'  # e.g., p.L858R
        hgvs_mutations = re.findall(hgvs_pattern, self.content)

        if hgvs_mutations:
            self.info.append(f"Found {len(hgvs_mutations)} HGVS protein nomenclature (e.g., p.L858R)")

        # Warn about non-standard mutation format
        if 'EGFR mutation' in self.content and 'exon' not in self.content.lower():
            self.warnings.append("EGFR mutation mentioned - specify exon/variant (e.g., exon 19 deletion)")

        if issues:
            self.warnings.extend(issues)

    def generate_report(self):
        """Generate validation report.

        Prints errors, warnings and passed checks to stdout, followed by an
        overall status line.

        Returns:
            ``False`` if any error was recorded, otherwise ``True`` (warnings
            alone do not fail the validation).
        """
        print("\n" + "="*70)
        print("VALIDATION REPORT")
        print("="*70)

        if self.errors:
            print(f"\n❌ ERRORS ({len(self.errors)}):")
            for error in self.errors:
                print(f" {error}")

        if self.warnings:
            print(f"\n⚠️ WARNINGS ({len(self.warnings)}):")
            for warning in self.warnings:
                print(f" {warning}")

        if self.info:
            print(f"\n✓ PASSED CHECKS ({len(self.info)}):")
            for info in self.info:
                print(f" {info}")

        # Overall status
        print("\n" + "="*70)
        if self.errors:
            print("STATUS: ❌ VALIDATION FAILED - Address errors before distribution")
            return False
        elif self.warnings:
            print("STATUS: ⚠️ VALIDATION PASSED WITH WARNINGS - Review recommended")
            return True
        else:
            print("STATUS: ✓ VALIDATION PASSED - Document meets quality standards")
            return True

    def save_report(self, output_file):
        """Save validation report to file.

        Args:
            output_file: Destination path; the file is overwritten.
        """
        # Explicit encoding so the result does not depend on the platform
        # default (the document path may contain non-ASCII characters).
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("CLINICAL DECISION SUPPORT DOCUMENT VALIDATION REPORT\n")
            f.write("="*70 + "\n")
            f.write(f"Document: {self.filepath}\n")
            # NOTE(review): this writes the current working directory under
            # the label "Validated" - possibly a timestamp was intended.
            f.write(f"Validated: {Path.cwd()}\n\n")

            if self.errors:
                f.write(f"ERRORS ({len(self.errors)}):\n")
                for error in self.errors:
                    f.write(f" - {error}\n")
                f.write("\n")

            if self.warnings:
                f.write(f"WARNINGS ({len(self.warnings)}):\n")
                for warning in self.warnings:
                    f.write(f" - {warning}\n")
                f.write("\n")

            if self.info:
                f.write(f"PASSED CHECKS ({len(self.info)}):\n")
                for info in self.info:
                    f.write(f" - {info}\n")

        print(f"\nValidation report saved to: {output_file}")


def main():
    """Command-line entry point.

    Exits with status 1 when errors were found (or, with ``--strict``, when
    warnings were found), status 0 otherwise. An unreadable input file is
    reported through argparse, which exits with status 2.
    """
    parser = argparse.ArgumentParser(description='Validate clinical decision support documents')
    parser.add_argument('input_file', type=str, help='Document to validate (.tex, .md, .txt)')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='Save validation report to file')
    parser.add_argument('--strict', action='store_true',
                        help='Treat warnings as errors')

    args = parser.parse_args()

    # Validate
    try:
        validator = CDSValidator(args.input_file)
    except OSError as exc:
        # parser.error() prints usage plus the message and exits with status 2.
        parser.error(f"cannot read {args.input_file}: {exc}")
    passed = validator.validate_all()

    # Save report if requested
    if args.output:
        validator.save_report(args.output)

    # Exit code: `passed` is False exactly when errors were recorded.
    failed = (not passed) or (args.strict and bool(validator.warnings))
    raise SystemExit(1 if failed else 0)


if __name__ == '__main__':
    main()


# Example usage:
# python validate_cds_document.py cohort_analysis.tex
# python validate_cds_document.py treatment_recommendations.tex -o validation_report.txt
# python validate_cds_document.py document.tex --strict  # Warnings cause failure