Initial commit
This commit is contained in:
335
skills/clinical-decision-support/scripts/validate_cds_document.py
Executable file
335
skills/clinical-decision-support/scripts/validate_cds_document.py
Executable file
@@ -0,0 +1,335 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate Clinical Decision Support Documents for Quality and Completeness
|
||||
|
||||
Checks for:
|
||||
- Evidence citations for all recommendations
|
||||
- Statistical reporting completeness
|
||||
- Biomarker nomenclature consistency
|
||||
- Required sections present
|
||||
- HIPAA de-identification
|
||||
- GRADE recommendation format
|
||||
|
||||
Dependencies: None (pure Python)
|
||||
"""
|
||||
|
||||
import re
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
class CDSValidator:
|
||||
"""Validator for clinical decision support documents."""
|
||||
|
||||
def __init__(self, filepath):
|
||||
self.filepath = filepath
|
||||
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
self.content = f.read()
|
||||
|
||||
self.errors = []
|
||||
self.warnings = []
|
||||
self.info = []
|
||||
|
||||
def validate_all(self):
|
||||
"""Run all validation checks."""
|
||||
|
||||
print(f"Validating: {self.filepath}")
|
||||
print("="*70)
|
||||
|
||||
self.check_required_sections()
|
||||
self.check_evidence_citations()
|
||||
self.check_recommendation_grading()
|
||||
self.check_statistical_reporting()
|
||||
self.check_hipaa_identifiers()
|
||||
self.check_biomarker_nomenclature()
|
||||
|
||||
return self.generate_report()
|
||||
|
||||
def check_required_sections(self):
|
||||
"""Check if required sections are present."""
|
||||
|
||||
# Cohort analysis required sections
|
||||
cohort_sections = [
|
||||
'cohort characteristics',
|
||||
'biomarker',
|
||||
'outcomes',
|
||||
'statistical analysis',
|
||||
'clinical implications',
|
||||
'references'
|
||||
]
|
||||
|
||||
# Treatment recommendation required sections
|
||||
rec_sections = [
|
||||
'evidence',
|
||||
'recommendation',
|
||||
'monitoring',
|
||||
'references'
|
||||
]
|
||||
|
||||
content_lower = self.content.lower()
|
||||
|
||||
# Check which document type
|
||||
is_cohort = 'cohort' in content_lower
|
||||
is_recommendation = 'recommendation' in content_lower
|
||||
|
||||
if is_cohort:
|
||||
missing = [sec for sec in cohort_sections if sec not in content_lower]
|
||||
if missing:
|
||||
self.warnings.append(f"Cohort analysis may be missing sections: {', '.join(missing)}")
|
||||
else:
|
||||
self.info.append("All cohort analysis sections present")
|
||||
|
||||
if is_recommendation:
|
||||
missing = [sec for sec in rec_sections if sec not in content_lower]
|
||||
if missing:
|
||||
self.errors.append(f"Recommendation document missing required sections: {', '.join(missing)}")
|
||||
else:
|
||||
self.info.append("All recommendation sections present")
|
||||
|
||||
def check_evidence_citations(self):
|
||||
"""Check that recommendations have citations."""
|
||||
|
||||
# Find recommendation statements
|
||||
rec_pattern = r'(recommend|should|prefer|suggest|consider)(.*?)(?:\n\n|\Z)'
|
||||
recommendations = re.findall(rec_pattern, self.content, re.IGNORECASE | re.DOTALL)
|
||||
|
||||
# Find citations
|
||||
citation_patterns = [
|
||||
r'\[\d+\]', # Numbered citations [1]
|
||||
r'\(.*?\d{4}\)', # Author year (Smith 2020)
|
||||
r'et al\.', # Et al citations
|
||||
r'NCCN|ASCO|ESMO', # Guideline references
|
||||
]
|
||||
|
||||
uncited_recommendations = []
|
||||
|
||||
for i, (_, rec_text) in enumerate(recommendations):
|
||||
has_citation = any(re.search(pattern, rec_text) for pattern in citation_patterns)
|
||||
|
||||
if not has_citation:
|
||||
snippet = rec_text[:60].strip() + '...'
|
||||
uncited_recommendations.append(snippet)
|
||||
|
||||
if uncited_recommendations:
|
||||
self.warnings.append(f"Found {len(uncited_recommendations)} recommendations without citations")
|
||||
for rec in uncited_recommendations[:3]: # Show first 3
|
||||
self.warnings.append(f" - {rec}")
|
||||
else:
|
||||
self.info.append(f"All {len(recommendations)} recommendations have citations")
|
||||
|
||||
def check_recommendation_grading(self):
|
||||
"""Check for GRADE-style recommendation strength."""
|
||||
|
||||
# Look for GRADE notation (1A, 1B, 2A, 2B, 2C)
|
||||
grade_pattern = r'GRADE\s*[12][A-C]|Grade\s*[12][A-C]|\(?\s*[12][A-C]\s*\)?'
|
||||
grades = re.findall(grade_pattern, self.content, re.IGNORECASE)
|
||||
|
||||
# Look for strong/conditional language
|
||||
strong_pattern = r'(strong|we recommend|should)'
|
||||
conditional_pattern = r'(conditional|weak|we suggest|may consider|could consider)'
|
||||
|
||||
strong_count = len(re.findall(strong_pattern, self.content, re.IGNORECASE))
|
||||
conditional_count = len(re.findall(conditional_pattern, self.content, re.IGNORECASE))
|
||||
|
||||
if grades:
|
||||
self.info.append(f"Found {len(grades)} GRADE-style recommendations")
|
||||
else:
|
||||
self.warnings.append("No GRADE-style recommendation grading found (1A, 1B, 2A, etc.)")
|
||||
|
||||
if strong_count > 0 or conditional_count > 0:
|
||||
self.info.append(f"Recommendation language: {strong_count} strong, {conditional_count} conditional")
|
||||
else:
|
||||
self.warnings.append("No clear recommendation strength language (strong/conditional) found")
|
||||
|
||||
def check_statistical_reporting(self):
|
||||
"""Check for proper statistical reporting."""
|
||||
|
||||
# Check for p-values
|
||||
p_values = re.findall(r'p\s*[=<>]\s*[\d.]+', self.content, re.IGNORECASE)
|
||||
|
||||
# Check for confidence intervals
|
||||
ci_pattern = r'95%\s*CI|confidence interval'
|
||||
cis = re.findall(ci_pattern, self.content, re.IGNORECASE)
|
||||
|
||||
# Check for hazard ratios
|
||||
hr_pattern = r'HR\s*[=:]\s*[\d.]+'
|
||||
hrs = re.findall(hr_pattern, self.content)
|
||||
|
||||
# Check for sample sizes
|
||||
n_pattern = r'n\s*=\s*\d+'
|
||||
sample_sizes = re.findall(n_pattern, self.content, re.IGNORECASE)
|
||||
|
||||
if not p_values:
|
||||
self.warnings.append("No p-values found - statistical significance not reported")
|
||||
else:
|
||||
self.info.append(f"Found {len(p_values)} p-values")
|
||||
|
||||
if hrs and not cis:
|
||||
self.warnings.append("Hazard ratios reported without confidence intervals")
|
||||
|
||||
if not sample_sizes:
|
||||
self.warnings.append("Sample sizes (n=X) not clearly reported")
|
||||
|
||||
# Check for common statistical errors
|
||||
if 'p=0.00' in self.content or 'p = 0.00' in self.content:
|
||||
self.warnings.append("Found p=0.00 (should report as p<0.001 instead)")
|
||||
|
||||
def check_hipaa_identifiers(self):
|
||||
"""Check for potential HIPAA identifiers."""
|
||||
|
||||
# 18 HIPAA identifiers (simplified check for common ones)
|
||||
identifiers = {
|
||||
'Names': r'Dr\.\s+[A-Z][a-z]+|Patient:\s*[A-Z][a-z]+',
|
||||
'Specific dates': r'\d{1,2}/\d{1,2}/\d{4}', # MM/DD/YYYY
|
||||
'Phone numbers': r'\d{3}[-.]?\d{3}[-.]?\d{4}',
|
||||
'Email addresses': r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
|
||||
'SSN': r'\d{3}-\d{2}-\d{4}',
|
||||
'MRN': r'MRN\s*:?\s*\d+',
|
||||
}
|
||||
|
||||
found_identifiers = []
|
||||
|
||||
for identifier_type, pattern in identifiers.items():
|
||||
matches = re.findall(pattern, self.content)
|
||||
if matches:
|
||||
found_identifiers.append(f"{identifier_type}: {len(matches)} instance(s)")
|
||||
|
||||
if found_identifiers:
|
||||
self.errors.append("Potential HIPAA identifiers detected:")
|
||||
for identifier in found_identifiers:
|
||||
self.errors.append(f" - {identifier}")
|
||||
self.errors.append(" ** Ensure proper de-identification before distribution **")
|
||||
else:
|
||||
self.info.append("No obvious HIPAA identifiers detected (basic check only)")
|
||||
|
||||
def check_biomarker_nomenclature(self):
|
||||
"""Check for consistent biomarker nomenclature."""
|
||||
|
||||
# Common biomarker naming issues
|
||||
issues = []
|
||||
|
||||
# Check for gene names (should be italicized in LaTeX)
|
||||
gene_names = ['EGFR', 'ALK', 'ROS1', 'BRAF', 'KRAS', 'HER2', 'TP53', 'BRCA1', 'BRCA2']
|
||||
for gene in gene_names:
|
||||
# Check if gene appears but not in italics (\textit{} or \emph{})
|
||||
if gene in self.content:
|
||||
if f'\\textit{{{gene}}}' not in self.content and f'\\emph{{{gene}}}' not in self.content:
|
||||
if '.tex' in self.filepath.suffix:
|
||||
issues.append(f"{gene} should be italicized in LaTeX (\\textit{{{gene}}})")
|
||||
|
||||
# Check for protein vs gene naming
|
||||
# HER2 (protein) vs ERBB2 (gene) - both valid
|
||||
# Check for mutation nomenclature (HGVS format)
|
||||
hgvs_pattern = r'p\.[A-Z]\d+[A-Z]' # e.g., p.L858R
|
||||
hgvs_mutations = re.findall(hgvs_pattern, self.content)
|
||||
|
||||
if hgvs_mutations:
|
||||
self.info.append(f"Found {len(hgvs_mutations)} HGVS protein nomenclature (e.g., p.L858R)")
|
||||
|
||||
# Warn about non-standard mutation format
|
||||
if 'EGFR mutation' in self.content and 'exon' not in self.content.lower():
|
||||
self.warnings.append("EGFR mutation mentioned - specify exon/variant (e.g., exon 19 deletion)")
|
||||
|
||||
if issues:
|
||||
self.warnings.extend(issues)
|
||||
|
||||
def generate_report(self):
|
||||
"""Generate validation report."""
|
||||
|
||||
print("\n" + "="*70)
|
||||
print("VALIDATION REPORT")
|
||||
print("="*70)
|
||||
|
||||
if self.errors:
|
||||
print(f"\n❌ ERRORS ({len(self.errors)}):")
|
||||
for error in self.errors:
|
||||
print(f" {error}")
|
||||
|
||||
if self.warnings:
|
||||
print(f"\n⚠️ WARNINGS ({len(self.warnings)}):")
|
||||
for warning in self.warnings:
|
||||
print(f" {warning}")
|
||||
|
||||
if self.info:
|
||||
print(f"\n✓ PASSED CHECKS ({len(self.info)}):")
|
||||
for info in self.info:
|
||||
print(f" {info}")
|
||||
|
||||
# Overall status
|
||||
print("\n" + "="*70)
|
||||
if self.errors:
|
||||
print("STATUS: ❌ VALIDATION FAILED - Address errors before distribution")
|
||||
return False
|
||||
elif self.warnings:
|
||||
print("STATUS: ⚠️ VALIDATION PASSED WITH WARNINGS - Review recommended")
|
||||
return True
|
||||
else:
|
||||
print("STATUS: ✓ VALIDATION PASSED - Document meets quality standards")
|
||||
return True
|
||||
|
||||
def save_report(self, output_file):
|
||||
"""Save validation report to file."""
|
||||
|
||||
with open(output_file, 'w') as f:
|
||||
f.write("CLINICAL DECISION SUPPORT DOCUMENT VALIDATION REPORT\n")
|
||||
f.write("="*70 + "\n")
|
||||
f.write(f"Document: {self.filepath}\n")
|
||||
f.write(f"Validated: {Path.cwd()}\n\n")
|
||||
|
||||
if self.errors:
|
||||
f.write(f"ERRORS ({len(self.errors)}):\n")
|
||||
for error in self.errors:
|
||||
f.write(f" - {error}\n")
|
||||
f.write("\n")
|
||||
|
||||
if self.warnings:
|
||||
f.write(f"WARNINGS ({len(self.warnings)}):\n")
|
||||
for warning in self.warnings:
|
||||
f.write(f" - {warning}\n")
|
||||
f.write("\n")
|
||||
|
||||
if self.info:
|
||||
f.write(f"PASSED CHECKS ({len(self.info)}):\n")
|
||||
for info in self.info:
|
||||
f.write(f" - {info}\n")
|
||||
|
||||
print(f"\nValidation report saved to: {output_file}")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Validate clinical decision support documents')
|
||||
parser.add_argument('input_file', type=str, help='Document to validate (.tex, .md, .txt)')
|
||||
parser.add_argument('-o', '--output', type=str, default=None,
|
||||
help='Save validation report to file')
|
||||
parser.add_argument('--strict', action='store_true',
|
||||
help='Treat warnings as errors')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Validate
|
||||
validator = CDSValidator(args.input_file)
|
||||
passed = validator.validate_all()
|
||||
|
||||
# Save report if requested
|
||||
if args.output:
|
||||
validator.save_report(args.output)
|
||||
|
||||
# Exit code
|
||||
if args.strict and (validator.errors or validator.warnings):
|
||||
exit(1)
|
||||
elif validator.errors:
|
||||
exit(1)
|
||||
else:
|
||||
exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
|
||||
# Example usage:
|
||||
# python validate_cds_document.py cohort_analysis.tex
|
||||
# python validate_cds_document.py treatment_recommendations.tex -o validation_report.txt
|
||||
# python validate_cds_document.py document.tex --strict # Warnings cause failure
|
||||
|
||||
Reference in New Issue
Block a user