Initial commit

Zhongwei Li
2025-11-30 08:30:14 +08:00
commit 1dd5bee3b4
335 changed files with 147360 additions and 0 deletions


@@ -0,0 +1,384 @@
#!/usr/bin/env python3
"""
Biomarker-Based Patient Stratification and Classification
Performs patient stratification based on biomarker profiles with:
- Binary classification (biomarker+/-)
- Multi-class molecular subtypes
- Continuous biomarker scoring
- Correlation with clinical outcomes
Dependencies: pandas, numpy, scipy, scikit-learn (optional for clustering)
"""
import pandas as pd
import numpy as np
from scipy import stats
import argparse
from pathlib import Path
def classify_binary_biomarker(data, biomarker_col, threshold,
above_label='Biomarker+', below_label='Biomarker-'):
"""
Binary classification based on biomarker threshold.
Parameters:
data: DataFrame
biomarker_col: Column name for biomarker values
threshold: Cut-point value
above_label: Label for values >= threshold
below_label: Label for values < threshold
Returns:
DataFrame with added 'biomarker_class' column
"""
data = data.copy()
data['biomarker_class'] = data[biomarker_col].apply(
lambda x: above_label if x >= threshold else below_label
)
return data
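# A minimal usage sketch (the column name and threshold are illustrative):
#   df = pd.DataFrame({'tmb': [4.2, 12.7, 9.9]})
#   df = classify_binary_biomarker(df, 'tmb', threshold=10,
#                                  above_label='TMB-high', below_label='TMB-low')
#   df['biomarker_class']  # -> TMB-low, TMB-high, TMB-low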
def classify_pd_l1_tps(data, pd_l1_col='pd_l1_tps'):
"""
Classify PD-L1 Tumor Proportion Score into clinical categories.
Categories:
- Negative: <1%
- Low: 1-49%
- High: >=50%
Returns:
DataFrame with 'pd_l1_category' column
"""
data = data.copy()
def categorize(tps):
if pd.isna(tps):
return 'PD-L1 Unknown'
if tps < 1:
return 'PD-L1 Negative (<1%)'
elif tps < 50:
return 'PD-L1 Low (1-49%)'
else:
return 'PD-L1 High (≥50%)'
data['pd_l1_category'] = data[pd_l1_col].apply(categorize)
# Distribution
print("\nPD-L1 TPS Distribution:")
print(data['pd_l1_category'].value_counts())
return data
def classify_her2_status(data, ihc_col='her2_ihc', fish_col='her2_fish'):
"""
Classify HER2 status based on IHC and FISH results (ASCO/CAP guidelines).
IHC Scores: 0, 1+, 2+, 3+
FISH: Positive, Negative (if IHC 2+)
Classification:
- HER2-positive: IHC 3+ OR IHC 2+/FISH+
- HER2-negative: IHC 0/1+ OR IHC 2+/FISH-
- HER2-low: IHC 1+ or IHC 2+/FISH- (subset of HER2-negative)
Returns:
DataFrame with 'her2_status' and 'her2_low' columns
"""
data = data.copy()
def classify_her2(row):
ihc = row[ihc_col]
fish = row.get(fish_col, None)
if ihc == '3+':
status = 'HER2-positive'
her2_low = False
elif ihc == '2+':
if fish == 'Positive':
status = 'HER2-positive'
her2_low = False
elif fish == 'Negative':
status = 'HER2-negative'
her2_low = True # HER2-low
else:
status = 'HER2-equivocal (FISH needed)'
her2_low = False
elif ihc == '1+':
status = 'HER2-negative'
her2_low = True # HER2-low
else: # IHC 0
status = 'HER2-negative'
her2_low = False
return pd.Series({'her2_status': status, 'her2_low': her2_low})
data[['her2_status', 'her2_low']] = data.apply(classify_her2, axis=1)
print("\nHER2 Status Distribution:")
print(data['her2_status'].value_counts())
print(f"\nHER2-low (IHC 1+ or 2+/FISH-): {data['her2_low'].sum()} patients")
return data
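# Input sketch showing the value conventions this function assumes:
#   df = pd.DataFrame({'her2_ihc': ['3+', '2+', '2+', '1+', '0'],
#                      'her2_fish': [np.nan, 'Positive', 'Negative', np.nan, np.nan]})
#   df = classify_her2_status(df)
#   # -> HER2-positive, HER2-positive, HER2-negative (HER2-low),
#   #    HER2-negative (HER2-low), HER2-negative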
def classify_breast_cancer_subtype(data, er_col='er_positive', pr_col='pr_positive',
her2_col='her2_positive'):
"""
Classify breast cancer into molecular subtypes.
Subtypes:
- HR+/HER2-: Luminal (ER+ and/or PR+, HER2-)
- HER2+: Any HER2-positive (regardless of HR status)
- Triple-negative: ER-, PR-, HER2-
Returns:
DataFrame with 'bc_subtype' column
"""
data = data.copy()
def get_subtype(row):
er = row[er_col]
pr = row[pr_col]
her2 = row[her2_col]
if her2:
if er or pr:
return 'HR+/HER2+ (Luminal B HER2+)'
else:
return 'HR-/HER2+ (HER2-enriched)'
elif er or pr:
return 'HR+/HER2- (Luminal)'
else:
return 'Triple-Negative'
data['bc_subtype'] = data.apply(get_subtype, axis=1)
print("\nBreast Cancer Subtype Distribution:")
print(data['bc_subtype'].value_counts())
return data
def correlate_biomarker_outcome(data, biomarker_col, outcome_col, biomarker_type='binary'):
"""
Assess correlation between biomarker and clinical outcome.
Parameters:
biomarker_col: Biomarker variable
outcome_col: Outcome variable
biomarker_type: 'binary', 'categorical', 'continuous'
Returns:
Statistical test results
"""
print(f"\nCorrelation Analysis: {biomarker_col} vs {outcome_col}")
print("="*60)
# Remove missing data
analysis_data = data[[biomarker_col, outcome_col]].dropna()
if biomarker_type == 'binary' or biomarker_type == 'categorical':
# Cross-tabulation
contingency = pd.crosstab(analysis_data[biomarker_col], analysis_data[outcome_col])
print("\nContingency Table:")
print(contingency)
# Chi-square test
chi2, p_value, dof, expected = stats.chi2_contingency(contingency)
print(f"\nChi-square test:")
print(f" χ² = {chi2:.2f}, df = {dof}, p = {p_value:.4f}")
# Odds ratio if 2x2 table
if contingency.shape == (2, 2):
a, b = contingency.iloc[0, :]
c, d = contingency.iloc[1, :]
# Haldane-Anscombe correction: add 0.5 to every cell if any cell is zero,
# so the odds ratio and its log-scale CI stay defined
if min(a, b, c, d) == 0:
a, b, c, d = a + 0.5, b + 0.5, c + 0.5, d + 0.5
or_value = (a * d) / (b * c)
# Confidence interval for OR (Woolf log method)
log_or = np.log(or_value)
se_log_or = np.sqrt(1/a + 1/b + 1/c + 1/d)
ci_lower = np.exp(log_or - 1.96 * se_log_or)
ci_upper = np.exp(log_or + 1.96 * se_log_or)
print(f"\nOdds Ratio: {or_value:.2f} (95% CI {ci_lower:.2f}-{ci_upper:.2f})")
elif biomarker_type == 'continuous':
# Correlation coefficient
r, p_value = stats.pearsonr(analysis_data[biomarker_col], analysis_data[outcome_col])
print(f"\nPearson correlation:")
print(f" r = {r:.3f}, p = {p_value:.4f}")
# Also report Spearman for robustness
rho, p_spearman = stats.spearmanr(analysis_data[biomarker_col], analysis_data[outcome_col])
print(f"Spearman correlation:")
print(f" ρ = {rho:.3f}, p = {p_spearman:.4f}")
return p_value
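# Worked example of the 2x2 odds-ratio arithmetic above (illustrative counts):
# with a=20, b=10, c=5, d=15: OR = (20*15)/(10*5) = 6.0;
# SE(log OR) = sqrt(1/20 + 1/10 + 1/5 + 1/15) ~= 0.646, so the 95% CI is
# exp(ln(6) +/- 1.96*0.646) ~= 1.7-21.3 (wide, as expected for small cells).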
def stratify_cohort_report(data, stratification_var, output_dir='stratification_report'):
"""
Generate comprehensive stratification report.
Parameters:
data: DataFrame with patient data
stratification_var: Column name for stratification
output_dir: Output directory for reports
"""
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
print(f"\nCOHORT STRATIFICATION REPORT")
print("="*60)
print(f"Stratification Variable: {stratification_var}")
print(f"Total Patients: {len(data)}")
# Group distribution
distribution = data[stratification_var].value_counts()
print(f"\nGroup Distribution:")
for group, count in distribution.items():
pct = count / len(data) * 100
print(f" {group}: {count} ({pct:.1f}%)")
# Save distribution
distribution.to_csv(output_dir / 'group_distribution.csv')
# Compare baseline characteristics across groups
print(f"\nBaseline Characteristics by {stratification_var}:")
results = []
# Continuous variables
continuous_vars = data.select_dtypes(include=[np.number]).columns.tolist()
continuous_vars = [v for v in continuous_vars if v != stratification_var]
for var in continuous_vars[:5]: # Limit to first 5 for demo
print(f"\n{var}:")
for group in distribution.index:
group_data = data[data[stratification_var] == group][var].dropna()
print(f" {group}: median {group_data.median():.1f} [IQR {group_data.quantile(0.25):.1f}-{group_data.quantile(0.75):.1f}]")
# Statistical test
if len(distribution) == 2:
groups_list = distribution.index.tolist()
g1 = data[data[stratification_var] == groups_list[0]][var].dropna()
g2 = data[data[stratification_var] == groups_list[1]][var].dropna()
_, p_value = stats.mannwhitneyu(g1, g2, alternative='two-sided')
print(f" p-value: {p_value:.4f}")
results.append({
'Variable': var,
'Test': 'Mann-Whitney U',
'p_value': p_value,
'Significant': 'Yes' if p_value < 0.05 else 'No'
})
# Save results
if results:
df_results = pd.DataFrame(results)
df_results.to_csv(output_dir / 'statistical_comparisons.csv', index=False)
print(f"\nStatistical comparison results saved to: {output_dir}/statistical_comparisons.csv")
print(f"\nStratification report complete! Files saved to {output_dir}/")
def main():
parser = argparse.ArgumentParser(description='Biomarker-based patient classification')
parser.add_argument('input_file', type=str, nargs='?', default=None,
help='CSV file with patient and biomarker data')
parser.add_argument('-b', '--biomarker', type=str, default=None,
help='Biomarker column name for stratification')
parser.add_argument('-t', '--threshold', type=float, default=None,
help='Threshold for binary classification')
parser.add_argument('-o', '--output-dir', type=str, default='stratification',
help='Output directory')
parser.add_argument('--example', action='store_true',
help='Run with example data')
args = parser.parse_args()
# Example data if requested
if args.example or args.input_file is None:
print("Generating example dataset...")
np.random.seed(42)
n = 80
data = pd.DataFrame({
'patient_id': [f'PT{i:03d}' for i in range(1, n+1)],
'age': np.random.normal(62, 10, n),
'sex': np.random.choice(['Male', 'Female'], n),
'pd_l1_tps': np.random.exponential(20, n), # Exponential distribution for PD-L1
'tmb': np.random.exponential(8, n), # Mutations per Mb
'her2_ihc': np.random.choice(['0', '1+', '2+', '3+'], n, p=[0.6, 0.2, 0.15, 0.05]),
'response': np.random.choice(['Yes', 'No'], n, p=[0.4, 0.6]),
})
# Simulate correlation: higher PD-L1 -> better response
data.loc[data['pd_l1_tps'] >= 50, 'response'] = np.random.choice(['Yes', 'No'],
(data['pd_l1_tps'] >= 50).sum(),
p=[0.65, 0.35])
else:
print(f"Loading data from {args.input_file}...")
data = pd.read_csv(args.input_file)
print(f"Dataset: {len(data)} patients")
print(f"Columns: {list(data.columns)}")
# PD-L1 classification example
if 'pd_l1_tps' in data.columns:
data = classify_pd_l1_tps(data, 'pd_l1_tps')
# Correlate with response if available
if 'response' in data.columns:
correlate_biomarker_outcome(data, 'pd_l1_category', 'response', biomarker_type='categorical')
# HER2 classification if columns present
if 'her2_ihc' in data.columns:
if 'her2_fish' not in data.columns:
# Add placeholder FISH for IHC 2+
data['her2_fish'] = np.nan
data = classify_her2_status(data, 'her2_ihc', 'her2_fish')
# Generic binary classification if threshold provided
if args.biomarker and args.threshold is not None:
print(f"\nBinary classification: {args.biomarker} with threshold {args.threshold}")
data = classify_binary_biomarker(data, args.biomarker, args.threshold)
print(data['biomarker_class'].value_counts())
# Generate stratification report
if args.biomarker:
stratify_cohort_report(data, args.biomarker, output_dir=args.output_dir)
elif 'pd_l1_category' in data.columns:
stratify_cohort_report(data, 'pd_l1_category', output_dir=args.output_dir)
# Save classified data
output_path = Path(args.output_dir) / 'classified_data.csv'
data.to_csv(output_path, index=False)
print(f"\nClassified data saved to: {output_path}")
if __name__ == '__main__':
main()
# Example usage:
# python biomarker_classifier.py data.csv -b pd_l1_tps -t 50 -o classification/
# python biomarker_classifier.py --example
#
# Input CSV format:
# patient_id,pd_l1_tps,tmb,her2_ihc,response,pfs_months,event
# PT001,55.5,12.3,1+,Yes,14.2,1
# PT002,8.2,5.1,0,No,6.5,1
# ...


@@ -0,0 +1,447 @@
#!/usr/bin/env python3
"""
Build Clinical Decision Tree Flowcharts in TikZ Format
Generates LaTeX/TikZ code for clinical decision algorithms from
simple text or YAML descriptions.
Dependencies: pyyaml (optional, for YAML input)
"""
import argparse
from pathlib import Path
import json
class DecisionNode:
"""Represents a decision point in the clinical algorithm."""
def __init__(self, question, yes_path=None, no_path=None, node_id=None):
self.question = question
self.yes_path = yes_path
self.no_path = no_path
self.node_id = node_id or self._generate_id(question)
def _generate_id(self, text):
"""Generate clean node ID from text."""
return ''.join(c for c in text if c.isalnum())[:15].lower()
class ActionNode:
"""Represents an action/outcome in the clinical algorithm."""
def __init__(self, action, urgency='routine', node_id=None):
self.action = action
self.urgency = urgency # 'urgent', 'semiurgent', 'routine'
self.node_id = node_id or self._generate_id(action)
def _generate_id(self, text):
return ''.join(c for c in text if c.isalnum())[:15].lower()
def generate_tikz_header():
"""Generate TikZ preamble with style definitions."""
tikz = """\\documentclass[10pt]{article}
\\usepackage[margin=0.5in, landscape]{geometry}
\\usepackage{tikz}
\\usetikzlibrary{shapes,arrows,positioning}
\\usepackage{xcolor}
% Color definitions
\\definecolor{urgentred}{RGB}{220,20,60}
\\definecolor{actiongreen}{RGB}{0,153,76}
\\definecolor{decisionyellow}{RGB}{255,193,7}
\\definecolor{routineblue}{RGB}{100,181,246}
\\definecolor{headerblue}{RGB}{0,102,204}
% TikZ styles
\\tikzstyle{startstop} = [rectangle, rounded corners=8pt, minimum width=3cm, minimum height=1cm,
text centered, draw=black, fill=headerblue!20, font=\\small\\bfseries]
\\tikzstyle{decision} = [diamond, minimum width=3cm, minimum height=1.2cm, text centered,
draw=black, fill=decisionyellow!40, font=\\small, aspect=2, inner sep=0pt,
text width=3.5cm]
\\tikzstyle{process} = [rectangle, rounded corners=4pt, minimum width=3.5cm, minimum height=0.9cm,
text centered, draw=black, fill=actiongreen!20, font=\\small]
\\tikzstyle{urgent} = [rectangle, rounded corners=4pt, minimum width=3.5cm, minimum height=0.9cm,
text centered, draw=urgentred, line width=1.5pt, fill=urgentred!15,
font=\\small\\bfseries]
\\tikzstyle{routine} = [rectangle, rounded corners=4pt, minimum width=3.5cm, minimum height=0.9cm,
text centered, draw=black, fill=routineblue!20, font=\\small]
\\tikzstyle{arrow} = [thick,->,>=stealth]
\\tikzstyle{urgentarrow} = [ultra thick,->,>=stealth,color=urgentred]
\\begin{document}
\\begin{center}
{\\Large\\bfseries Clinical Decision Algorithm}\\\\[10pt]
{\\large [TITLE TO BE SPECIFIED]}
\\end{center}
\\vspace{10pt}
\\begin{tikzpicture}[node distance=2.2cm and 3.5cm, auto]
"""
return tikz
def generate_tikz_footer():
"""Generate TikZ closing code."""
tikz = """
\\end{tikzpicture}
\\end{document}
"""
return tikz
def simple_algorithm_to_tikz(algorithm_text, output_file='algorithm.tex'):
"""
Convert simple text-based algorithm to TikZ flowchart.
Input format (simple question-action pairs):
START: Chief complaint
Q1: High-risk criteria present? -> YES: Immediate action (URGENT) | NO: Continue
Q2: Risk score >= 3? -> YES: Admit ICU | NO: Outpatient management (ROUTINE)
END: Final outcome
Parameters:
algorithm_text: Multi-line string with algorithm
output_file: Path to save .tex file
"""
tikz_code = generate_tikz_header()
# Parse algorithm text
lines = [line.strip() for line in algorithm_text.strip().split('\n') if line.strip()]
node_defs = []
arrow_defs = []
previous_node = None
node_counter = 0
for line in lines:
if line.startswith('START:'):
# Start node
text = line.replace('START:', '').strip()
node_id = 'start'
node_defs.append(f"\\node [startstop] ({node_id}) {{{text}}};")
previous_node = node_id
node_counter += 1
elif line.startswith('END:'):
# End node
text = line.replace('END:', '').strip()
node_id = 'end'
# Position relative to previous
if previous_node:
node_defs.append(f"\\node [startstop, below=of {previous_node}] ({node_id}) {{{text}}};")
arrow_defs.append(f"\\draw [arrow] ({previous_node}) -- ({node_id});")
elif line.startswith('Q'):
# Decision node
parts = line.split(':', 1)
if len(parts) < 2:
continue
question_part = parts[1].split('->')[0].strip()
node_id = f'q{node_counter}'
# Add decision node
if previous_node:
node_defs.append(f"\\node [decision, below=of {previous_node}] ({node_id}) {{{question_part}}};")
arrow_defs.append(f"\\draw [arrow] ({previous_node}) -- ({node_id});")
else:
node_defs.append(f"\\node [decision] ({node_id}) {{{question_part}}};")
# Parse YES and NO branches
if '->' in line:
branches = line.split('->')[1].split('|')
for branch in branches:
branch = branch.strip()
if branch.startswith('YES:'):
yes_action = branch.replace('YES:', '').strip()
yes_id = f'yes{node_counter}'
# Check urgency
if '(URGENT)' in yes_action:
style = 'urgent'
yes_action = yes_action.replace('(URGENT)', '').strip()
arrow_style = 'urgentarrow'
elif '(ROUTINE)' in yes_action:
style = 'routine'
yes_action = yes_action.replace('(ROUTINE)', '').strip()
arrow_style = 'arrow'
else:
style = 'process'
arrow_style = 'arrow'
node_defs.append(f"\\node [{style}, left=of {node_id}] ({yes_id}) {{{yes_action}}};")
arrow_defs.append(f"\\draw [{arrow_style}] ({node_id}) -- node[above] {{Yes}} ({yes_id});")
elif branch.startswith('NO:'):
no_action = branch.replace('NO:', '').strip()
no_id = f'no{node_counter}'
# Check urgency
if '(URGENT)' in no_action:
style = 'urgent'
no_action = no_action.replace('(URGENT)', '').strip()
arrow_style = 'urgentarrow'
elif '(ROUTINE)' in no_action:
style = 'routine'
no_action = no_action.replace('(ROUTINE)', '').strip()
arrow_style = 'arrow'
else:
style = 'process'
arrow_style = 'arrow'
node_defs.append(f"\\node [{style}, right=of {node_id}] ({no_id}) {{{no_action}}};")
arrow_defs.append(f"\\draw [{arrow_style}] ({node_id}) -- node[above] {{No}} ({no_id});")
previous_node = node_id
node_counter += 1
# Add all nodes and arrows to TikZ
tikz_code += '\n'.join(node_defs) + '\n\n'
tikz_code += '% Arrows\n'
tikz_code += '\n'.join(arrow_defs) + '\n'
tikz_code += generate_tikz_footer()
# Save to file
with open(output_file, 'w') as f:
f.write(tikz_code)
print(f"TikZ flowchart saved to: {output_file}")
print(f"Compile with: pdflatex {output_file}")
return tikz_code
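# A minimal sketch of driving the converter from code rather than the CLI
# (the algorithm string is illustrative):
#   algo = ("START: Suspected sepsis\n"
#           "Q1: qSOFA >= 2? -> YES: Start sepsis bundle (URGENT) | NO: Continue\n"
#           "END: Reassess within 1 hour")
#   simple_algorithm_to_tikz(algo, output_file='sepsis_algorithm.tex')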
def json_to_tikz(json_file, output_file='algorithm.tex'):
"""
Convert JSON decision tree specification to TikZ flowchart.
JSON format:
{
"title": "Algorithm Title",
"nodes": {
"start": {"type": "start", "text": "Patient presentation"},
"q1": {"type": "decision", "text": "Criteria met?", "yes": "action1", "no": "q2"},
"action1": {"type": "action", "text": "Immediate intervention", "urgency": "urgent"},
"q2": {"type": "decision", "text": "Score >= 3?", "yes": "action2", "no": "action3"},
"action2": {"type": "action", "text": "Admit ICU"},
"action3": {"type": "action", "text": "Outpatient", "urgency": "routine"}
},
"start_node": "start"
}
"""
with open(json_file, 'r') as f:
spec = json.load(f)
tikz_code = generate_tikz_header()
# Replace title
title = spec.get('title', 'Clinical Decision Algorithm')
tikz_code = tikz_code.replace('[TITLE TO BE SPECIFIED]', title)
nodes = spec['nodes']
start_node = spec.get('start_node', 'start')
# Generate nodes (simplified layout - vertical)
node_defs = []
arrow_defs = []
def add_node(node_id, position_rel=None):
"""Recursively add nodes."""
if node_id not in nodes:
return
node = nodes[node_id]
node_type = node['type']
text = node['text']
# Determine TikZ style
if node_type == 'start' or node_type == 'end':
style = 'startstop'
elif node_type == 'decision':
style = 'decision'
elif node_type == 'action':
urgency = node.get('urgency', 'normal')
if urgency == 'urgent':
style = 'urgent'
elif urgency == 'routine':
style = 'routine'
else:
style = 'process'
else:
style = 'process'
# Position node
if position_rel:
node_def = f"\\node [{style}, {position_rel}] ({node_id}) {{{text}}};"
else:
node_def = f"\\node [{style}] ({node_id}) {{{text}}};"
node_defs.append(node_def)
# Add arrows for decision nodes
if node_type == 'decision':
yes_target = node.get('yes')
no_target = node.get('no')
if yes_target:
# Determine arrow style based on target urgency
target_node = nodes.get(yes_target, {})
arrow_style = 'urgentarrow' if target_node.get('urgency') == 'urgent' else 'arrow'
arrow_defs.append(f"\\draw [{arrow_style}] ({node_id}) -| node[near start, above] {{Yes}} ({yes_target});")
if no_target:
target_node = nodes.get(no_target, {})
arrow_style = 'urgentarrow' if target_node.get('urgency') == 'urgent' else 'arrow'
arrow_defs.append(f"\\draw [{arrow_style}] ({node_id}) -| node[near start, above] {{No}} ({no_target});")
# Simple layout: chain nodes vertically so they do not overlap at the origin
# (manual positioning in the JSON works better for complex trees)
previous = None
for node_id in nodes.keys():
add_node(node_id, position_rel=f'below=of {previous}' if previous else None)
previous = node_id
tikz_code += '\n'.join(node_defs) + '\n\n'
tikz_code += '% Arrows\n'
tikz_code += '\n'.join(arrow_defs) + '\n'
tikz_code += generate_tikz_footer()
# Save
with open(output_file, 'w') as f:
f.write(tikz_code)
print(f"TikZ flowchart saved to: {output_file}")
return tikz_code
def create_example_json():
"""Create example JSON specification for testing."""
example = {
"title": "Acute Chest Pain Management Algorithm",
"nodes": {
"start": {
"type": "start",
"text": "Patient with\\nchest pain"
},
"q1": {
"type": "decision",
"text": "STEMI\\ncriteria?",
"yes": "stemi_action",
"no": "q2"
},
"stemi_action": {
"type": "action",
"text": "Activate cath lab\\nAspirin, heparin\\nPrimary PCI",
"urgency": "urgent"
},
"q2": {
"type": "decision",
"text": "High-risk\\nfeatures?",
"yes": "admit",
"no": "q3"
},
"admit": {
"type": "action",
"text": "Admit CCU\\nSerial troponins\\nEarly angiography"
},
"q3": {
"type": "decision",
"text": "TIMI\\nscore 0-1?",
"yes": "lowrisk",
"no": "moderate"
},
"lowrisk": {
"type": "action",
"text": "Observe 6-12h\\nStress test\\nOutpatient f/u",
"urgency": "routine"
},
"moderate": {
"type": "action",
"text": "Admit telemetry\\nMedical management\\nRisk stratification"
}
},
"start_node": "start"
}
return example
def main():
parser = argparse.ArgumentParser(description='Build clinical decision tree flowcharts')
parser.add_argument('-i', '--input', type=str, default=None,
help='Input file (JSON format)')
parser.add_argument('-o', '--output', type=str, default='clinical_algorithm.tex',
help='Output .tex file')
parser.add_argument('--example', action='store_true',
help='Generate example algorithm')
parser.add_argument('--text', type=str, default=None,
help='Simple text algorithm (see format in docstring)')
args = parser.parse_args()
if args.example:
print("Generating example algorithm...")
example_spec = create_example_json()
# Save example JSON
with open('example_algorithm.json', 'w') as f:
json.dump(example_spec, f, indent=2)
print("Example JSON saved to: example_algorithm.json")
# Generate TikZ from example
json_to_tikz('example_algorithm.json', args.output)
elif args.text:
print("Generating algorithm from text...")
simple_algorithm_to_tikz(args.text, args.output)
elif args.input:
print(f"Generating algorithm from {args.input}...")
if args.input.endswith('.json'):
json_to_tikz(args.input, args.output)
else:
with open(args.input, 'r') as f:
text = f.read()
simple_algorithm_to_tikz(text, args.output)
else:
print("No input provided. Use --example to generate example, --text for simple text, or -i for JSON input.")
print("\nSimple text format:")
print("START: Patient presentation")
print("Q1: Criteria met? -> YES: Action (URGENT) | NO: Continue")
print("Q2: Score >= 3? -> YES: Admit | NO: Outpatient (ROUTINE)")
print("END: Follow-up")
if __name__ == '__main__':
main()
# Example usage:
# python build_decision_tree.py --example
# python build_decision_tree.py -i algorithm_spec.json -o my_algorithm.tex
#
# Then compile:
# pdflatex clinical_algorithm.tex


@@ -0,0 +1,524 @@
#!/usr/bin/env python3
"""
Generate Clinical Cohort Tables for Baseline Characteristics and Outcomes
Creates publication-ready tables with:
- Baseline demographics (Table 1 style)
- Efficacy outcomes
- Safety/adverse events
- Statistical comparisons between groups
Dependencies: pandas, numpy, scipy
"""
import pandas as pd
import numpy as np
from scipy import stats
from pathlib import Path
import argparse
def calculate_p_value(data, variable, group_col='group', var_type='categorical'):
"""
Calculate appropriate p-value for group comparison.
Parameters:
data: DataFrame
variable: Column name to compare
group_col: Grouping variable
var_type: 'categorical', 'continuous_normal', 'continuous_nonnormal'
Returns:
p-value (float)
"""
groups = data[group_col].unique()
if len(groups) != 2:
return np.nan # Only handle 2-group comparisons
group1_data = data[data[group_col] == groups[0]][variable].dropna()
group2_data = data[data[group_col] == groups[1]][variable].dropna()
if var_type == 'categorical':
# Chi-square or Fisher's exact test
contingency = pd.crosstab(data[variable], data[group_col])
# Use Fisher's exact test when any expected cell count is < 5
expected = stats.chi2_contingency(contingency)[3]
if expected.min() < 5:
# Fisher's exact (2x2 only)
if contingency.shape == (2, 2):
_, p_value = stats.fisher_exact(contingency)
else:
# Use chi-square but note limitation
_, p_value, _, _ = stats.chi2_contingency(contingency)
else:
_, p_value, _, _ = stats.chi2_contingency(contingency)
elif var_type == 'continuous_normal':
# Independent t-test
_, p_value = stats.ttest_ind(group1_data, group2_data, equal_var=False)
elif var_type == 'continuous_nonnormal':
# Mann-Whitney U test
_, p_value = stats.mannwhitneyu(group1_data, group2_data, alternative='two-sided')
else:
raise ValueError("var_type must be 'categorical', 'continuous_normal', or 'continuous_nonnormal'")
return p_value
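# Usage sketch, assuming a DataFrame with an 'arm' grouping column plus
# 'response' (categorical) and 'age' (continuous) columns:
#   p_resp = calculate_p_value(df, 'response', group_col='arm', var_type='categorical')
#   p_age = calculate_p_value(df, 'age', group_col='arm', var_type='continuous_nonnormal')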
def format_continuous_variable(data, variable, group_col, distribution='normal'):
"""
Format continuous variable for table display.
Returns:
Dictionary with formatted strings for each group and p-value
"""
groups = data[group_col].unique()
results = {}
for group in groups:
group_data = data[data[group_col] == group][variable].dropna()
if distribution == 'normal':
# Mean ± SD
mean = group_data.mean()
std = group_data.std()
results[group] = f"{mean:.1f} ± {std:.1f}"
else:
# Median [IQR]
median = group_data.median()
q1 = group_data.quantile(0.25)
q3 = group_data.quantile(0.75)
results[group] = f"{median:.1f} [{q1:.1f}-{q3:.1f}]"
# Calculate p-value
var_type = 'continuous_normal' if distribution == 'normal' else 'continuous_nonnormal'
p_value = calculate_p_value(data, variable, group_col, var_type)
results['p_value'] = f"{p_value:.3f}" if p_value < 0.001 else f"{p_value:.2f}" if p_value < 1.0 else ""
return results
def format_categorical_variable(data, variable, group_col):
"""
Format categorical variable for table display.
Returns:
List of dictionaries for each category with counts and percentages
"""
groups = data[group_col].unique()
categories = data[variable].dropna().unique()
results = []
for category in categories:
row = {'category': category}
for group in groups:
group_data = data[data[group_col] == group]
count = (group_data[variable] == category).sum()
total = group_data[variable].notna().sum()
percentage = (count / total * 100) if total > 0 else 0
row[group] = f"{count} ({percentage:.0f}%)"
results.append(row)
# Calculate p-value for overall categorical variable
p_value = calculate_p_value(data, variable, group_col, 'categorical')
results[0]['p_value'] = f"{p_value:.3f}" if p_value < 0.001 else f"{p_value:.2f}" if p_value < 1.0 else ""
return results
def generate_baseline_table(data, group_col='group', output_file='table1_baseline.csv'):
"""
Generate Table 1: Baseline characteristics.
Customize the variables list for your specific cohort.
"""
groups = data[group_col].unique()
# Initialize results list
table_rows = []
# Header row
header = {
'Characteristic': 'Characteristic',
**{group: f"{group} (n={len(data[data[group_col]==group])})" for group in groups},
'p_value': 'p-value'
}
table_rows.append(header)
# Age (continuous)
if 'age' in data.columns:
age_results = format_continuous_variable(data, 'age', group_col, distribution='nonnormal')
row = {'Characteristic': 'Age, years (median [IQR])'}
for group in groups:
row[group] = age_results[group]
row['p_value'] = age_results['p_value']
table_rows.append(row)
# Sex (categorical)
if 'sex' in data.columns:
table_rows.append({'Characteristic': 'Sex, n (%)', **{g: '' for g in groups}, 'p_value': ''})
sex_results = format_categorical_variable(data, 'sex', group_col)
for sex_row in sex_results:
row = {'Characteristic': f" {sex_row['category']}"}
for group in groups:
row[group] = sex_row[group]
row['p_value'] = sex_row.get('p_value', '')
table_rows.append(row)
# ECOG Performance Status (categorical)
if 'ecog_ps' in data.columns:
table_rows.append({'Characteristic': 'ECOG PS, n (%)', **{g: '' for g in groups}, 'p_value': ''})
ecog_results = format_categorical_variable(data, 'ecog_ps', group_col)
for ecog_row in ecog_results:
row = {'Characteristic': f" {ecog_row['category']}"}
for group in groups:
row[group] = ecog_row[group]
row['p_value'] = ecog_row.get('p_value', '')
table_rows.append(row)
# Convert to DataFrame and save
df_table = pd.DataFrame(table_rows)
df_table.to_csv(output_file, index=False)
print(f"Baseline characteristics table saved to: {output_file}")
return df_table
def generate_efficacy_table(data, group_col='group', output_file='table2_efficacy.csv'):
"""
Generate efficacy outcomes table.
Expected columns:
- best_response: CR, PR, SD, PD
- Additional binary outcomes (response, disease_control, etc.)
"""
groups = data[group_col].unique()
table_rows = []
# Header
header = {
'Outcome': 'Outcome',
**{group: f"{group} (n={len(data[data[group_col]==group])})" for group in groups},
'p_value': 'p-value'
}
table_rows.append(header)
# Objective Response Rate (ORR = CR + PR)
if 'best_response' in data.columns:
for group in groups:
group_data = data[data[group_col] == group]
cr_pr = ((group_data['best_response'] == 'CR') | (group_data['best_response'] == 'PR')).sum()
total = len(group_data)
orr = cr_pr / total * 100
# Calculate exact binomial CI (Clopper-Pearson)
ci_lower, ci_upper = _binomial_ci(cr_pr, total)
if group == groups[0]:
orr_row = {'Outcome': 'ORR, n (%) [95% CI]'}
orr_row[group] = f"{cr_pr} ({orr:.0f}%) [{ci_lower:.0f}-{ci_upper:.0f}]"
# P-value for ORR difference
contingency = pd.crosstab(
data['best_response'].isin(['CR', 'PR']),
data[group_col]
)
_, p_value, _, _ = stats.chi2_contingency(contingency)
orr_row['p_value'] = f"{p_value:.3f}" if p_value >= 0.001 else "<0.001"
table_rows.append(orr_row)
# Individual response categories
for response in ['CR', 'PR', 'SD', 'PD']:
row = {'Outcome': f" {response}"}
for group in groups:
group_data = data[data[group_col] == group]
count = (group_data['best_response'] == response).sum()
total = len(group_data)
pct = count / total * 100
row[group] = f"{count} ({pct:.0f}%)"
row['p_value'] = ''
table_rows.append(row)
# Disease Control Rate (DCR = CR + PR + SD)
if 'best_response' in data.columns:
dcr_row = {'Outcome': 'DCR, n (%) [95% CI]'}
for group in groups:
group_data = data[data[group_col] == group]
dcr_count = group_data['best_response'].isin(['CR', 'PR', 'SD']).sum()
total = len(group_data)
dcr = dcr_count / total * 100
ci_lower, ci_upper = _binomial_ci(dcr_count, total)
dcr_row[group] = f"{dcr_count} ({dcr:.0f}%) [{ci_lower:.0f}-{ci_upper:.0f}]"
# P-value
contingency = pd.crosstab(
data['best_response'].isin(['CR', 'PR', 'SD']),
data[group_col]
)
_, p_value, _, _ = stats.chi2_contingency(contingency)
dcr_row['p_value'] = f"{p_value:.3f}" if p_value >= 0.001 else "<0.001"
table_rows.append(dcr_row)
# Save table
df_table = pd.DataFrame(table_rows)
df_table.to_csv(output_file, index=False)
print(f"Efficacy table saved to: {output_file}")
return df_table
def generate_safety_table(data, ae_columns, group_col='group', output_file='table3_safety.csv'):
"""
Generate adverse events table.
Parameters:
data: DataFrame with AE data
ae_columns: List of AE column names (each should have values 0-5 for CTCAE grades)
group_col: Grouping variable
output_file: Output CSV path
"""
groups = data[group_col].unique()
table_rows = []
# Header row (include the group name so the Any Grade / Grade 3-4 columns are unambiguous)
header = {
'Adverse Event': 'Adverse Event',
**{f'{group}_any': f'{group}, Any Grade' for group in groups},
**{f'{group}_g34': f'{group}, Grade 3-4' for group in groups}
}
table_rows.append(header)
for ae in ae_columns:
if ae not in data.columns:
continue
row = {'Adverse Event': ae.replace('_', ' ').title()}
for group in groups:
group_data = data[data[group_col] == group][ae].dropna()
total = len(group_data)
# Any grade (Grade 1-5)
any_grade = (group_data > 0).sum()
any_pct = any_grade / total * 100 if total > 0 else 0
row[f'{group}_any'] = f"{any_grade} ({any_pct:.0f}%)"
# Grade 3-4
grade_34 = (group_data >= 3).sum()
g34_pct = grade_34 / total * 100 if total > 0 else 0
row[f'{group}_g34'] = f"{grade_34} ({g34_pct:.0f}%)"
table_rows.append(row)
# Save table
df_table = pd.DataFrame(table_rows)
df_table.to_csv(output_file, index=False)
print(f"Safety table saved to: {output_file}")
return df_table
def generate_latex_table(df, caption, label='table'):
"""
Convert DataFrame to LaTeX table code.
Returns:
String with LaTeX table code
"""
latex_code = "\\begin{table}[H]\n"
latex_code += "\\centering\n"
latex_code += "\\small\n"
latex_code += "\\begin{tabular}{" + "l" * len(df.columns) + "}\n"
latex_code += "\\toprule\n"
# Header
header_row = " & ".join([f"\\textbf{{{col}}}" for col in df.columns])
latex_code += header_row + " \\\\\n"
latex_code += "\\midrule\n"
# Data rows
for _, row in df.iterrows():
# Handle indentation for subcategories (lines starting with spaces)
first_col = str(row.iloc[0])
if first_col.startswith(' '):
first_col = '\\quad ' + first_col.strip()
data_row = [first_col] + [str(val) if pd.notna(val) else '' for val in row.iloc[1:]]
latex_code += " & ".join(data_row) + " \\\\\n"
latex_code += "\\bottomrule\n"
latex_code += "\\end{tabular}\n"
latex_code += f"\\caption{{{caption}}}\n"
latex_code += f"\\label{{tab:{label}}}\n"
latex_code += "\\end{table}\n"
return latex_code
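# Note: the emitted code assumes the enclosing LaTeX document loads the
# booktabs package (for \toprule/\midrule/\bottomrule) and the float
# package (for the [H] placement specifier):
#   \usepackage{booktabs}
#   \usepackage{float}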
def _binomial_ci(successes, trials, confidence=0.95):
"""
Calculate exact binomial confidence interval (Clopper-Pearson method).
Returns:
Lower and upper bounds as percentages
"""
if trials == 0:
return 0.0, 0.0
alpha = 1 - confidence
# Use beta distribution
from scipy.stats import beta
if successes == 0:
lower = 0.0
else:
lower = beta.ppf(alpha/2, successes, trials - successes + 1)
if successes == trials:
upper = 1.0
else:
upper = beta.ppf(1 - alpha/2, successes + 1, trials - successes)
return lower * 100, upper * 100
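# Worked example (values rounded): _binomial_ci(30, 100) gives roughly
# (21.2, 40.0), i.e. an observed rate of 30% with a 95% Clopper-Pearson
# interval of about 21-40%.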
def create_example_data():
"""Create example dataset for testing."""
np.random.seed(42)
n = 100
data = pd.DataFrame({
'patient_id': [f'PT{i:03d}' for i in range(1, n+1)],
'group': np.random.choice(['Biomarker+', 'Biomarker-'], n),
'age': np.random.normal(62, 10, n),
'sex': np.random.choice(['Male', 'Female'], n),
'ecog_ps': np.random.choice(['0-1', '2'], n, p=[0.8, 0.2]),
'stage': np.random.choice(['III', 'IV'], n, p=[0.3, 0.7]),
'best_response': np.random.choice(['CR', 'PR', 'SD', 'PD'], n, p=[0.05, 0.35, 0.40, 0.20]),
'fatigue_grade': np.random.choice([0, 1, 2, 3], n, p=[0.3, 0.4, 0.2, 0.1]),
'nausea_grade': np.random.choice([0, 1, 2, 3], n, p=[0.4, 0.35, 0.20, 0.05]),
'neutropenia_grade': np.random.choice([0, 1, 2, 3, 4], n, p=[0.5, 0.2, 0.15, 0.10, 0.05]),
})
return data
def main():
parser = argparse.ArgumentParser(description='Generate clinical cohort tables')
parser.add_argument('input_file', type=str, nargs='?', default=None,
help='CSV file with cohort data (if not provided, uses example data)')
parser.add_argument('-o', '--output-dir', type=str, default='tables',
help='Output directory (default: tables)')
parser.add_argument('--group-col', type=str, default='group',
help='Column name for grouping variable')
parser.add_argument('--example', action='store_true',
help='Generate tables using example data')
args = parser.parse_args()
# Create output directory
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
# Load or create data
if args.example or args.input_file is None:
print("Generating example dataset...")
data = create_example_data()
else:
print(f"Loading data from {args.input_file}...")
data = pd.read_csv(args.input_file)
print(f"Dataset: {len(data)} patients, {len(data[args.group_col].unique())} groups")
print(f"Groups: {data[args.group_col].value_counts().to_dict()}")
# Generate Table 1: Baseline characteristics
print("\nGenerating baseline characteristics table...")
baseline_table = generate_baseline_table(
data,
group_col=args.group_col,
output_file=output_dir / 'table1_baseline.csv'
)
# Generate LaTeX code for baseline table
latex_code = generate_latex_table(
baseline_table,
caption="Baseline patient demographics and clinical characteristics",
label="baseline"
)
with open(output_dir / 'table1_baseline.tex', 'w') as f:
f.write(latex_code)
print(f"LaTeX code saved to: {output_dir}/table1_baseline.tex")
# Generate Table 2: Efficacy outcomes
if 'best_response' in data.columns:
print("\nGenerating efficacy outcomes table...")
efficacy_table = generate_efficacy_table(
data,
group_col=args.group_col,
output_file=output_dir / 'table2_efficacy.csv'
)
latex_code = generate_latex_table(
efficacy_table,
caption="Treatment efficacy outcomes by group",
label="efficacy"
)
with open(output_dir / 'table2_efficacy.tex', 'w') as f:
f.write(latex_code)
# Generate Table 3: Safety (identify AE columns)
ae_columns = [col for col in data.columns if col.endswith('_grade')]
if ae_columns:
print("\nGenerating safety table...")
safety_table = generate_safety_table(
data,
ae_columns=ae_columns,
group_col=args.group_col,
output_file=output_dir / 'table3_safety.csv'
)
latex_code = generate_latex_table(
safety_table,
caption="Treatment-emergent adverse events by group (CTCAE v5.0)",
label="safety"
)
with open(output_dir / 'table3_safety.tex', 'w') as f:
f.write(latex_code)
print(f"\nAll tables generated successfully in {output_dir}/")
print("Files created:")
print(" - table1_baseline.csv / .tex")
print(" - table2_efficacy.csv / .tex (if response data available)")
print(" - table3_safety.csv / .tex (if AE data available)")
if __name__ == '__main__':
main()
# Example usage:
# python create_cohort_tables.py cohort_data.csv -o tables/
# python create_cohort_tables.py --example # Generate example tables
#
# Input CSV format:
# patient_id,group,age,sex,ecog_ps,stage,best_response,fatigue_grade,nausea_grade,...
# PT001,Biomarker+,65,Male,0-1,IV,PR,1,0,...
# PT002,Biomarker-,58,Female,0-1,III,SD,2,1,...
# ...


@@ -0,0 +1,422 @@
#!/usr/bin/env python3
"""
Generate Kaplan-Meier Survival Curves for Clinical Decision Support Documents
This script creates publication-quality survival curves with:
- Kaplan-Meier survival estimates
- 95% confidence intervals
- Log-rank test statistics
- Hazard ratios with confidence intervals
- Number at risk tables
- Median survival annotations
Dependencies: lifelines, matplotlib, pandas, numpy
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test, multivariate_logrank_test
from lifelines import CoxPHFitter
from lifelines.utils import median_survival_times
import argparse
from pathlib import Path
def load_survival_data(filepath):
"""
Load survival data from CSV file.
Expected columns:
- patient_id: Unique patient identifier
- time: Survival time (months or days)
- event: Event indicator (1=event occurred, 0=censored)
- group: Stratification variable (e.g., 'Biomarker+', 'Biomarker-')
- Optional: Additional covariates for Cox regression
Returns:
pandas.DataFrame
"""
df = pd.read_csv(filepath)
# Validate required columns
required_cols = ['patient_id', 'time', 'event', 'group']
missing = [col for col in required_cols if col not in df.columns]
if missing:
raise ValueError(f"Missing required columns: {missing}")
# Convert event to boolean if needed
df['event'] = df['event'].astype(bool)
return df
def calculate_median_survival(kmf):
"""Calculate median survival time with 95% CI."""
median = kmf.median_survival_time_
if median == np.inf:
return None, None, None
# median_survival_times() converts the survival-function CI into a CI for
# the median itself; column names depend on the fit label, so index by
# position rather than by name
ci = median_survival_times(kmf.confidence_interval_)
lower_ci = ci.iloc[0, 0]
upper_ci = ci.iloc[0, 1]
return median, lower_ci, upper_ci
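# Usage sketch: after kmf.fit(durations, events), a curve crossing 0.5 at
# 14.2 months might return (14.2, 11.8, 18.9); (None, None, None) means the
# median was not reached (all numbers illustrative).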
def generate_kaplan_meier_plot(data, time_col='time', event_col='event',
group_col='group', output_path='survival_curve.pdf',
title='Kaplan-Meier Survival Curve',
xlabel='Time (months)', ylabel='Survival Probability'):
"""
Generate Kaplan-Meier survival curve comparing groups.
Parameters:
data: DataFrame with survival data
time_col: Column name for survival time
event_col: Column name for event indicator
group_col: Column name for stratification
output_path: Path to save figure
title: Plot title
xlabel: X-axis label (specify units)
ylabel: Y-axis label
"""
# Create figure and axis
fig, ax = plt.subplots(figsize=(10, 6))
# Get unique groups
groups = data[group_col].unique()
# Colors for groups (colorblind-friendly)
colors = ['#0173B2', '#DE8F05', '#029E73', '#CC78BC', '#CA9161']
kmf_models = {}
median_survivals = {}
# Plot each group
for i, group in enumerate(groups):
group_data = data[data[group_col] == group]
# Fit Kaplan-Meier
kmf = KaplanMeierFitter()
kmf.fit(group_data[time_col], group_data[event_col], label=str(group))
# Plot survival curve
kmf.plot_survival_function(ax=ax, ci_show=True, color=colors[i % len(colors)],
linewidth=2, alpha=0.8)
# Store model
kmf_models[group] = kmf
# Calculate median survival
median, lower, upper = calculate_median_survival(kmf)
median_survivals[group] = (median, lower, upper)
# Log-rank test
if len(groups) == 2:
group1_data = data[data[group_col] == groups[0]]
group2_data = data[data[group_col] == groups[1]]
results = logrank_test(
group1_data[time_col], group2_data[time_col],
group1_data[event_col], group2_data[event_col]
)
p_value = results.p_value
test_statistic = results.test_statistic
# Add log-rank test result to plot
ax.text(0.02, 0.15, f'Log-rank test:\np = {p_value:.4f}',
transform=ax.transAxes, fontsize=10,
verticalalignment='top',
bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
else:
# Multivariate log-rank for >2 groups
results = multivariate_logrank_test(data[time_col], data[group_col], data[event_col])
p_value = results.p_value
test_statistic = results.test_statistic
ax.text(0.02, 0.15, f'Log-rank test:\np = {p_value:.4f}\n({len(groups)} groups)',
transform=ax.transAxes, fontsize=10,
verticalalignment='top',
bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
# Add median survival annotations
y_pos = 0.95
for group, (median, lower, upper) in median_survivals.items():
if median is not None:
ax.text(0.98, y_pos, f'{group}: {median:.1f} months (95% CI {lower:.1f}-{upper:.1f})',
transform=ax.transAxes, fontsize=9, ha='right',
verticalalignment='top')
else:
ax.text(0.98, y_pos, f'{group}: Not reached',
transform=ax.transAxes, fontsize=9, ha='right',
verticalalignment='top')
y_pos -= 0.05
# Formatting
ax.set_xlabel(xlabel, fontsize=12, fontweight='bold')
ax.set_ylabel(ylabel, fontsize=12, fontweight='bold')
ax.set_title(title, fontsize=14, fontweight='bold', pad=15)
ax.legend(loc='lower left', frameon=True, fontsize=10)
ax.grid(True, alpha=0.3, linestyle='--')
ax.set_ylim([0, 1.05])
plt.tight_layout()
# Save figure
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f"Survival curve saved to: {output_path}")
# Also save as PNG for easy viewing
png_path = Path(output_path).with_suffix('.png')
plt.savefig(png_path, dpi=300, bbox_inches='tight')
print(f"PNG version saved to: {png_path}")
plt.close()
return kmf_models, p_value
def generate_number_at_risk_table(data, time_col='time', event_col='event',
group_col='group', time_points=None):
"""
Generate number at risk table for survival analysis.
Parameters:
data: DataFrame with survival data
time_points: List of time points for risk table (if None, auto-generate)
Returns:
DataFrame with number at risk at each time point
"""
if time_points is None:
# Auto-generate time points (every 6 months up to max time)
max_time = data[time_col].max()
time_points = np.arange(0, max_time + 6, 6)
groups = data[group_col].unique()
risk_table = pd.DataFrame(index=time_points, columns=groups)
for group in groups:
group_data = data[data[group_col] == group]
for t in time_points:
# Number at risk = patients who haven't had event and haven't been censored before time t
at_risk = len(group_data[group_data[time_col] >= t])
risk_table.loc[t, group] = at_risk
return risk_table
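# Worked example of the at-risk definition above: for one group with
# times [3, 8, 8, 15] (events or censoring), the counts at t = 0, 6, 12
# are 4, 3, and 1, since a patient is "at risk" at t while still under
# observation (time >= t).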
def calculate_hazard_ratio(data, time_col='time', event_col='event', group_col='group',
reference_group=None):
"""
Calculate hazard ratio using Cox proportional hazards regression.
Parameters:
data: DataFrame
reference_group: Reference group for comparison (if None, uses first group)
Returns:
Hazard ratio, 95% CI, p-value
"""
# Encode group as binary for Cox regression
groups = data[group_col].unique()
if len(groups) != 2:
print("Warning: Cox HR calculation assumes 2 groups. Using first 2 groups.")
groups = groups[:2]
if reference_group is None:
reference_group = groups[0]
# Create binary indicator (1 for comparison group, 0 for reference)
data_cox = data.copy()
data_cox['group_binary'] = (data_cox[group_col] != reference_group).astype(int)
# Fit Cox model
cph = CoxPHFitter()
cph.fit(data_cox[[time_col, event_col, 'group_binary']],
duration_col=time_col, event_col=event_col)
# Extract results
hr = np.exp(cph.params_['group_binary'])
ci = np.exp(cph.confidence_intervals_.loc['group_binary'].values)
p_value = cph.summary.loc['group_binary', 'p']
return hr, ci[0], ci[1], p_value
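# Usage sketch (illustrative numbers): with 'Biomarker-' as reference,
#   hr, lo, hi, p = calculate_hazard_ratio(df, reference_group='Biomarker-')
# hr = 0.62 with CI (0.41, 0.94) and p = 0.024 would indicate a 38% lower
# hazard in the comparison group.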
def generate_report(data, output_dir, prefix='survival'):
"""
Generate comprehensive survival analysis report.
Creates:
- Kaplan-Meier curves (PDF and PNG)
- Number at risk table (CSV)
- Statistical summary (TXT)
- LaTeX table code (TEX)
"""
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
# Generate survival curve
kmf_models, logrank_p = generate_kaplan_meier_plot(
data,
output_path=output_dir / f'{prefix}_kaplan_meier.pdf',
title='Survival Analysis by Group'
)
# Number at risk table
risk_table = generate_number_at_risk_table(data)
risk_table.to_csv(output_dir / f'{prefix}_number_at_risk.csv')
# Calculate hazard ratio
hr, ci_lower, ci_upper, hr_p = calculate_hazard_ratio(data)
# Generate statistical summary
with open(output_dir / f'{prefix}_statistics.txt', 'w') as f:
f.write("SURVIVAL ANALYSIS STATISTICAL SUMMARY\n")
f.write("=" * 60 + "\n\n")
groups = data['group'].unique()
for group in groups:
kmf = kmf_models[group]
median = kmf.median_survival_time_
# Calculate survival rates at common time points
try:
surv_12m = kmf.survival_function_at_times(12).values[0]
surv_24m = kmf.survival_function_at_times(24).values[0] if data['time'].max() >= 24 else None
except Exception:
surv_12m = None
surv_24m = None
f.write(f"Group: {group}\n")
f.write(f" N = {len(data[data['group'] == group])}\n")
f.write(f" Events = {data[data['group'] == group]['event'].sum()}\n")
f.write(f" Median survival: {median:.1f} months\n" if median != np.inf else " Median survival: Not reached\n")
if surv_12m is not None:
f.write(f" 12-month survival rate: {surv_12m*100:.1f}%\n")
if surv_24m is not None:
f.write(f" 24-month survival rate: {surv_24m*100:.1f}%\n")
f.write("\n")
f.write(f"Log-Rank Test:\n")
f.write(f" p-value = {logrank_p:.4f}\n")
f.write(f" Interpretation: {'Significant' if logrank_p < 0.05 else 'Not significant'} difference in survival\n\n")
if len(groups) == 2:
f.write(f"Hazard Ratio ({groups[1]} vs {groups[0]}):\n")
f.write(f" HR = {hr:.2f} (95% CI {ci_lower:.2f}-{ci_upper:.2f})\n")
f.write(f" p-value = {hr_p:.4f}\n")
f.write(f" Interpretation: {groups[1]} has {((1-hr)*100):.0f}% {'reduction' if hr < 1 else 'increase'} in risk\n")
# Generate LaTeX table code
with open(output_dir / f'{prefix}_latex_table.tex', 'w') as f:
f.write("% LaTeX table code for survival outcomes\n")
f.write("\\begin{table}[H]\n")
f.write("\\centering\n")
f.write("\\small\n")
f.write("\\begin{tabular}{lcccc}\n")
f.write("\\toprule\n")
f.write("\\textbf{Endpoint} & \\textbf{Group A} & \\textbf{Group B} & \\textbf{HR (95\\% CI)} & \\textbf{p-value} \\\\\n")
f.write("\\midrule\n")
# Add median survival row
for i, group in enumerate(groups):
kmf = kmf_models[group]
median = kmf.median_survival_time_
if i == 0:
f.write(f"Median survival, months (95\\% CI) & ")
if median != np.inf:
f.write(f"{median:.1f} & ")
else:
f.write("NR & ")
else:
if median != np.inf:
f.write(f"{median:.1f} & ")
else:
f.write("NR & ")
f.write(f"{hr:.2f} ({ci_lower:.2f}-{ci_upper:.2f}) & {hr_p:.3f} \\\\\n")
# Add 12-month survival rate
f.write("12-month survival rate (\\%) & ")
for group in groups:
kmf = kmf_models[group]
try:
surv_12m = kmf.survival_function_at_times(12).values[0]
f.write(f"{surv_12m*100:.0f}\\% & ")
except Exception:
f.write("-- & ")
f.write("-- & -- \\\\\n")
f.write("\\bottomrule\n")
f.write("\\end{tabular}\n")
f.write(f"\\caption{{Survival outcomes by group (log-rank p={logrank_p:.3f})}}\n")
f.write("\\end{table}\n")
print(f"\nAnalysis complete! Files saved to {output_dir}/")
print(f" - Survival curves: {prefix}_kaplan_meier.pdf/png")
print(f" - Statistics: {prefix}_statistics.txt")
print(f" - LaTeX table: {prefix}_latex_table.tex")
print(f" - Risk table: {prefix}_number_at_risk.csv")
def main():
parser = argparse.ArgumentParser(description='Generate Kaplan-Meier survival curves')
parser.add_argument('input_file', type=str, help='CSV file with survival data')
parser.add_argument('-o', '--output', type=str, default='survival_output',
help='Output directory (default: survival_output)')
parser.add_argument('-t', '--title', type=str, default='Kaplan-Meier Survival Curve',
help='Plot title')
parser.add_argument('-x', '--xlabel', type=str, default='Time (months)',
help='X-axis label')
parser.add_argument('-y', '--ylabel', type=str, default='Survival Probability',
help='Y-axis label')
parser.add_argument('--time-col', type=str, default='time',
help='Column name for time variable')
parser.add_argument('--event-col', type=str, default='event',
help='Column name for event indicator')
parser.add_argument('--group-col', type=str, default='group',
help='Column name for grouping variable')
args = parser.parse_args()
# Load data
print(f"Loading data from {args.input_file}...")
data = load_survival_data(args.input_file)
print(f"Loaded {len(data)} patients")
print(f"Groups: {data[args.group_col].value_counts().to_dict()}")
# Generate analysis
generate_report(
data,
output_dir=args.output,
prefix='survival'
)
if __name__ == '__main__':
main()
# Example usage:
# python generate_survival_analysis.py survival_data.csv -o figures/ -t "PFS by PD-L1 Status"
#
# Input CSV format:
# patient_id,time,event,group
# PT001,12.3,1,PD-L1+
# PT002,8.5,1,PD-L1-
# PT003,18.2,0,PD-L1+
# ...


@@ -0,0 +1,335 @@
#!/usr/bin/env python3
"""
Validate Clinical Decision Support Documents for Quality and Completeness
Checks for:
- Evidence citations for all recommendations
- Statistical reporting completeness
- Biomarker nomenclature consistency
- Required sections present
- HIPAA de-identification
- GRADE recommendation format
Dependencies: None (pure Python)
"""
import re
import argparse
from pathlib import Path
from collections import defaultdict
class CDSValidator:
"""Validator for clinical decision support documents."""
def __init__(self, filepath):
self.filepath = Path(filepath)  # Path object so later checks can use .suffix
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
self.content = f.read()
self.errors = []
self.warnings = []
self.info = []
def validate_all(self):
"""Run all validation checks."""
print(f"Validating: {self.filepath}")
print("="*70)
self.check_required_sections()
self.check_evidence_citations()
self.check_recommendation_grading()
self.check_statistical_reporting()
self.check_hipaa_identifiers()
self.check_biomarker_nomenclature()
return self.generate_report()
def check_required_sections(self):
"""Check if required sections are present."""
# Cohort analysis required sections
cohort_sections = [
'cohort characteristics',
'biomarker',
'outcomes',
'statistical analysis',
'clinical implications',
'references'
]
# Treatment recommendation required sections
rec_sections = [
'evidence',
'recommendation',
'monitoring',
'references'
]
content_lower = self.content.lower()
# Check which document type
is_cohort = 'cohort' in content_lower
is_recommendation = 'recommendation' in content_lower
if is_cohort:
missing = [sec for sec in cohort_sections if sec not in content_lower]
if missing:
self.warnings.append(f"Cohort analysis may be missing sections: {', '.join(missing)}")
else:
self.info.append("All cohort analysis sections present")
if is_recommendation:
missing = [sec for sec in rec_sections if sec not in content_lower]
if missing:
self.errors.append(f"Recommendation document missing required sections: {', '.join(missing)}")
else:
self.info.append("All recommendation sections present")
def check_evidence_citations(self):
"""Check that recommendations have citations."""
# Find recommendation statements
rec_pattern = r'(recommend|should|prefer|suggest|consider)(.*?)(?:\n\n|\Z)'
recommendations = re.findall(rec_pattern, self.content, re.IGNORECASE | re.DOTALL)
# Find citations
citation_patterns = [
r'\[\d+\]', # Numbered citations [1]
r'\(.*?\d{4}\)', # Author year (Smith 2020)
r'et al\.', # Et al citations
r'NCCN|ASCO|ESMO', # Guideline references
]
uncited_recommendations = []
for _, rec_text in recommendations:
has_citation = any(re.search(pattern, rec_text) for pattern in citation_patterns)
if not has_citation:
snippet = rec_text[:60].strip() + '...'
uncited_recommendations.append(snippet)
if uncited_recommendations:
self.warnings.append(f"Found {len(uncited_recommendations)} recommendations without citations")
for rec in uncited_recommendations[:3]: # Show first 3
self.warnings.append(f" - {rec}")
else:
self.info.append(f"All {len(recommendations)} recommendations have citations")
def check_recommendation_grading(self):
"""Check for GRADE-style recommendation strength."""
# Look for GRADE notation (1A, 1B, 2A, 2B, 2C); the search is case-insensitive,
# so a single 'GRADE' alternative suffices
grade_pattern = r'GRADE\s*[12][A-C]|\(?\s*[12][A-C]\s*\)?'
grades = re.findall(grade_pattern, self.content, re.IGNORECASE)
# Look for strong/conditional language
strong_pattern = r'(strong|we recommend|should)'
conditional_pattern = r'(conditional|weak|we suggest|may consider|could consider)'
strong_count = len(re.findall(strong_pattern, self.content, re.IGNORECASE))
conditional_count = len(re.findall(conditional_pattern, self.content, re.IGNORECASE))
if grades:
self.info.append(f"Found {len(grades)} GRADE-style recommendations")
else:
self.warnings.append("No GRADE-style recommendation grading found (1A, 1B, 2A, etc.)")
if strong_count > 0 or conditional_count > 0:
self.info.append(f"Recommendation language: {strong_count} strong, {conditional_count} conditional")
else:
self.warnings.append("No clear recommendation strength language (strong/conditional) found")
def check_statistical_reporting(self):
"""Check for proper statistical reporting."""
# Check for p-values
p_values = re.findall(r'p\s*[=<>]\s*[\d.]+', self.content, re.IGNORECASE)
# Check for confidence intervals
ci_pattern = r'95%\s*CI|confidence interval'
cis = re.findall(ci_pattern, self.content, re.IGNORECASE)
# Check for hazard ratios
hr_pattern = r'HR\s*[=:]\s*[\d.]+'
hrs = re.findall(hr_pattern, self.content)
# Check for sample sizes
n_pattern = r'n\s*=\s*\d+'
sample_sizes = re.findall(n_pattern, self.content, re.IGNORECASE)
if not p_values:
self.warnings.append("No p-values found - statistical significance not reported")
else:
self.info.append(f"Found {len(p_values)} p-values")
if hrs and not cis:
self.warnings.append("Hazard ratios reported without confidence intervals")
if not sample_sizes:
self.warnings.append("Sample sizes (n=X) not clearly reported")
# Check for common statistical errors
# Flag p reported as exactly zero (p=0.00, p = 0.000) without also
# matching values such as p=0.003
if re.search(r'p\s*=\s*0\.00+(?![1-9])', self.content):
self.warnings.append("Found p=0.00 (should report as p<0.001 instead)")
def check_hipaa_identifiers(self):
"""Check for potential HIPAA identifiers."""
# 18 HIPAA identifiers (simplified check for common ones)
identifiers = {
'Names': r'Dr\.\s+[A-Z][a-z]+|Patient:\s*[A-Z][a-z]+',
'Specific dates': r'\d{1,2}/\d{1,2}/\d{4}', # MM/DD/YYYY
'Phone numbers': r'\d{3}[-.]?\d{3}[-.]?\d{4}',
'Email addresses': r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
'SSN': r'\d{3}-\d{2}-\d{4}',
'MRN': r'MRN\s*:?\s*\d+',
}
found_identifiers = []
for identifier_type, pattern in identifiers.items():
matches = re.findall(pattern, self.content)
if matches:
found_identifiers.append(f"{identifier_type}: {len(matches)} instance(s)")
if found_identifiers:
self.errors.append("Potential HIPAA identifiers detected:")
for identifier in found_identifiers:
self.errors.append(f" - {identifier}")
self.errors.append(" ** Ensure proper de-identification before distribution **")
else:
self.info.append("No obvious HIPAA identifiers detected (basic check only)")
def check_biomarker_nomenclature(self):
"""Check for consistent biomarker nomenclature."""
# Common biomarker naming issues
issues = []
# Check for gene names (should be italicized in LaTeX)
gene_names = ['EGFR', 'ALK', 'ROS1', 'BRAF', 'KRAS', 'HER2', 'TP53', 'BRCA1', 'BRCA2']
for gene in gene_names:
# Check if gene appears but not in italics (\textit{} or \emph{})
if gene in self.content:
if f'\\textit{{{gene}}}' not in self.content and f'\\emph{{{gene}}}' not in self.content:
if self.filepath.suffix == '.tex':
issues.append(f"{gene} should be italicized in LaTeX (\\textit{{{gene}}})")
# Check for protein vs gene naming
# HER2 (protein) vs ERBB2 (gene) - both valid
# Check for mutation nomenclature (HGVS format)
hgvs_pattern = r'p\.[A-Z]\d+[A-Z]' # e.g., p.L858R
hgvs_mutations = re.findall(hgvs_pattern, self.content)
if hgvs_mutations:
self.info.append(f"Found {len(hgvs_mutations)} HGVS protein nomenclature (e.g., p.L858R)")
# Warn about non-standard mutation format
if 'EGFR mutation' in self.content and 'exon' not in self.content.lower():
self.warnings.append("EGFR mutation mentioned - specify exon/variant (e.g., exon 19 deletion)")
if issues:
self.warnings.extend(issues)
def generate_report(self):
"""Generate validation report."""
print("\n" + "="*70)
print("VALIDATION REPORT")
print("="*70)
if self.errors:
print(f"\n❌ ERRORS ({len(self.errors)}):")
for error in self.errors:
print(f" {error}")
if self.warnings:
print(f"\n⚠️ WARNINGS ({len(self.warnings)}):")
for warning in self.warnings:
print(f" {warning}")
if self.info:
print(f"\n✓ PASSED CHECKS ({len(self.info)}):")
for info in self.info:
print(f" {info}")
# Overall status
print("\n" + "="*70)
if self.errors:
print("STATUS: ❌ VALIDATION FAILED - Address errors before distribution")
return False
elif self.warnings:
print("STATUS: ⚠️ VALIDATION PASSED WITH WARNINGS - Review recommended")
return True
else:
print("STATUS: ✓ VALIDATION PASSED - Document meets quality standards")
return True
def save_report(self, output_file):
"""Save validation report to file."""
with open(output_file, 'w') as f:
f.write("CLINICAL DECISION SUPPORT DOCUMENT VALIDATION REPORT\n")
f.write("="*70 + "\n")
f.write(f"Document: {self.filepath}\n")
f.write(f"Validated: {Path.cwd()}\n\n")
if self.errors:
f.write(f"ERRORS ({len(self.errors)}):\n")
for error in self.errors:
f.write(f" - {error}\n")
f.write("\n")
if self.warnings:
f.write(f"WARNINGS ({len(self.warnings)}):\n")
for warning in self.warnings:
f.write(f" - {warning}\n")
f.write("\n")
if self.info:
f.write(f"PASSED CHECKS ({len(self.info)}):\n")
for info in self.info:
f.write(f" - {info}\n")
print(f"\nValidation report saved to: {output_file}")
def main():
parser = argparse.ArgumentParser(description='Validate clinical decision support documents')
parser.add_argument('input_file', type=str, help='Document to validate (.tex, .md, .txt)')
parser.add_argument('-o', '--output', type=str, default=None,
help='Save validation report to file')
parser.add_argument('--strict', action='store_true',
help='Treat warnings as errors')
args = parser.parse_args()
# Validate
validator = CDSValidator(args.input_file)
passed = validator.validate_all()
# Save report if requested
if args.output:
validator.save_report(args.output)
# Exit code
if args.strict and (validator.errors or validator.warnings):
exit(1)
elif validator.errors:
exit(1)
else:
exit(0)
if __name__ == '__main__':
main()
# Example usage:
# python validate_cds_document.py cohort_analysis.tex
# python validate_cds_document.py treatment_recommendations.tex -o validation_report.txt
# python validate_cds_document.py document.tex --strict # Warnings cause failure