Initial commit

2025-11-30 08:30:14 +08:00
commit 1dd5bee3b4
335 changed files with 147360 additions and 0 deletions
--- a/skills/clinical-decision-support/scripts/biomarker_classifier.py
+++ b/skills/clinical-decision-support/scripts/biomarker_classifier.py
@@ -0,0 +1,384 @@
+#!/usr/bin/env python3
+"""
+Biomarker-Based Patient Stratification and Classification
+
+Performs patient stratification based on biomarker profiles with:
+- Binary classification (biomarker+/-)
+- Multi-class molecular subtypes
+- Continuous biomarker scoring
+- Correlation with clinical outcomes
+
+Dependencies: pandas, numpy, scipy, scikit-learn (optional for clustering)
+"""
+
+import pandas as pd
+import numpy as np
+from scipy import stats
+import argparse
+from pathlib import Path
+
+
+def classify_binary_biomarker(data, biomarker_col, threshold, 
+                              above_label='Biomarker+', below_label='Biomarker-'):
+    """
+    Binary classification based on biomarker threshold.
+    
+    Parameters:
+        data: DataFrame
+        biomarker_col: Column name for biomarker values
+        threshold: Cut-point value
+        above_label: Label for values >= threshold
+        below_label: Label for values < threshold
+    
+    Returns:
+        DataFrame with added 'biomarker_class' column
+    """
+    
+    data = data.copy()
+    data['biomarker_class'] = data[biomarker_col].apply(
+        lambda x: above_label if x >= threshold else below_label
+    )
+    
+    return data
+
+
+def classify_pd_l1_tps(data, pd_l1_col='pd_l1_tps'):
+    """
+    Classify PD-L1 Tumor Proportion Score into clinical categories.
+    
+    Categories:
+    - Negative: <1%
+    - Low: 1-49%
+    - High: >=50%
+    
+    Returns:
+        DataFrame with 'pd_l1_category' column
+    """
+    
+    data = data.copy()
+    
+    def categorize(tps):
+        if tps < 1:
+            return 'PD-L1 Negative (<1%)'
+        elif tps < 50:
+            return 'PD-L1 Low (1-49%)'
+        else:
+            return 'PD-L1 High (≥50%)'
+    
+    data['pd_l1_category'] = data[pd_l1_col].apply(categorize)
+    
+    # Distribution
+    print("\nPD-L1 TPS Distribution:")
+    print(data['pd_l1_category'].value_counts())
+    
+    return data
+
+
+def classify_her2_status(data, ihc_col='her2_ihc', fish_col='her2_fish'):
+    """
+    Classify HER2 status based on IHC and FISH results (ASCO/CAP guidelines).
+    
+    IHC Scores: 0, 1+, 2+, 3+
+    FISH: Positive, Negative (if IHC 2+)
+    
+    Classification:
+    - HER2-positive: IHC 3+ OR IHC 2+/FISH+
+    - HER2-negative: IHC 0/1+ OR IHC 2+/FISH-
+    - HER2-low: IHC 1+ or IHC 2+/FISH- (subset of HER2-negative)
+    
+    Returns:
+        DataFrame with 'her2_status' and 'her2_low' columns
+    """
+    
+    data = data.copy()
+    
+    def classify_her2(row):
+        ihc = row[ihc_col]
+        fish = row.get(fish_col, None)
+        
+        if ihc == '3+':
+            status = 'HER2-positive'
+            her2_low = False
+        elif ihc == '2+':
+            if fish == 'Positive':
+                status = 'HER2-positive'
+                her2_low = False
+            elif fish == 'Negative':
+                status = 'HER2-negative'
+                her2_low = True  # HER2-low
+            else:
+                status = 'HER2-equivocal (FISH needed)'
+                her2_low = False
+        elif ihc == '1+':
+            status = 'HER2-negative'
+            her2_low = True  # HER2-low
+        else:  # IHC 0
+            status = 'HER2-negative'
+            her2_low = False
+        
+        return pd.Series({'her2_status': status, 'her2_low': her2_low})
+    
+    data[['her2_status', 'her2_low']] = data.apply(classify_her2, axis=1)
+    
+    print("\nHER2 Status Distribution:")
+    print(data['her2_status'].value_counts())
+    print(f"\nHER2-low (IHC 1+ or 2+/FISH-): {data['her2_low'].sum()} patients")
+    
+    return data
+
+
+def classify_breast_cancer_subtype(data, er_col='er_positive', pr_col='pr_positive', 
+                                   her2_col='her2_positive'):
+    """
+    Classify breast cancer into molecular subtypes.
+    
+    Subtypes:
+    - HR+/HER2-: Luminal (ER+ and/or PR+, HER2-)
+    - HER2+: Any HER2-positive (regardless of HR status)
+    - Triple-negative: ER-, PR-, HER2-
+    
+    Returns:
+        DataFrame with 'bc_subtype' column
+    """
+    
+    data = data.copy()
+    
+    def get_subtype(row):
+        er = row[er_col]
+        pr = row[pr_col]
+        her2 = row[her2_col]
+        
+        if her2:
+            if er or pr:
+                return 'HR+/HER2+ (Luminal B HER2+)'
+            else:
+                return 'HR-/HER2+ (HER2-enriched)'
+        elif er or pr:
+            return 'HR+/HER2- (Luminal)'
+        else:
+            return 'Triple-Negative'
+    
+    data['bc_subtype'] = data.apply(get_subtype, axis=1)
+    
+    print("\nBreast Cancer Subtype Distribution:")
+    print(data['bc_subtype'].value_counts())
+    
+    return data
+
+
+def correlate_biomarker_outcome(data, biomarker_col, outcome_col, biomarker_type='binary'):
+    """
+    Assess correlation between biomarker and clinical outcome.
+    
+    Parameters:
+        biomarker_col: Biomarker variable
+        outcome_col: Outcome variable  
+        biomarker_type: 'binary', 'categorical', 'continuous'
+    
+    Returns:
+        Statistical test results
+    """
+    
+    print(f"\nCorrelation Analysis: {biomarker_col} vs {outcome_col}")
+    print("="*60)
+    
+    # Remove missing data
+    analysis_data = data[[biomarker_col, outcome_col]].dropna()
+    
+    if biomarker_type == 'binary' or biomarker_type == 'categorical':
+        # Cross-tabulation
+        contingency = pd.crosstab(analysis_data[biomarker_col], analysis_data[outcome_col])
+        print("\nContingency Table:")
+        print(contingency)
+        
+        # Chi-square test
+        chi2, p_value, dof, expected = stats.chi2_contingency(contingency)
+        
+        print(f"\nChi-square test:")
+        print(f"  χ² = {chi2:.2f}, df = {dof}, p = {p_value:.4f}")
+        
+        # Odds ratio if 2x2 table
+        if contingency.shape == (2, 2):
+            a, b = contingency.iloc[0, :]
+            c, d = contingency.iloc[1, :]
+            or_value = (a * d) / (b * c) if b * c > 0 else np.inf
+            
+            # Confidence interval for OR (log method)
+            log_or = np.log(or_value)
+            se_log_or = np.sqrt(1/a + 1/b + 1/c + 1/d)
+            ci_lower = np.exp(log_or - 1.96 * se_log_or)
+            ci_upper = np.exp(log_or + 1.96 * se_log_or)
+            
+            print(f"\nOdds Ratio: {or_value:.2f} (95% CI {ci_lower:.2f}-{ci_upper:.2f})")
+    
+    elif biomarker_type == 'continuous':
+        # Correlation coefficient
+        r, p_value = stats.pearsonr(analysis_data[biomarker_col], analysis_data[outcome_col])
+        
+        print(f"\nPearson correlation:")
+        print(f"  r = {r:.3f}, p = {p_value:.4f}")
+        
+        # Also report Spearman for robustness
+        rho, p_spearman = stats.spearmanr(analysis_data[biomarker_col], analysis_data[outcome_col])
+        print(f"Spearman correlation:")
+        print(f"  ρ = {rho:.3f}, p = {p_spearman:.4f}")
+    
+    return p_value
+
+
+def stratify_cohort_report(data, stratification_var, output_dir='stratification_report'):
+    """
+    Generate comprehensive stratification report.
+    
+    Parameters:
+        data: DataFrame with patient data
+        stratification_var: Column name for stratification
+        output_dir: Output directory for reports
+    """
+    
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    
+    print(f"\nCOHORT STRATIFICATION REPORT")
+    print("="*60)
+    print(f"Stratification Variable: {stratification_var}")
+    print(f"Total Patients: {len(data)}")
+    
+    # Group distribution
+    distribution = data[stratification_var].value_counts()
+    print(f"\nGroup Distribution:")
+    for group, count in distribution.items():
+        pct = count / len(data) * 100
+        print(f"  {group}: {count} ({pct:.1f}%)")
+    
+    # Save distribution
+    distribution.to_csv(output_dir / 'group_distribution.csv')
+    
+    # Compare baseline characteristics across groups
+    print(f"\nBaseline Characteristics by {stratification_var}:")
+    
+    results = []
+    
+    # Continuous variables
+    continuous_vars = data.select_dtypes(include=[np.number]).columns.tolist()
+    continuous_vars = [v for v in continuous_vars if v != stratification_var]
+    
+    for var in continuous_vars[:5]:  # Limit to first 5 for demo
+        print(f"\n{var}:")
+        for group in distribution.index:
+            group_data = data[data[stratification_var] == group][var].dropna()
+            print(f"  {group}: median {group_data.median():.1f} [IQR {group_data.quantile(0.25):.1f}-{group_data.quantile(0.75):.1f}]")
+        
+        # Statistical test
+        if len(distribution) == 2:
+            groups_list = distribution.index.tolist()
+            g1 = data[data[stratification_var] == groups_list[0]][var].dropna()
+            g2 = data[data[stratification_var] == groups_list[1]][var].dropna()
+            _, p_value = stats.mannwhitneyu(g1, g2, alternative='two-sided')
+            print(f"  p-value: {p_value:.4f}")
+            
+            results.append({
+                'Variable': var,
+                'Test': 'Mann-Whitney U',
+                'p_value': p_value,
+                'Significant': 'Yes' if p_value < 0.05 else 'No'
+            })
+    
+    # Save results
+    if results:
+        df_results = pd.DataFrame(results)
+        df_results.to_csv(output_dir / 'statistical_comparisons.csv', index=False)
+        print(f"\nStatistical comparison results saved to: {output_dir}/statistical_comparisons.csv")
+    
+    print(f"\nStratification report complete! Files saved to {output_dir}/")
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Biomarker-based patient classification')
+    parser.add_argument('input_file', type=str, nargs='?', default=None,
+                       help='CSV file with patient and biomarker data')
+    parser.add_argument('-b', '--biomarker', type=str, default=None,
+                       help='Biomarker column name for stratification')
+    parser.add_argument('-t', '--threshold', type=float, default=None,
+                       help='Threshold for binary classification')
+    parser.add_argument('-o', '--output-dir', type=str, default='stratification',
+                       help='Output directory')
+    parser.add_argument('--example', action='store_true',
+                       help='Run with example data')
+    
+    args = parser.parse_args()
+    
+    # Example data if requested
+    if args.example or args.input_file is None:
+        print("Generating example dataset...")
+        np.random.seed(42)
+        n = 80
+        
+        data = pd.DataFrame({
+            'patient_id': [f'PT{i:03d}' for i in range(1, n+1)],
+            'age': np.random.normal(62, 10, n),
+            'sex': np.random.choice(['Male', 'Female'], n),
+            'pd_l1_tps': np.random.exponential(20, n),  # Exponential distribution for PD-L1
+            'tmb': np.random.exponential(8, n),  # Mutations per Mb
+            'her2_ihc': np.random.choice(['0', '1+', '2+', '3+'], n, p=[0.6, 0.2, 0.15, 0.05]),
+            'response': np.random.choice(['Yes', 'No'], n, p=[0.4, 0.6]),
+        })
+        
+        # Simulate correlation: higher PD-L1 -> better response
+        data.loc[data['pd_l1_tps'] >= 50, 'response'] = np.random.choice(['Yes', 'No'], 
+                                                                         (data['pd_l1_tps'] >= 50).sum(),
+                                                                         p=[0.65, 0.35])
+    else:
+        print(f"Loading data from {args.input_file}...")
+        data = pd.read_csv(args.input_file)
+    
+    print(f"Dataset: {len(data)} patients")
+    print(f"Columns: {list(data.columns)}")
+    
+    # PD-L1 classification example
+    if 'pd_l1_tps' in data.columns or args.biomarker == 'pd_l1_tps':
+        data = classify_pd_l1_tps(data, 'pd_l1_tps')
+        
+        # Correlate with response if available
+        if 'response' in data.columns:
+            correlate_biomarker_outcome(data, 'pd_l1_category', 'response', biomarker_type='categorical')
+    
+    # HER2 classification if columns present
+    if 'her2_ihc' in data.columns:
+        if 'her2_fish' not in data.columns:
+            # Add placeholder FISH for IHC 2+
+            data['her2_fish'] = np.nan
+        data = classify_her2_status(data, 'her2_ihc', 'her2_fish')
+    
+    # Generic binary classification if threshold provided
+    if args.biomarker and args.threshold is not None:
+        print(f"\nBinary classification: {args.biomarker} with threshold {args.threshold}")
+        data = classify_binary_biomarker(data, args.biomarker, args.threshold)
+        print(data['biomarker_class'].value_counts())
+    
+    # Generate stratification report
+    if args.biomarker:
+        stratify_cohort_report(data, args.biomarker, output_dir=args.output_dir)
+    elif 'pd_l1_category' in data.columns:
+        stratify_cohort_report(data, 'pd_l1_category', output_dir=args.output_dir)
+    
+    # Save classified data
+    output_path = Path(args.output_dir) / 'classified_data.csv'
+    data.to_csv(output_path, index=False)
+    print(f"\nClassified data saved to: {output_path}")
+
+
+if __name__ == '__main__':
+    main()
+
+
+# Example usage:
+# python biomarker_classifier.py data.csv -b pd_l1_tps -t 50 -o classification/
+# python biomarker_classifier.py --example
+#
+# Input CSV format:
+# patient_id,pd_l1_tps,tmb,her2_ihc,response,pfs_months,event
+# PT001,55.5,12.3,1+,Yes,14.2,1
+# PT002,8.2,5.1,0,No,6.5,1
+# ...
+