#!/usr/bin/env python3
"""
Biomarker-Based Patient Stratification and Classification

Performs patient stratification based on biomarker profiles with:
- Binary classification (biomarker+/-)
- Multi-class molecular subtypes
- Continuous biomarker scoring
- Correlation with clinical outcomes

Dependencies: pandas, numpy, scipy, scikit-learn (optional for clustering)
"""

import argparse
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats

# Labels emitted by classify_pd_l1_tps.
PD_L1_NEGATIVE = 'PD-L1 Negative (<1%)'
PD_L1_LOW = 'PD-L1 Low (1-49%)'
PD_L1_HIGH = 'PD-L1 High (≥50%)'

# Labels emitted by classify_her2_status.
HER2_POSITIVE = 'HER2-positive'
HER2_NEGATIVE = 'HER2-negative'
HER2_EQUIVOCAL = 'HER2-equivocal (FISH needed)'
HER2_UNKNOWN = 'HER2-unknown (IHC missing or unrecognized)'

# Label emitted by classify_breast_cancer_subtype when receptor status
# cannot be determined from the available values.
BC_SUBTYPE_UNKNOWN = 'Unknown (missing receptor status)'

# Canonical IHC scores keyed by their bare digit.
_IHC_CANONICAL = {'0': '0', '1': '1+', '2': '2+', '3': '3+'}

_BIOMARKER_TYPES = ('binary', 'categorical', 'continuous')


def classify_binary_biomarker(data, biomarker_col, threshold,
                              above_label='Biomarker+',
                              below_label='Biomarker-'):
    """
    Binary classification based on biomarker threshold.

    Parameters:
        data: DataFrame
        biomarker_col: Column name for biomarker values
        threshold: Cut-point value
        above_label: Label for values >= threshold
        below_label: Label for values < threshold

    Returns:
        Copy of ``data`` with an added 'biomarker_class' column. Rows whose
        biomarker value is missing get a missing class rather than being
        counted as below the threshold.
    """
    data = data.copy()

    def label(value):
        # A missing measurement is not evidence of a low biomarker level.
        if pd.isna(value):
            return np.nan
        return above_label if value >= threshold else below_label

    data['biomarker_class'] = pd.Series(
        [label(value) for value in data[biomarker_col]],
        index=data.index,
        dtype=object,
    )
    return data


def classify_pd_l1_tps(data, pd_l1_col='pd_l1_tps'):
    """
    Classify PD-L1 Tumor Proportion Score into clinical categories.

    Categories:
    - Negative: <1%
    - Low: 1-49% (implemented as 1 <= TPS < 50)
    - High: >=50%

    Parameters:
        data: DataFrame
        pd_l1_col: Column name holding the TPS values

    Returns:
        Copy of ``data`` with a 'pd_l1_category' column. Missing TPS values
        stay missing instead of falling through to the "High" category.
        The category distribution is printed as a side effect.
    """
    data = data.copy()

    def categorize(tps):
        if pd.isna(tps):
            return np.nan
        if tps < 1:
            return PD_L1_NEGATIVE
        if tps < 50:
            return PD_L1_LOW
        return PD_L1_HIGH

    data['pd_l1_category'] = pd.Series(
        [categorize(tps) for tps in data[pd_l1_col]],
        index=data.index,
        dtype=object,
    )

    # Distribution
    print("\nPD-L1 TPS Distribution:")
    print(data['pd_l1_category'].value_counts())

    return data


def _normalize_her2_ihc(value):
    """Return the canonical IHC score ('0', '1+', '2+', '3+') or None."""
    if pd.isna(value):
        return None
    is_number = isinstance(value, (int, float, np.integer, np.floating))
    if is_number and not isinstance(value, bool):
        # A column holding only 0-3 is parsed as numeric by read_csv.
        if not float(value).is_integer():
            return None
        key = str(int(value))
    else:
        key = str(value).strip().rstrip('+').strip()
    return _IHC_CANONICAL.get(key)


def _classify_her2(ihc, fish):
    """Return (status, is_her2_low) for one patient's IHC and FISH results."""
    score = _normalize_her2_ihc(ihc)
    fish_result = fish.strip().lower() if isinstance(fish, str) else None

    if score == '3+':
        return HER2_POSITIVE, False
    if score == '2+':
        if fish_result == 'positive':
            return HER2_POSITIVE, False
        if fish_result == 'negative':
            return HER2_NEGATIVE, True  # HER2-low
        return HER2_EQUIVOCAL, False
    if score == '1+':
        return HER2_NEGATIVE, True  # HER2-low
    if score == '0':
        return HER2_NEGATIVE, False
    # No usable IHC result: do not report the patient as HER2-negative.
    return HER2_UNKNOWN, False


def classify_her2_status(data, ihc_col='her2_ihc', fish_col='her2_fish'):
    """
    Classify HER2 status based on IHC and FISH results (ASCO/CAP guidelines).

    IHC Scores: 0, 1+, 2+, 3+
    FISH: Positive, Negative (if IHC 2+)

    Classification:
    - HER2-positive: IHC 3+ OR IHC 2+/FISH+
    - HER2-negative: IHC 0/1+ OR IHC 2+/FISH-
    - HER2-low: IHC 1+ or IHC 2+/FISH- (subset of HER2-negative)
    - HER2-equivocal: IHC 2+ without a Positive/Negative FISH result
    - HER2-unknown: IHC missing or not one of the recognized scores

    Parameters:
        data: DataFrame
        ihc_col: Column with IHC scores; '2+', '2', and numeric 2 are all
            accepted
        fish_col: Column with FISH results; may be absent from ``data``, in
            which case every FISH result is treated as missing

    Returns:
        Copy of ``data`` with 'her2_status' and 'her2_low' columns. The
        status distribution and HER2-low count are printed as a side effect.
    """
    data = data.copy()

    if fish_col in data.columns:
        fish_values = data[fish_col]
    else:
        fish_values = [None] * len(data)

    # Plain iteration instead of DataFrame.apply(axis=1): it also works on
    # an empty frame, where apply() does not yield the two result columns.
    classified = [
        _classify_her2(ihc, fish)
        for ihc, fish in zip(data[ihc_col], fish_values)
    ]
    data['her2_status'] = pd.Series(
        [status for status, _ in classified], index=data.index, dtype=object
    )
    data['her2_low'] = pd.Series(
        [is_low for _, is_low in classified], index=data.index, dtype=bool
    )

    print("\nHER2 Status Distribution:")
    print(data['her2_status'].value_counts())
    print(f"\nHER2-low (IHC 1+ or 2+/FISH-): {data['her2_low'].sum()} patients")

    return data


def _receptor_flag(value):
    """Return True/False for a receptor status value, or None if missing."""
    return None if pd.isna(value) else bool(value)


def classify_breast_cancer_subtype(data, er_col='er_positive',
                                   pr_col='pr_positive',
                                   her2_col='her2_positive'):
    """
    Classify breast cancer into molecular subtypes.

    Subtypes:
    - HR+/HER2+ (Luminal B HER2+): ER+ and/or PR+, HER2+
    - HR-/HER2+ (HER2-enriched): ER-, PR-, HER2+
    - HR+/HER2- (Luminal): ER+ and/or PR+, HER2-
    - Triple-Negative: ER-, PR-, HER2-
    - Unknown: HER2 status missing, or hormone-receptor status not
      determinable because of missing ER/PR values

    Parameters:
        data: DataFrame
        er_col, pr_col, her2_col: Columns with receptor status. Values are
            interpreted by truthiness (True/False or 1/0); text such as
            'Negative' would count as positive, so encode status as
            booleans before calling.

    Returns:
        Copy of ``data`` with a 'bc_subtype' column. The subtype
        distribution is printed as a side effect.
    """
    data = data.copy()

    def get_subtype(er_value, pr_value, her2_value):
        er = _receptor_flag(er_value)
        pr = _receptor_flag(pr_value)
        her2 = _receptor_flag(her2_value)

        if her2 is None:
            return BC_SUBTYPE_UNKNOWN
        if er or pr:
            hormone_positive = True  # one positive receptor is sufficient
        elif er is None or pr is None:
            # Neither is known positive and at least one is unknown.
            return BC_SUBTYPE_UNKNOWN
        else:
            hormone_positive = False

        if her2:
            if hormone_positive:
                return 'HR+/HER2+ (Luminal B HER2+)'
            return 'HR-/HER2+ (HER2-enriched)'
        if hormone_positive:
            return 'HR+/HER2- (Luminal)'
        return 'Triple-Negative'

    data['bc_subtype'] = pd.Series(
        [
            get_subtype(er, pr, her2)
            for er, pr, her2 in zip(data[er_col], data[pr_col], data[her2_col])
        ],
        index=data.index,
        dtype=object,
    )

    print("\nBreast Cancer Subtype Distribution:")
    print(data['bc_subtype'].value_counts())

    return data


def _print_odds_ratio(contingency):
    """Print the odds ratio and 95% CI (log method) for a 2x2 table."""
    cells = contingency.to_numpy(dtype=float)
    corrected = bool((cells == 0).any())
    if corrected:
        # Haldane-Anscombe correction keeps the OR and its standard error
        # finite when a cell is empty.
        cells = cells + 0.5

    a, b = cells[0]
    c, d = cells[1]
    or_value = (a * d) / (b * c)

    # Confidence interval for OR (log method)
    log_or = np.log(or_value)
    se_log_or = np.sqrt(1 / a + 1 / b + 1 / c + 1 / d)
    ci_lower = np.exp(log_or - 1.96 * se_log_or)
    ci_upper = np.exp(log_or + 1.96 * se_log_or)

    print(f"\nOdds Ratio: {or_value:.2f} (95% CI {ci_lower:.2f}-{ci_upper:.2f})")
    if corrected:
        print("  (0.5 added to every cell because the table contains a zero)")


def correlate_biomarker_outcome(data, biomarker_col, outcome_col,
                                biomarker_type='binary'):
    """
    Assess correlation between biomarker and clinical outcome.

    Parameters:
        data: DataFrame
        biomarker_col: Biomarker variable
        outcome_col: Outcome variable
        biomarker_type: 'binary', 'categorical', 'continuous'

    Returns:
        p-value of the chi-square test (binary/categorical) or of the
        Pearson correlation (continuous). NaN is returned when there are
        too few complete observations to run the test.

    Raises:
        ValueError: if ``biomarker_type`` is not one of the supported values.
    """
    if biomarker_type not in _BIOMARKER_TYPES:
        raise ValueError(
            f"biomarker_type must be one of {_BIOMARKER_TYPES}, "
            f"got {biomarker_type!r}"
        )

    print(f"\nCorrelation Analysis: {biomarker_col} vs {outcome_col}")
    print("=" * 60)

    # Remove missing data
    analysis_data = data[[biomarker_col, outcome_col]].dropna()

    if biomarker_type in ('binary', 'categorical'):
        # Cross-tabulation
        contingency = pd.crosstab(analysis_data[biomarker_col],
                                  analysis_data[outcome_col])
        print("\nContingency Table:")
        print(contingency)

        if contingency.size == 0:
            print("\nNo complete observations; chi-square test skipped.")
            return np.nan

        # Chi-square test
        chi2, p_value, dof, _expected = stats.chi2_contingency(contingency)
        print("\nChi-square test:")
        print(f"  χ² = {chi2:.2f}, df = {dof}, p = {p_value:.4f}")

        # Odds ratio if 2x2 table
        if contingency.shape == (2, 2):
            _print_odds_ratio(contingency)

        return p_value

    # Continuous biomarker
    if len(analysis_data) < 2:
        print("\nFewer than 2 complete observations; correlation skipped.")
        return np.nan

    # Correlation coefficient
    r, p_value = stats.pearsonr(analysis_data[biomarker_col],
                                analysis_data[outcome_col])
    print("\nPearson correlation:")
    print(f"  r = {r:.3f}, p = {p_value:.4f}")

    # Also report Spearman for robustness
    rho, p_spearman = stats.spearmanr(analysis_data[biomarker_col],
                                      analysis_data[outcome_col])
    print("Spearman correlation:")
    print(f"  ρ = {rho:.3f}, p = {p_spearman:.4f}")

    return p_value


def _compare_groups(samples):
    """Return (test_name, p_value) for a list of group samples, or None."""
    non_empty = [values for values in samples if len(values) > 0]

    if len(samples) == 2 and len(non_empty) == 2:
        _, p_value = stats.mannwhitneyu(non_empty[0], non_empty[1],
                                        alternative='two-sided')
        return 'Mann-Whitney U', p_value

    if len(samples) > 2 and len(non_empty) >= 2:
        try:
            _, p_value = stats.kruskal(*non_empty)
        except ValueError:
            # Some scipy versions raise when every value is identical.
            return None
        return 'Kruskal-Wallis', p_value

    return None


def stratify_cohort_report(data, stratification_var,
                           output_dir='stratification_report',
                           max_variables=5):
    """
    Generate comprehensive stratification report.

    Prints the group distribution and per-group medians/IQRs of the numeric
    columns, and writes 'group_distribution.csv' and (when at least one test
    was run) 'statistical_comparisons.csv' into ``output_dir``.

    Parameters:
        data: DataFrame with patient data
        stratification_var: Column name for stratification
        output_dir: Output directory for reports (created if missing)
        max_variables: Maximum number of numeric columns to summarize;
            None summarizes all of them

    Notes:
        Two groups are compared with a two-sided Mann-Whitney U test, three
        or more with a Kruskal-Wallis test. A variable is skipped when the
        groups do not contain enough non-missing values to run the test.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("\nCOHORT STRATIFICATION REPORT")
    print("=" * 60)
    print(f"Stratification Variable: {stratification_var}")
    print(f"Total Patients: {len(data)}")

    # Group distribution (rows with a missing stratification value are not
    # counted as a group, but remain in the percentage denominator).
    distribution = data[stratification_var].value_counts()
    print("\nGroup Distribution:")
    for group, count in distribution.items():
        pct = count / len(data) * 100
        print(f"  {group}: {count} ({pct:.1f}%)")

    # Save distribution
    distribution.to_csv(output_dir / 'group_distribution.csv')

    # Compare baseline characteristics across groups
    print(f"\nBaseline Characteristics by {stratification_var}:")

    results = []

    # Continuous variables
    continuous_vars = [
        var
        for var in data.select_dtypes(include=[np.number]).columns
        if var != stratification_var
    ]

    for var in continuous_vars[:max_variables]:
        print(f"\n{var}:")
        samples = [
            data.loc[data[stratification_var] == group, var].dropna()
            for group in distribution.index
        ]
        for group, group_data in zip(distribution.index, samples):
            print(
                f"  {group}: median {group_data.median():.1f} "
                f"[IQR {group_data.quantile(0.25):.1f}-"
                f"{group_data.quantile(0.75):.1f}]"
            )

        # Statistical test
        comparison = _compare_groups(samples)
        if comparison is None:
            continue
        test_name, p_value = comparison
        if test_name == 'Mann-Whitney U':
            print(f"  p-value: {p_value:.4f}")
        else:
            print(f"  p-value ({test_name}): {p_value:.4f}")

        results.append({
            'Variable': var,
            'Test': test_name,
            'p_value': p_value,
            'Significant': 'Yes' if p_value < 0.05 else 'No',
        })

    # Save results
    if results:
        df_results = pd.DataFrame(results)
        df_results.to_csv(output_dir / 'statistical_comparisons.csv',
                          index=False)
        print(f"\nStatistical comparison results saved to: "
              f"{output_dir}/statistical_comparisons.csv")

    print(f"\nStratification report complete! Files saved to {output_dir}/")


def _generate_example_data(n=80, seed=42):
    """Build the synthetic demo cohort used by ``--example``."""
    np.random.seed(seed)
    data = pd.DataFrame({
        'patient_id': [f'PT{i:03d}' for i in range(1, n + 1)],
        'age': np.random.normal(62, 10, n),
        'sex': np.random.choice(['Male', 'Female'], n),
        'pd_l1_tps': np.random.exponential(20, n),  # Exponential distribution for PD-L1
        'tmb': np.random.exponential(8, n),  # Mutations per Mb
        'her2_ihc': np.random.choice(['0', '1+', '2+', '3+'], n,
                                     p=[0.6, 0.2, 0.15, 0.05]),
        'response': np.random.choice(['Yes', 'No'], n, p=[0.4, 0.6]),
    })

    # Simulate correlation: higher PD-L1 -> better response
    high_pd_l1 = data['pd_l1_tps'] >= 50
    data.loc[high_pd_l1, 'response'] = np.random.choice(
        ['Yes', 'No'], high_pd_l1.sum(), p=[0.65, 0.35]
    )
    return data


def main():
    """Command-line entry point: classify biomarkers and write the report."""
    parser = argparse.ArgumentParser(
        description='Biomarker-based patient classification')
    parser.add_argument('input_file', type=str, nargs='?', default=None,
                        help='CSV file with patient and biomarker data')
    parser.add_argument('-b', '--biomarker', type=str, default=None,
                        help='Biomarker column name for stratification')
    parser.add_argument('-t', '--threshold', type=float, default=None,
                        help='Threshold for binary classification')
    parser.add_argument('-o', '--output-dir', type=str,
                        default='stratification',
                        help='Output directory')
    parser.add_argument('--example', action='store_true',
                        help='Run with example data')

    args = parser.parse_args()

    # Example data if requested
    if args.example or args.input_file is None:
        print("Generating example dataset...")
        data = _generate_example_data()
    else:
        print(f"Loading data from {args.input_file}...")
        data = pd.read_csv(args.input_file)

    print(f"Dataset: {len(data)} patients")
    print(f"Columns: {list(data.columns)}")

    # Fail with a usage error rather than a KeyError traceback.
    if args.biomarker and args.biomarker not in data.columns:
        parser.error(
            f"biomarker column '{args.biomarker}' not found in input data")

    # PD-L1 classification example
    if 'pd_l1_tps' in data.columns:
        data = classify_pd_l1_tps(data, 'pd_l1_tps')

        # Correlate with response if available
        if 'response' in data.columns:
            correlate_biomarker_outcome(data, 'pd_l1_category', 'response',
                                        biomarker_type='categorical')

    # HER2 classification if columns present
    if 'her2_ihc' in data.columns:
        if 'her2_fish' not in data.columns:
            # Add placeholder FISH for IHC 2+
            data['her2_fish'] = np.nan

        data = classify_her2_status(data, 'her2_ihc', 'her2_fish')

    # Pick the stratification variable. With a threshold, stratify by the
    # derived binary class: the raw continuous column would produce one
    # "group" per distinct value.
    stratification_var = None
    if args.biomarker and args.threshold is not None:
        print(f"\nBinary classification: {args.biomarker} "
              f"with threshold {args.threshold}")
        data = classify_binary_biomarker(data, args.biomarker, args.threshold)
        print(data['biomarker_class'].value_counts())
        stratification_var = 'biomarker_class'
    elif args.biomarker:
        stratification_var = args.biomarker
    elif 'pd_l1_category' in data.columns:
        stratification_var = 'pd_l1_category'

    # Generate stratification report
    if stratification_var is not None:
        stratify_cohort_report(data, stratification_var,
                               output_dir=args.output_dir)

    # Save classified data (the directory may not exist yet when no report
    # was generated).
    output_root = Path(args.output_dir)
    output_root.mkdir(parents=True, exist_ok=True)
    output_path = output_root / 'classified_data.csv'
    data.to_csv(output_path, index=False)
    print(f"\nClassified data saved to: {output_path}")


if __name__ == '__main__':
    main()


# Example usage:
# python biomarker_classifier.py data.csv -b pd_l1_tps -t 50 -o classification/
# python biomarker_classifier.py --example
#
# Input CSV format:
# patient_id,pd_l1_tps,tmb,her2_ihc,response,pfs_months,event
# PT001,55.5,12.3,1+,Yes,14.2,1
# PT002,8.2,5.1,0,No,6.5,1
# ...