385 lines
13 KiB
Python
Executable File
385 lines
13 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
Biomarker-Based Patient Stratification and Classification
|
||
|
||
Performs patient stratification based on biomarker profiles with:
|
||
- Binary classification (biomarker+/-)
|
||
- Multi-class molecular subtypes
|
||
- Continuous biomarker scoring
|
||
- Correlation with clinical outcomes
|
||
|
||
Dependencies: pandas, numpy, scipy, scikit-learn (optional for clustering)
|
||
"""
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
from scipy import stats
|
||
import argparse
|
||
from pathlib import Path
|
||
|
||
|
||
def classify_binary_biomarker(data, biomarker_col, threshold,
|
||
above_label='Biomarker+', below_label='Biomarker-'):
|
||
"""
|
||
Binary classification based on biomarker threshold.
|
||
|
||
Parameters:
|
||
data: DataFrame
|
||
biomarker_col: Column name for biomarker values
|
||
threshold: Cut-point value
|
||
above_label: Label for values >= threshold
|
||
below_label: Label for values < threshold
|
||
|
||
Returns:
|
||
DataFrame with added 'biomarker_class' column
|
||
"""
|
||
|
||
data = data.copy()
|
||
data['biomarker_class'] = data[biomarker_col].apply(
|
||
lambda x: above_label if x >= threshold else below_label
|
||
)
|
||
|
||
return data
|
||
|
||
|
||
def classify_pd_l1_tps(data, pd_l1_col='pd_l1_tps'):
|
||
"""
|
||
Classify PD-L1 Tumor Proportion Score into clinical categories.
|
||
|
||
Categories:
|
||
- Negative: <1%
|
||
- Low: 1-49%
|
||
- High: >=50%
|
||
|
||
Returns:
|
||
DataFrame with 'pd_l1_category' column
|
||
"""
|
||
|
||
data = data.copy()
|
||
|
||
def categorize(tps):
|
||
if tps < 1:
|
||
return 'PD-L1 Negative (<1%)'
|
||
elif tps < 50:
|
||
return 'PD-L1 Low (1-49%)'
|
||
else:
|
||
return 'PD-L1 High (≥50%)'
|
||
|
||
data['pd_l1_category'] = data[pd_l1_col].apply(categorize)
|
||
|
||
# Distribution
|
||
print("\nPD-L1 TPS Distribution:")
|
||
print(data['pd_l1_category'].value_counts())
|
||
|
||
return data
|
||
|
||
|
||
def classify_her2_status(data, ihc_col='her2_ihc', fish_col='her2_fish'):
|
||
"""
|
||
Classify HER2 status based on IHC and FISH results (ASCO/CAP guidelines).
|
||
|
||
IHC Scores: 0, 1+, 2+, 3+
|
||
FISH: Positive, Negative (if IHC 2+)
|
||
|
||
Classification:
|
||
- HER2-positive: IHC 3+ OR IHC 2+/FISH+
|
||
- HER2-negative: IHC 0/1+ OR IHC 2+/FISH-
|
||
- HER2-low: IHC 1+ or IHC 2+/FISH- (subset of HER2-negative)
|
||
|
||
Returns:
|
||
DataFrame with 'her2_status' and 'her2_low' columns
|
||
"""
|
||
|
||
data = data.copy()
|
||
|
||
def classify_her2(row):
|
||
ihc = row[ihc_col]
|
||
fish = row.get(fish_col, None)
|
||
|
||
if ihc == '3+':
|
||
status = 'HER2-positive'
|
||
her2_low = False
|
||
elif ihc == '2+':
|
||
if fish == 'Positive':
|
||
status = 'HER2-positive'
|
||
her2_low = False
|
||
elif fish == 'Negative':
|
||
status = 'HER2-negative'
|
||
her2_low = True # HER2-low
|
||
else:
|
||
status = 'HER2-equivocal (FISH needed)'
|
||
her2_low = False
|
||
elif ihc == '1+':
|
||
status = 'HER2-negative'
|
||
her2_low = True # HER2-low
|
||
else: # IHC 0
|
||
status = 'HER2-negative'
|
||
her2_low = False
|
||
|
||
return pd.Series({'her2_status': status, 'her2_low': her2_low})
|
||
|
||
data[['her2_status', 'her2_low']] = data.apply(classify_her2, axis=1)
|
||
|
||
print("\nHER2 Status Distribution:")
|
||
print(data['her2_status'].value_counts())
|
||
print(f"\nHER2-low (IHC 1+ or 2+/FISH-): {data['her2_low'].sum()} patients")
|
||
|
||
return data
|
||
|
||
|
||
def classify_breast_cancer_subtype(data, er_col='er_positive', pr_col='pr_positive',
|
||
her2_col='her2_positive'):
|
||
"""
|
||
Classify breast cancer into molecular subtypes.
|
||
|
||
Subtypes:
|
||
- HR+/HER2-: Luminal (ER+ and/or PR+, HER2-)
|
||
- HER2+: Any HER2-positive (regardless of HR status)
|
||
- Triple-negative: ER-, PR-, HER2-
|
||
|
||
Returns:
|
||
DataFrame with 'bc_subtype' column
|
||
"""
|
||
|
||
data = data.copy()
|
||
|
||
def get_subtype(row):
|
||
er = row[er_col]
|
||
pr = row[pr_col]
|
||
her2 = row[her2_col]
|
||
|
||
if her2:
|
||
if er or pr:
|
||
return 'HR+/HER2+ (Luminal B HER2+)'
|
||
else:
|
||
return 'HR-/HER2+ (HER2-enriched)'
|
||
elif er or pr:
|
||
return 'HR+/HER2- (Luminal)'
|
||
else:
|
||
return 'Triple-Negative'
|
||
|
||
data['bc_subtype'] = data.apply(get_subtype, axis=1)
|
||
|
||
print("\nBreast Cancer Subtype Distribution:")
|
||
print(data['bc_subtype'].value_counts())
|
||
|
||
return data
|
||
|
||
|
||
def correlate_biomarker_outcome(data, biomarker_col, outcome_col, biomarker_type='binary'):
|
||
"""
|
||
Assess correlation between biomarker and clinical outcome.
|
||
|
||
Parameters:
|
||
biomarker_col: Biomarker variable
|
||
outcome_col: Outcome variable
|
||
biomarker_type: 'binary', 'categorical', 'continuous'
|
||
|
||
Returns:
|
||
Statistical test results
|
||
"""
|
||
|
||
print(f"\nCorrelation Analysis: {biomarker_col} vs {outcome_col}")
|
||
print("="*60)
|
||
|
||
# Remove missing data
|
||
analysis_data = data[[biomarker_col, outcome_col]].dropna()
|
||
|
||
if biomarker_type == 'binary' or biomarker_type == 'categorical':
|
||
# Cross-tabulation
|
||
contingency = pd.crosstab(analysis_data[biomarker_col], analysis_data[outcome_col])
|
||
print("\nContingency Table:")
|
||
print(contingency)
|
||
|
||
# Chi-square test
|
||
chi2, p_value, dof, expected = stats.chi2_contingency(contingency)
|
||
|
||
print(f"\nChi-square test:")
|
||
print(f" χ² = {chi2:.2f}, df = {dof}, p = {p_value:.4f}")
|
||
|
||
# Odds ratio if 2x2 table
|
||
if contingency.shape == (2, 2):
|
||
a, b = contingency.iloc[0, :]
|
||
c, d = contingency.iloc[1, :]
|
||
or_value = (a * d) / (b * c) if b * c > 0 else np.inf
|
||
|
||
# Confidence interval for OR (log method)
|
||
log_or = np.log(or_value)
|
||
se_log_or = np.sqrt(1/a + 1/b + 1/c + 1/d)
|
||
ci_lower = np.exp(log_or - 1.96 * se_log_or)
|
||
ci_upper = np.exp(log_or + 1.96 * se_log_or)
|
||
|
||
print(f"\nOdds Ratio: {or_value:.2f} (95% CI {ci_lower:.2f}-{ci_upper:.2f})")
|
||
|
||
elif biomarker_type == 'continuous':
|
||
# Correlation coefficient
|
||
r, p_value = stats.pearsonr(analysis_data[biomarker_col], analysis_data[outcome_col])
|
||
|
||
print(f"\nPearson correlation:")
|
||
print(f" r = {r:.3f}, p = {p_value:.4f}")
|
||
|
||
# Also report Spearman for robustness
|
||
rho, p_spearman = stats.spearmanr(analysis_data[biomarker_col], analysis_data[outcome_col])
|
||
print(f"Spearman correlation:")
|
||
print(f" ρ = {rho:.3f}, p = {p_spearman:.4f}")
|
||
|
||
return p_value
|
||
|
||
|
||
def stratify_cohort_report(data, stratification_var, output_dir='stratification_report'):
|
||
"""
|
||
Generate comprehensive stratification report.
|
||
|
||
Parameters:
|
||
data: DataFrame with patient data
|
||
stratification_var: Column name for stratification
|
||
output_dir: Output directory for reports
|
||
"""
|
||
|
||
output_dir = Path(output_dir)
|
||
output_dir.mkdir(parents=True, exist_ok=True)
|
||
|
||
print(f"\nCOHORT STRATIFICATION REPORT")
|
||
print("="*60)
|
||
print(f"Stratification Variable: {stratification_var}")
|
||
print(f"Total Patients: {len(data)}")
|
||
|
||
# Group distribution
|
||
distribution = data[stratification_var].value_counts()
|
||
print(f"\nGroup Distribution:")
|
||
for group, count in distribution.items():
|
||
pct = count / len(data) * 100
|
||
print(f" {group}: {count} ({pct:.1f}%)")
|
||
|
||
# Save distribution
|
||
distribution.to_csv(output_dir / 'group_distribution.csv')
|
||
|
||
# Compare baseline characteristics across groups
|
||
print(f"\nBaseline Characteristics by {stratification_var}:")
|
||
|
||
results = []
|
||
|
||
# Continuous variables
|
||
continuous_vars = data.select_dtypes(include=[np.number]).columns.tolist()
|
||
continuous_vars = [v for v in continuous_vars if v != stratification_var]
|
||
|
||
for var in continuous_vars[:5]: # Limit to first 5 for demo
|
||
print(f"\n{var}:")
|
||
for group in distribution.index:
|
||
group_data = data[data[stratification_var] == group][var].dropna()
|
||
print(f" {group}: median {group_data.median():.1f} [IQR {group_data.quantile(0.25):.1f}-{group_data.quantile(0.75):.1f}]")
|
||
|
||
# Statistical test
|
||
if len(distribution) == 2:
|
||
groups_list = distribution.index.tolist()
|
||
g1 = data[data[stratification_var] == groups_list[0]][var].dropna()
|
||
g2 = data[data[stratification_var] == groups_list[1]][var].dropna()
|
||
_, p_value = stats.mannwhitneyu(g1, g2, alternative='two-sided')
|
||
print(f" p-value: {p_value:.4f}")
|
||
|
||
results.append({
|
||
'Variable': var,
|
||
'Test': 'Mann-Whitney U',
|
||
'p_value': p_value,
|
||
'Significant': 'Yes' if p_value < 0.05 else 'No'
|
||
})
|
||
|
||
# Save results
|
||
if results:
|
||
df_results = pd.DataFrame(results)
|
||
df_results.to_csv(output_dir / 'statistical_comparisons.csv', index=False)
|
||
print(f"\nStatistical comparison results saved to: {output_dir}/statistical_comparisons.csv")
|
||
|
||
print(f"\nStratification report complete! Files saved to {output_dir}/")
|
||
|
||
|
||
def main():
|
||
parser = argparse.ArgumentParser(description='Biomarker-based patient classification')
|
||
parser.add_argument('input_file', type=str, nargs='?', default=None,
|
||
help='CSV file with patient and biomarker data')
|
||
parser.add_argument('-b', '--biomarker', type=str, default=None,
|
||
help='Biomarker column name for stratification')
|
||
parser.add_argument('-t', '--threshold', type=float, default=None,
|
||
help='Threshold for binary classification')
|
||
parser.add_argument('-o', '--output-dir', type=str, default='stratification',
|
||
help='Output directory')
|
||
parser.add_argument('--example', action='store_true',
|
||
help='Run with example data')
|
||
|
||
args = parser.parse_args()
|
||
|
||
# Example data if requested
|
||
if args.example or args.input_file is None:
|
||
print("Generating example dataset...")
|
||
np.random.seed(42)
|
||
n = 80
|
||
|
||
data = pd.DataFrame({
|
||
'patient_id': [f'PT{i:03d}' for i in range(1, n+1)],
|
||
'age': np.random.normal(62, 10, n),
|
||
'sex': np.random.choice(['Male', 'Female'], n),
|
||
'pd_l1_tps': np.random.exponential(20, n), # Exponential distribution for PD-L1
|
||
'tmb': np.random.exponential(8, n), # Mutations per Mb
|
||
'her2_ihc': np.random.choice(['0', '1+', '2+', '3+'], n, p=[0.6, 0.2, 0.15, 0.05]),
|
||
'response': np.random.choice(['Yes', 'No'], n, p=[0.4, 0.6]),
|
||
})
|
||
|
||
# Simulate correlation: higher PD-L1 -> better response
|
||
data.loc[data['pd_l1_tps'] >= 50, 'response'] = np.random.choice(['Yes', 'No'],
|
||
(data['pd_l1_tps'] >= 50).sum(),
|
||
p=[0.65, 0.35])
|
||
else:
|
||
print(f"Loading data from {args.input_file}...")
|
||
data = pd.read_csv(args.input_file)
|
||
|
||
print(f"Dataset: {len(data)} patients")
|
||
print(f"Columns: {list(data.columns)}")
|
||
|
||
# PD-L1 classification example
|
||
if 'pd_l1_tps' in data.columns or args.biomarker == 'pd_l1_tps':
|
||
data = classify_pd_l1_tps(data, 'pd_l1_tps')
|
||
|
||
# Correlate with response if available
|
||
if 'response' in data.columns:
|
||
correlate_biomarker_outcome(data, 'pd_l1_category', 'response', biomarker_type='categorical')
|
||
|
||
# HER2 classification if columns present
|
||
if 'her2_ihc' in data.columns:
|
||
if 'her2_fish' not in data.columns:
|
||
# Add placeholder FISH for IHC 2+
|
||
data['her2_fish'] = np.nan
|
||
data = classify_her2_status(data, 'her2_ihc', 'her2_fish')
|
||
|
||
# Generic binary classification if threshold provided
|
||
if args.biomarker and args.threshold is not None:
|
||
print(f"\nBinary classification: {args.biomarker} with threshold {args.threshold}")
|
||
data = classify_binary_biomarker(data, args.biomarker, args.threshold)
|
||
print(data['biomarker_class'].value_counts())
|
||
|
||
# Generate stratification report
|
||
if args.biomarker:
|
||
stratify_cohort_report(data, args.biomarker, output_dir=args.output_dir)
|
||
elif 'pd_l1_category' in data.columns:
|
||
stratify_cohort_report(data, 'pd_l1_category', output_dir=args.output_dir)
|
||
|
||
# Save classified data
|
||
output_path = Path(args.output_dir) / 'classified_data.csv'
|
||
data.to_csv(output_path, index=False)
|
||
print(f"\nClassified data saved to: {output_path}")
|
||
|
||
|
||
if __name__ == '__main__':
|
||
main()
|
||
|
||
|
||
# Example usage:
|
||
# python biomarker_classifier.py data.csv -b pd_l1_tps -t 50 -o classification/
|
||
# python biomarker_classifier.py --example
|
||
#
|
||
# Input CSV format:
|
||
# patient_id,pd_l1_tps,tmb,her2_ihc,response,pfs_months,event
|
||
# PT001,55.5,12.3,1+,Yes,14.2,1
|
||
# PT002,8.2,5.1,0,No,6.5,1
|
||
# ...
|
||
|