Files
gh-k-dense-ai-claude-scient…/skills/clinical-decision-support/scripts/biomarker_classifier.py
2025-11-30 08:30:18 +08:00

385 lines
13 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Biomarker-Based Patient Stratification and Classification
Performs patient stratification based on biomarker profiles with:
- Binary classification (biomarker+/-)
- Multi-class molecular subtypes
- Continuous biomarker scoring
- Correlation with clinical outcomes
Dependencies: pandas, numpy, scipy, scikit-learn (optional for clustering)
"""
import pandas as pd
import numpy as np
from scipy import stats
import argparse
from pathlib import Path
def classify_binary_biomarker(data, biomarker_col, threshold,
                              above_label='Biomarker+', below_label='Biomarker-'):
    """
    Binary classification based on biomarker threshold.

    Parameters:
        data: DataFrame
        biomarker_col: Column name for biomarker values
        threshold: Cut-point value
        above_label: Label for values >= threshold
        below_label: Label for values < threshold

    Returns:
        Copy of the DataFrame with added 'biomarker_class' column. Rows whose
        biomarker value is missing (NaN/None) keep a missing class instead of
        being assigned to either group.
    """
    data = data.copy()

    def label_for(value):
        # NaN compares False against any threshold, so without this guard a
        # missing measurement would silently be reported as below_label.
        if pd.isna(value):
            return np.nan
        return above_label if value >= threshold else below_label

    data['biomarker_class'] = data[biomarker_col].apply(label_for)
    return data
def classify_pd_l1_tps(data, pd_l1_col='pd_l1_tps'):
    """
    Classify PD-L1 Tumor Proportion Score into clinical categories.

    Categories:
        - Negative: <1%
        - Low: 1-49%
        - High: >=50%

    Parameters:
        data: DataFrame
        pd_l1_col: Column name holding the TPS values (percent scale)

    Returns:
        Copy of the DataFrame with 'pd_l1_category' column. Rows with a
        missing TPS are left unclassified (missing) rather than being
        assigned a category.
    """
    data = data.copy()

    def categorize(tps):
        # A NaN score fails both '<' comparisons below and would otherwise
        # fall through to the "High" branch.
        if pd.isna(tps):
            return np.nan
        if tps < 1:
            return 'PD-L1 Negative (<1%)'
        if tps < 50:
            return 'PD-L1 Low (1-49%)'
        return 'PD-L1 High (≥50%)'

    data['pd_l1_category'] = data[pd_l1_col].apply(categorize)

    # Distribution (value_counts omits missing values, so report them separately)
    print("\nPD-L1 TPS Distribution:")
    print(data['pd_l1_category'].value_counts())
    n_missing = int(data['pd_l1_category'].isna().sum())
    if n_missing:
        print(f"Unclassified (missing PD-L1 TPS): {n_missing}")
    return data
def classify_her2_status(data, ihc_col='her2_ihc', fish_col='her2_fish'):
    """
    Classify HER2 status based on IHC and FISH results (ASCO/CAP guidelines).

    IHC Scores: 0, 1+, 2+, 3+
    FISH: Positive, Negative (consulted only when IHC is 2+)

    Classification:
        - HER2-positive: IHC 3+ OR IHC 2+/FISH+
        - HER2-negative: IHC 0/1+ OR IHC 2+/FISH-
        - HER2-low: IHC 1+ or IHC 2+/FISH- (subset of HER2-negative)
        - HER2-equivocal (FISH needed): IHC 2+ without a usable FISH result
        - HER2-unknown: IHC missing or not one of the recognized scores

    Parameters:
        data: DataFrame
        ihc_col: Column name with IHC scores
        fish_col: Column name with FISH results; the column may be absent

    Returns:
        Copy of the DataFrame with 'her2_status' and 'her2_low' columns
    """
    data = data.copy()
    unknown_status = 'HER2-unknown (IHC missing or unrecognized)'

    def classify_her2(ihc, fish):
        # Missing IHC must not be reported as HER2-negative.
        if pd.isna(ihc):
            return unknown_status, False
        ihc_text = str(ihc).strip()
        # A column holding only zeros and blanks is parsed as floats.
        if ihc_text == '0.0':
            ihc_text = '0'

        if ihc_text == '3+':
            return 'HER2-positive', False
        if ihc_text == '2+':
            fish_text = '' if pd.isna(fish) else str(fish).strip().casefold()
            if fish_text == 'positive':
                return 'HER2-positive', False
            if fish_text == 'negative':
                return 'HER2-negative', True  # HER2-low
            return 'HER2-equivocal (FISH needed)', False
        if ihc_text == '1+':
            return 'HER2-negative', True  # HER2-low
        if ihc_text == '0':
            return 'HER2-negative', False
        # Anything else (typos, other encodings) is flagged, not assumed negative.
        return unknown_status, False

    ihc_values = data[ihc_col]
    if fish_col in data.columns:
        fish_values = data[fish_col]
    else:
        fish_values = [None] * len(data)

    # Plain iteration (rather than DataFrame.apply) also works on empty frames.
    results = [classify_her2(ihc, fish) for ihc, fish in zip(ihc_values, fish_values)]
    data['her2_status'] = pd.Series(
        [status for status, _ in results], index=data.index, dtype=object
    )
    data['her2_low'] = pd.Series(
        [is_low for _, is_low in results], index=data.index, dtype=bool
    )

    print("\nHER2 Status Distribution:")
    print(data['her2_status'].value_counts())
    print(f"\nHER2-low (IHC 1+ or 2+/FISH-): {data['her2_low'].sum()} patients")
    return data
def classify_breast_cancer_subtype(data, er_col='er_positive', pr_col='pr_positive',
                                   her2_col='her2_positive'):
    """
    Classify breast cancer into molecular subtypes.

    Subtypes (labels written to the output column):
        - 'HR+/HER2+ (Luminal B HER2+)': ER+ and/or PR+, HER2+
        - 'HR-/HER2+ (HER2-enriched)': ER-, PR-, HER2+
        - 'HR+/HER2- (Luminal)': ER+ and/or PR+, HER2-
        - 'Triple-Negative': ER-, PR-, HER2-
        - 'Unknown (missing receptor status)': HER2 is missing, or hormone
          receptor status cannot be determined (neither ER nor PR is positive
          and at least one of them is missing)

    Parameters:
        data: DataFrame
        er_col, pr_col, her2_col: Columns with truthy/falsy receptor status

    Returns:
        Copy of the DataFrame with 'bc_subtype' column
    """
    data = data.copy()
    unknown_label = 'Unknown (missing receptor status)'

    def get_subtype(er, pr, her2):
        er_missing = pd.isna(er)
        pr_missing = pd.isna(pr)
        # bool(NaN) is True, so missing values must be screened out before
        # any truthiness test or they would count as receptor-positive.
        if pd.isna(her2):
            return unknown_label
        hr_positive = (not er_missing and bool(er)) or (not pr_missing and bool(pr))
        if not hr_positive and (er_missing or pr_missing):
            return unknown_label

        if her2:
            if hr_positive:
                return 'HR+/HER2+ (Luminal B HER2+)'
            return 'HR-/HER2+ (HER2-enriched)'
        if hr_positive:
            return 'HR+/HER2- (Luminal)'
        return 'Triple-Negative'

    data['bc_subtype'] = pd.Series(
        [
            get_subtype(er, pr, her2)
            for er, pr, her2 in zip(data[er_col], data[pr_col], data[her2_col])
        ],
        index=data.index,
        dtype=object,
    )

    print("\nBreast Cancer Subtype Distribution:")
    print(data['bc_subtype'].value_counts())
    return data
def correlate_biomarker_outcome(data, biomarker_col, outcome_col, biomarker_type='binary'):
    """
    Assess correlation between biomarker and clinical outcome.

    Parameters:
        data: DataFrame
        biomarker_col: Biomarker variable
        outcome_col: Outcome variable
        biomarker_type: 'binary', 'categorical', 'continuous'

    Returns:
        p-value of the chi-square test (binary/categorical) or of the Pearson
        correlation (continuous). NaN if no rows remain after dropping
        missing values.

    Raises:
        ValueError: if biomarker_type is not one of the supported values
    """
    valid_types = ('binary', 'categorical', 'continuous')
    if biomarker_type not in valid_types:
        # Fail with a clear message instead of an UnboundLocalError at return.
        raise ValueError(
            f"biomarker_type must be one of {valid_types}, got {biomarker_type!r}"
        )

    print(f"\nCorrelation Analysis: {biomarker_col} vs {outcome_col}")
    print("="*60)

    # Remove missing data
    analysis_data = data[[biomarker_col, outcome_col]].dropna()
    if analysis_data.empty:
        print("\nNo complete observations available; analysis skipped.")
        return np.nan

    if biomarker_type in ('binary', 'categorical'):
        # Cross-tabulation
        contingency = pd.crosstab(analysis_data[biomarker_col], analysis_data[outcome_col])
        print("\nContingency Table:")
        print(contingency)

        # Chi-square test
        chi2, p_value, dof, _expected = stats.chi2_contingency(contingency)
        print(f"\nChi-square test:")
        print(f" χ² = {chi2:.2f}, df = {dof}, p = {p_value:.4f}")

        # Odds ratio if 2x2 table (orientation follows the table's row/column order)
        if contingency.shape == (2, 2):
            cells = contingency.to_numpy(dtype=float)
            if (cells == 0).any():
                # Haldane-Anscombe correction: a zero cell would otherwise give
                # a division by zero or log(0) in the OR and its interval.
                cells = cells + 0.5
                print("\nNote: zero cell present; 0.5 added to every cell for the odds ratio.")
            (a, b), (c, d) = cells
            or_value = (a * d) / (b * c)

            # Confidence interval for OR (log method)
            log_or = np.log(or_value)
            se_log_or = np.sqrt(1/a + 1/b + 1/c + 1/d)
            ci_lower = np.exp(log_or - 1.96 * se_log_or)
            ci_upper = np.exp(log_or + 1.96 * se_log_or)
            print(f"\nOdds Ratio: {or_value:.2f} (95% CI {ci_lower:.2f}-{ci_upper:.2f})")
    else:
        # Correlation coefficient
        r, p_value = stats.pearsonr(analysis_data[biomarker_col], analysis_data[outcome_col])
        print(f"\nPearson correlation:")
        print(f" r = {r:.3f}, p = {p_value:.4f}")

        # Also report Spearman for robustness
        rho, p_spearman = stats.spearmanr(analysis_data[biomarker_col], analysis_data[outcome_col])
        print(f"Spearman correlation:")
        print(f" ρ = {rho:.3f}, p = {p_spearman:.4f}")

    return p_value
def stratify_cohort_report(data, stratification_var, output_dir='stratification_report'):
    """
    Generate comprehensive stratification report.

    Prints the group distribution and per-group summaries of up to five
    numeric columns, and writes 'group_distribution.csv' (always) and
    'statistical_comparisons.csv' (only when at least one two-group
    comparison was run) into the output directory.

    Parameters:
        data: DataFrame with patient data
        stratification_var: Column name for stratification
        output_dir: Output directory for reports (created if needed)
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nCOHORT STRATIFICATION REPORT")
    print("="*60)
    print(f"Stratification Variable: {stratification_var}")
    print(f"Total Patients: {len(data)}")

    # Group distribution (value_counts leaves out rows with a missing group)
    distribution = data[stratification_var].value_counts()
    print(f"\nGroup Distribution:")
    for group, count in distribution.items():
        pct = count / len(data) * 100
        print(f" {group}: {count} ({pct:.1f}%)")
    n_unassigned = int(data[stratification_var].isna().sum())
    if n_unassigned:
        print(f" (not assigned to any group: {n_unassigned})")

    # Save distribution
    distribution.to_csv(output_dir / 'group_distribution.csv')

    # Compare baseline characteristics across groups
    print(f"\nBaseline Characteristics by {stratification_var}:")
    results = []

    # Row subsets are computed once and reused for every variable.
    group_frames = {
        group: data[data[stratification_var] == group]
        for group in distribution.index
    }

    # Continuous variables
    continuous_vars = [
        col for col in data.select_dtypes(include=[np.number]).columns
        if col != stratification_var
    ]
    for var in continuous_vars[:5]:  # Limit to first 5 for demo
        print(f"\n{var}:")
        samples = {group: frame[var].dropna() for group, frame in group_frames.items()}
        for group, group_data in samples.items():
            print(f" {group}: median {group_data.median():.1f} [IQR {group_data.quantile(0.25):.1f}-{group_data.quantile(0.75):.1f}]")

        # Statistical test (only defined here for exactly two groups)
        if len(samples) != 2:
            continue
        g1, g2 = samples.values()
        if g1.empty or g2.empty:
            # Mann-Whitney U needs observations in both groups; calling it
            # anyway would fail or give a meaningless result.
            print(" p-value: not computed (a group has no non-missing values)")
            continue
        _, p_value = stats.mannwhitneyu(g1, g2, alternative='two-sided')
        print(f" p-value: {p_value:.4f}")
        results.append({
            'Variable': var,
            'Test': 'Mann-Whitney U',
            'p_value': p_value,
            'Significant': 'Yes' if p_value < 0.05 else 'No'
        })

    # Save results
    if results:
        df_results = pd.DataFrame(results)
        df_results.to_csv(output_dir / 'statistical_comparisons.csv', index=False)
        print(f"\nStatistical comparison results saved to: {output_dir}/statistical_comparisons.csv")

    print(f"\nStratification report complete! Files saved to {output_dir}/")
def main():
    """
    Command-line entry point.

    Loads a CSV (or generates a seeded example cohort when --example is given
    or no input file is supplied), runs the PD-L1 and HER2 classifiers when
    their columns are present, optionally applies a threshold-based binary
    classification, writes a stratification report, and saves the classified
    data as 'classified_data.csv' in the output directory.
    """
    parser = argparse.ArgumentParser(description='Biomarker-based patient classification')
    parser.add_argument('input_file', type=str, nargs='?', default=None,
                        help='CSV file with patient and biomarker data')
    parser.add_argument('-b', '--biomarker', type=str, default=None,
                        help='Biomarker column name for stratification')
    parser.add_argument('-t', '--threshold', type=float, default=None,
                        help='Threshold for binary classification')
    parser.add_argument('-o', '--output-dir', type=str, default='stratification',
                        help='Output directory')
    parser.add_argument('--example', action='store_true',
                        help='Run with example data')
    args = parser.parse_args()

    # Example data if requested
    if args.example or args.input_file is None:
        print("Generating example dataset...")
        np.random.seed(42)
        n = 80
        data = pd.DataFrame({
            'patient_id': [f'PT{i:03d}' for i in range(1, n+1)],
            'age': np.random.normal(62, 10, n),
            'sex': np.random.choice(['Male', 'Female'], n),
            'pd_l1_tps': np.random.exponential(20, n),  # Exponential distribution for PD-L1
            'tmb': np.random.exponential(8, n),  # Mutations per Mb
            'her2_ihc': np.random.choice(['0', '1+', '2+', '3+'], n, p=[0.6, 0.2, 0.15, 0.05]),
            'response': np.random.choice(['Yes', 'No'], n, p=[0.4, 0.6]),
        })
        # Simulate correlation: higher PD-L1 -> better response
        high_pd_l1 = data['pd_l1_tps'] >= 50
        data.loc[high_pd_l1, 'response'] = np.random.choice(['Yes', 'No'],
                                                            high_pd_l1.sum(),
                                                            p=[0.65, 0.35])
    else:
        print(f"Loading data from {args.input_file}...")
        data = pd.read_csv(args.input_file)

    print(f"Dataset: {len(data)} patients")
    print(f"Columns: {list(data.columns)}")

    # Reject an unknown column up front instead of failing later with a KeyError.
    if args.biomarker and args.biomarker not in data.columns:
        parser.error(f"biomarker column {args.biomarker!r} not found in the data")

    # PD-L1 classification example
    if 'pd_l1_tps' in data.columns:
        data = classify_pd_l1_tps(data, 'pd_l1_tps')
        # Correlate with response if available
        if 'response' in data.columns:
            correlate_biomarker_outcome(data, 'pd_l1_category', 'response', biomarker_type='categorical')

    # HER2 classification if columns present
    if 'her2_ihc' in data.columns:
        if 'her2_fish' not in data.columns:
            # Add placeholder FISH for IHC 2+
            data['her2_fish'] = np.nan
        data = classify_her2_status(data, 'her2_ihc', 'her2_fish')

    # Pick the stratification variable. When a threshold is supplied, stratify
    # by the derived two-level class; stratifying by the raw continuous column
    # would put nearly every patient in a group of their own.
    stratification_var = None
    if args.biomarker and args.threshold is not None:
        print(f"\nBinary classification: {args.biomarker} with threshold {args.threshold}")
        data = classify_binary_biomarker(data, args.biomarker, args.threshold)
        print(data['biomarker_class'].value_counts())
        stratification_var = 'biomarker_class'
    elif args.biomarker:
        stratification_var = args.biomarker
    elif 'pd_l1_category' in data.columns:
        stratification_var = 'pd_l1_category'

    # Generate stratification report
    if stratification_var is not None:
        stratify_cohort_report(data, stratification_var, output_dir=args.output_dir)

    # Save classified data. The directory is created here because the report
    # step, which would otherwise create it, may not have run.
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / 'classified_data.csv'
    data.to_csv(output_path, index=False)
    print(f"\nClassified data saved to: {output_path}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# Example usage:
# python biomarker_classifier.py data.csv -b pd_l1_tps -t 50 -o classification/
# python biomarker_classifier.py --example
#
# Input CSV format:
# patient_id,pd_l1_tps,tmb,her2_ihc,response,pfs_months,event
# PT001,55.5,12.3,1+,Yes,14.2,1
# PT002,8.2,5.1,0,No,6.5,1
# ...