Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:14 +08:00
commit 1dd5bee3b4
335 changed files with 147360 additions and 0 deletions

View File

@@ -0,0 +1,384 @@
#!/usr/bin/env python3
"""
Biomarker-Based Patient Stratification and Classification
Performs patient stratification based on biomarker profiles with:
- Binary classification (biomarker+/-)
- Multi-class molecular subtypes
- Continuous biomarker scoring
- Correlation with clinical outcomes
Dependencies: pandas, numpy, scipy, scikit-learn (optional for clustering)
"""
import pandas as pd
import numpy as np
from scipy import stats
import argparse
from pathlib import Path
def classify_binary_biomarker(data, biomarker_col, threshold,
                              above_label='Biomarker+', below_label='Biomarker-'):
    """
    Binary classification based on biomarker threshold.

    Parameters:
        data: DataFrame
        biomarker_col: Column name for biomarker values
        threshold: Cut-point value
        above_label: Label for values >= threshold
        below_label: Label for values < threshold

    Returns:
        Copy of `data` with added 'biomarker_class' column. Rows whose
        biomarker value is missing keep a missing (NaN) class rather than
        being reported as below the threshold.
    """
    data = data.copy()
    values = data[biomarker_col]

    # `NaN >= threshold` is False, so without this mask a missing
    # measurement would silently be labelled `below_label`.
    is_missing = values.isna().to_numpy(dtype=bool)
    is_above = (values >= threshold).fillna(False).to_numpy(dtype=bool)

    # Object array so arbitrary label values and NaN can coexist.
    labels = np.full(len(data), below_label, dtype=object)
    labels[is_above] = above_label
    labels[is_missing] = np.nan

    data['biomarker_class'] = labels
    return data
def classify_pd_l1_tps(data, pd_l1_col='pd_l1_tps'):
    """
    Classify PD-L1 Tumor Proportion Score into clinical categories.

    Categories:
        - Negative: <1%
        - Low: 1-49%
        - High: >=50%

    Parameters:
        data: DataFrame
        pd_l1_col: Column name holding the TPS values

    Returns:
        Copy of `data` with 'pd_l1_category' column. Rows with a missing
        TPS keep a missing (NaN) category. The category counts are also
        printed to stdout.
    """
    data = data.copy()

    def categorize(tps):
        # NaN fails both `<` comparisons below and would otherwise fall
        # through to the "High" category.
        if pd.isna(tps):
            return np.nan
        if tps < 1:
            return 'PD-L1 Negative (<1%)'
        if tps < 50:
            return 'PD-L1 Low (1-49%)'
        return 'PD-L1 High (≥50%)'

    data['pd_l1_category'] = data[pd_l1_col].apply(categorize)

    # Distribution
    print("\nPD-L1 TPS Distribution:")
    print(data['pd_l1_category'].value_counts())
    return data
def classify_her2_status(data, ihc_col='her2_ihc', fish_col='her2_fish'):
    """
    Classify HER2 status based on IHC and FISH results (ASCO/CAP guidelines).

    IHC Scores: 0, 1+, 2+, 3+ (a bare number such as 3 is read as 3+)
    FISH: Positive, Negative (only consulted if IHC 2+)

    Classification:
        - HER2-positive: IHC 3+ OR IHC 2+/FISH+
        - HER2-negative: IHC 0/1+ OR IHC 2+/FISH-
        - HER2-equivocal (FISH needed): IHC 2+ without a Positive/Negative FISH
        - HER2-low: IHC 1+ or IHC 2+/FISH- (subset of HER2-negative)

    Parameters:
        data: DataFrame
        ihc_col: Column name holding the IHC score
        fish_col: Column name holding the FISH result; the column may be
            absent, in which case every FISH result is treated as missing

    Returns:
        Copy of `data` with 'her2_status' and 'her2_low' columns. Rows whose
        IHC score is missing or unrecognized get a missing (NaN) status and
        her2_low=False instead of being reported as HER2-negative.
    """
    data = data.copy()

    def normalize_ihc(ihc):
        # Reduce an IHC entry to '0'..'3', or None when it cannot be read.
        if pd.isna(ihc):
            return None
        # A column holding only plain numbers is parsed as int/float
        # (e.g. 0 or 0.0), so fold those to their integer text form.
        if isinstance(ihc, (int, float, np.integer, np.floating)) and float(ihc).is_integer():
            ihc = int(ihc)
        score = str(ihc).strip().rstrip('+').strip()
        return score if score in {'0', '1', '2', '3'} else None

    def classify_her2(ihc, fish):
        # Returns (status, her2_low) for one patient.
        score = normalize_ihc(ihc)
        if score is None:
            return np.nan, False
        if score == '3':
            return 'HER2-positive', False
        if score == '2':
            # Non-string FISH values (NaN/None) mean "not tested".
            fish = fish.strip() if isinstance(fish, str) else None
            if fish == 'Positive':
                return 'HER2-positive', False
            if fish == 'Negative':
                return 'HER2-negative', True  # HER2-low
            return 'HER2-equivocal (FISH needed)', False
        if score == '1':
            return 'HER2-negative', True  # HER2-low
        return 'HER2-negative', False  # IHC 0

    ihc_values = data[ihc_col]
    if fish_col in data.columns:
        fish_values = data[fish_col]
    else:
        fish_values = [None] * len(data)

    # Building plain lists (rather than a row-wise DataFrame.apply) keeps
    # the two-column assignment valid for an empty DataFrame as well.
    results = [classify_her2(ihc, fish) for ihc, fish in zip(ihc_values, fish_values)]
    data['her2_status'] = pd.Series([status for status, _ in results],
                                    index=data.index, dtype=object)
    data['her2_low'] = pd.Series([low for _, low in results],
                                 index=data.index, dtype=bool)

    print("\nHER2 Status Distribution:")
    print(data['her2_status'].value_counts())
    print(f"\nHER2-low (IHC 1+ or 2+/FISH-): {data['her2_low'].sum()} patients")

    n_unclassified = int(data['her2_status'].isna().sum())
    if n_unclassified:
        print(f"Unclassified (missing or unrecognized IHC): {n_unclassified} patients")
    return data
def classify_breast_cancer_subtype(data, er_col='er_positive', pr_col='pr_positive',
                                   her2_col='her2_positive'):
    """
    Classify breast cancer into molecular subtypes.

    Subtypes:
        - HR+/HER2+ (Luminal B HER2+): ER+ and/or PR+, HER2+
        - HR-/HER2+ (HER2-enriched): ER-, PR-, HER2+
        - HR+/HER2- (Luminal): ER+ and/or PR+, HER2-
        - Triple-Negative: ER-, PR-, HER2-

    Parameters:
        data: DataFrame
        er_col, pr_col, her2_col: Column names of the receptor flags. Values
            are interpreted by truthiness, so they are assumed to be
            boolean or 0/1 (text such as 'Negative' would count as
            positive) -- confirm against the input data.

    Returns:
        Copy of `data` with 'bc_subtype' column. A row is left missing (NaN)
        when a missing receptor value makes the subtype undeterminable:
        HER2 missing, or ER/PR missing with no positive hormone receptor.
        The subtype counts are also printed to stdout.
    """
    data = data.copy()

    def as_flag(value):
        # NaN is truthy in Python, so it must be separated out before any
        # boolean test or a missing result would count as "positive".
        return None if pd.isna(value) else bool(value)

    def get_subtype(er, pr, her2):
        er, pr, her2 = as_flag(er), as_flag(pr), as_flag(her2)

        # Hormone-receptor status: one known positive is enough; a known
        # negative is only conclusive when the other receptor is known too.
        if er or pr:
            hr = True
        elif er is None or pr is None:
            hr = None
        else:
            hr = False

        if her2 is None or hr is None:
            return np.nan
        if her2:
            return 'HR+/HER2+ (Luminal B HER2+)' if hr else 'HR-/HER2+ (HER2-enriched)'
        return 'HR+/HER2- (Luminal)' if hr else 'Triple-Negative'

    subtypes = [
        get_subtype(er, pr, her2)
        for er, pr, her2 in zip(data[er_col], data[pr_col], data[her2_col])
    ]
    # Assigning a Series built from a list also works for an empty frame,
    # unlike the result of a row-wise DataFrame.apply.
    data['bc_subtype'] = pd.Series(subtypes, index=data.index, dtype=object)

    print("\nBreast Cancer Subtype Distribution:")
    print(data['bc_subtype'].value_counts())
    return data
def correlate_biomarker_outcome(data, biomarker_col, outcome_col, biomarker_type='binary'):
    """
    Assess correlation between biomarker and clinical outcome.

    For 'binary'/'categorical' biomarkers a contingency table and chi-square
    test are printed, plus an odds ratio with 95% CI when the table is 2x2.
    For 'continuous' biomarkers Pearson and Spearman correlations are printed.
    Rows with a missing biomarker or outcome value are dropped first.

    Parameters:
        data: DataFrame
        biomarker_col: Biomarker variable
        outcome_col: Outcome variable
        biomarker_type: 'binary', 'categorical', 'continuous'

    Returns:
        p-value of the chi-square test (binary/categorical) or of the
        Pearson correlation (continuous); NaN when there are too few
        complete observations to run the test.

    Raises:
        ValueError: if `biomarker_type` is not one of the supported values.
    """
    # Validate up front: an unknown type would otherwise reach the return
    # statement with no p-value ever assigned.
    if biomarker_type not in ('binary', 'categorical', 'continuous'):
        raise ValueError(
            f"biomarker_type must be 'binary', 'categorical' or 'continuous', "
            f"got {biomarker_type!r}"
        )

    print(f"\nCorrelation Analysis: {biomarker_col} vs {outcome_col}")
    print("="*60)

    # Remove missing data
    analysis_data = data[[biomarker_col, outcome_col]].dropna()

    if biomarker_type == 'continuous':
        # A correlation needs at least two paired observations.
        if len(analysis_data) < 2:
            print("\nFewer than 2 complete observations; correlation not computed.")
            return np.nan

        x = analysis_data[biomarker_col]
        y = analysis_data[outcome_col]

        # Correlation coefficient
        r, p_value = stats.pearsonr(x, y)
        print("\nPearson correlation:")
        print(f" r = {r:.3f}, p = {p_value:.4f}")

        # Also report Spearman for robustness
        rho, p_spearman = stats.spearmanr(x, y)
        print("Spearman correlation:")
        print(f" ρ = {rho:.3f}, p = {p_spearman:.4f}")
        return p_value

    # Binary / categorical biomarker
    if analysis_data.empty:
        print("\nNo complete observations; chi-square test not computed.")
        return np.nan

    # Cross-tabulation
    contingency = pd.crosstab(analysis_data[biomarker_col], analysis_data[outcome_col])
    print("\nContingency Table:")
    print(contingency)

    # Chi-square test
    chi2, p_value, dof, _expected = stats.chi2_contingency(contingency)
    print("\nChi-square test:")
    print(f" χ² = {chi2:.2f}, df = {dof}, p = {p_value:.4f}")

    # Odds ratio if 2x2 table
    if contingency.shape == (2, 2):
        cells = contingency.to_numpy(dtype=float)
        if (cells == 0).any():
            # Haldane-Anscombe correction: a zero cell makes the odds ratio
            # 0 or infinite and its log-scale standard error undefined.
            cells = cells + 0.5
            print("\nZero cell in 2x2 table: 0.5 added to every cell for the odds ratio.")
        (a, b), (c, d) = cells

        or_value = (a * d) / (b * c)

        # Confidence interval for OR (log method)
        log_or = np.log(or_value)
        se_log_or = np.sqrt(1/a + 1/b + 1/c + 1/d)
        ci_lower = np.exp(log_or - 1.96 * se_log_or)
        ci_upper = np.exp(log_or + 1.96 * se_log_or)
        print(f"\nOdds Ratio: {or_value:.2f} (95% CI {ci_lower:.2f}-{ci_upper:.2f})")

    return p_value
def stratify_cohort_report(data, stratification_var, output_dir='stratification_report',
                           max_continuous_vars=5):
    """
    Generate comprehensive stratification report.

    Prints the group distribution and, for each numeric column, the median
    and IQR per group. When there are exactly two groups a two-sided
    Mann-Whitney U test is run per numeric column.

    Parameters:
        data: DataFrame with patient data
        stratification_var: Column name for stratification
        output_dir: Output directory for reports (created if needed)
        max_continuous_vars: Maximum number of numeric columns to summarize,
            taken in column order; None summarizes all of them

    Outputs (written to `output_dir`):
        group_distribution.csv: patient count per group
        statistical_comparisons.csv: Mann-Whitney results, only written
            when at least one test was run
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("\nCOHORT STRATIFICATION REPORT")
    print("="*60)
    print(f"Stratification Variable: {stratification_var}")
    print(f"Total Patients: {len(data)}")

    # Group distribution
    n_total = len(data)
    distribution = data[stratification_var].value_counts()
    print("\nGroup Distribution:")
    for group, count in distribution.items():
        pct = count / n_total * 100
        print(f" {group}: {count} ({pct:.1f}%)")

    # Save distribution
    distribution.to_csv(output_dir / 'group_distribution.csv')

    # Compare baseline characteristics across groups
    print(f"\nBaseline Characteristics by {stratification_var}:")
    results = []

    # Continuous variables
    continuous_vars = [
        var for var in data.select_dtypes(include=[np.number]).columns.tolist()
        if var != stratification_var
    ]

    # Group membership does not depend on the variable being summarized,
    # so compute each row mask once instead of once per variable.
    groups = distribution.index.tolist()
    group_masks = {group: data[stratification_var] == group for group in groups}

    for var in continuous_vars[:max_continuous_vars]:
        print(f"\n{var}:")
        values_by_group = {
            group: data.loc[mask, var].dropna() for group, mask in group_masks.items()
        }
        for group, group_data in values_by_group.items():
            print(f" {group}: median {group_data.median():.1f} [IQR {group_data.quantile(0.25):.1f}-{group_data.quantile(0.75):.1f}]")

        # Statistical test
        if len(groups) == 2:
            g1 = values_by_group[groups[0]]
            g2 = values_by_group[groups[1]]
            # A group can be empty once missing values are dropped; the
            # test is undefined then, so skip it rather than abort the report.
            if g1.empty or g2.empty:
                print(" p-value: not computed (a group has no non-missing values)")
                continue
            _, p_value = stats.mannwhitneyu(g1, g2, alternative='two-sided')
            print(f" p-value: {p_value:.4f}")
            results.append({
                'Variable': var,
                'Test': 'Mann-Whitney U',
                'p_value': p_value,
                'Significant': 'Yes' if p_value < 0.05 else 'No'
            })

    # Save results
    if results:
        df_results = pd.DataFrame(results)
        df_results.to_csv(output_dir / 'statistical_comparisons.csv', index=False)
        print(f"\nStatistical comparison results saved to: {output_dir}/statistical_comparisons.csv")

    print(f"\nStratification report complete! Files saved to {output_dir}/")
def main():
    """
    Command-line entry point.

    Loads a patient CSV (or generates an example cohort when --example is
    given or no input file is supplied), runs the PD-L1 and HER2
    classifiers when their columns are present, optionally applies a
    threshold-based binary classification, writes a stratification report,
    and saves the classified data to <output-dir>/classified_data.csv.
    """
    parser = argparse.ArgumentParser(description='Biomarker-based patient classification')
    parser.add_argument('input_file', type=str, nargs='?', default=None,
                        help='CSV file with patient and biomarker data')
    parser.add_argument('-b', '--biomarker', type=str, default=None,
                        help='Biomarker column name for stratification')
    parser.add_argument('-t', '--threshold', type=float, default=None,
                        help='Threshold for binary classification')
    parser.add_argument('-o', '--output-dir', type=str, default='stratification',
                        help='Output directory')
    parser.add_argument('--example', action='store_true',
                        help='Run with example data')
    args = parser.parse_args()

    # Example data if requested
    if args.example or args.input_file is None:
        print("Generating example dataset...")
        np.random.seed(42)
        n = 80
        data = pd.DataFrame({
            'patient_id': [f'PT{i:03d}' for i in range(1, n+1)],
            'age': np.random.normal(62, 10, n),
            'sex': np.random.choice(['Male', 'Female'], n),
            'pd_l1_tps': np.random.exponential(20, n),  # Exponential distribution for PD-L1
            'tmb': np.random.exponential(8, n),  # Mutations per Mb
            'her2_ihc': np.random.choice(['0', '1+', '2+', '3+'], n, p=[0.6, 0.2, 0.15, 0.05]),
            'response': np.random.choice(['Yes', 'No'], n, p=[0.4, 0.6]),
        })
        # Simulate correlation: higher PD-L1 -> better response
        high_pd_l1 = data['pd_l1_tps'] >= 50
        data.loc[high_pd_l1, 'response'] = np.random.choice(['Yes', 'No'],
                                                            high_pd_l1.sum(),
                                                            p=[0.65, 0.35])
    else:
        print(f"Loading data from {args.input_file}...")
        data = pd.read_csv(args.input_file)

    print(f"Dataset: {len(data)} patients")
    print(f"Columns: {list(data.columns)}")

    # Fail with a usage error instead of a KeyError traceback further down.
    if args.biomarker and args.biomarker not in data.columns:
        parser.error(f"biomarker column '{args.biomarker}' not found in input data")

    # PD-L1 classification example
    if 'pd_l1_tps' in data.columns:
        data = classify_pd_l1_tps(data, 'pd_l1_tps')
        # Correlate with response if available
        if 'response' in data.columns:
            correlate_biomarker_outcome(data, 'pd_l1_category', 'response',
                                        biomarker_type='categorical')

    # HER2 classification if columns present
    if 'her2_ihc' in data.columns:
        if 'her2_fish' not in data.columns:
            # Add placeholder FISH for IHC 2+
            data['her2_fish'] = np.nan
        data = classify_her2_status(data, 'her2_ihc', 'her2_fish')

    # Pick the column the report is stratified on.
    stratification_var = None
    if args.biomarker and args.threshold is not None:
        # Generic binary classification if threshold provided
        print(f"\nBinary classification: {args.biomarker} with threshold {args.threshold}")
        data = classify_binary_biomarker(data, args.biomarker, args.threshold)
        print(data['biomarker_class'].value_counts())
        # Report on the two classes just derived; stratifying on the raw
        # continuous column would make every distinct value its own group.
        stratification_var = 'biomarker_class'
    elif args.biomarker:
        stratification_var = args.biomarker
    elif 'pd_l1_category' in data.columns:
        stratification_var = 'pd_l1_category'

    # Generate stratification report
    if stratification_var is not None:
        stratify_cohort_report(data, stratification_var, output_dir=args.output_dir)

    # Save classified data. The directory is created here as well because
    # no report (and hence no directory) is produced when nothing can be
    # stratified.
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / 'classified_data.csv'
    data.to_csv(output_path, index=False)
    print(f"\nClassified data saved to: {output_path}")
if __name__ == '__main__':
    # Run the command-line interface only when executed as a script,
    # not when the module is imported.
    main()
# Example usage:
# python biomarker_classifier.py data.csv -b pd_l1_tps -t 50 -o classification/
# python biomarker_classifier.py --example
#
# Input CSV format:
# patient_id,pd_l1_tps,tmb,her2_ihc,response,pfs_months,event
# PT001,55.5,12.3,1+,Yes,14.2,1
# PT002,8.2,5.1,0,No,6.5,1
# ...