Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:14 +08:00
commit 1dd5bee3b4
335 changed files with 147360 additions and 0 deletions

View File

@@ -0,0 +1,524 @@
#!/usr/bin/env python3
"""
Generate Clinical Cohort Tables for Baseline Characteristics and Outcomes
Creates publication-ready tables with:
- Baseline demographics (Table 1 style)
- Efficacy outcomes
- Safety/adverse events
- Statistical comparisons between groups
Dependencies: pandas, numpy, scipy
"""
import pandas as pd
import numpy as np
from scipy import stats
from pathlib import Path
import argparse
def calculate_p_value(data, variable, group_col='group', var_type='categorical'):
    """
    Calculate an appropriate p-value for a two-group comparison.

    Parameters:
        data: DataFrame
        variable: Column name to compare
        group_col: Grouping variable
        var_type: 'categorical', 'continuous_normal', 'continuous_nonnormal'

    Returns:
        p-value (float). NaN is returned when the comparison is not defined:
        the number of non-missing group labels is not exactly two, the
        contingency table has fewer than two rows or columns, or one of the
        groups has no non-missing values for a continuous variable.

    Raises:
        ValueError: if var_type is not one of the supported values (only
        checked once exactly two groups have been found).
    """
    # Missing group labels are not a group of their own.
    groups = data[group_col].dropna().unique()
    if len(groups) != 2:
        return np.nan  # Only handle 2-group comparisons

    if var_type == 'categorical':
        contingency = pd.crosstab(data[variable], data[group_col])
        # A table with a single row/column (or no data at all) carries no
        # information about an association between variable and group.
        if contingency.shape[0] < 2 or contingency.shape[1] < 2:
            return np.nan

        _, chi2_p, _, expected = stats.chi2_contingency(contingency)
        # The small-sample rule is about EXPECTED cell counts, not observed
        # ones. Fisher's exact test is only available for 2x2 tables; larger
        # sparse tables fall back to chi-square (known limitation).
        if contingency.shape == (2, 2) and expected.min() < 5:
            _, p_value = stats.fisher_exact(contingency.to_numpy())
        else:
            p_value = chi2_p
        return float(p_value)

    if var_type not in ('continuous_normal', 'continuous_nonnormal'):
        raise ValueError("var_type must be 'categorical', 'continuous_normal', or 'continuous_nonnormal'")

    group1_data = data.loc[data[group_col] == groups[0], variable].dropna()
    group2_data = data.loc[data[group_col] == groups[1], variable].dropna()
    # Neither test is defined for an empty sample (mannwhitneyu raises).
    if group1_data.empty or group2_data.empty:
        return np.nan

    if var_type == 'continuous_normal':
        # Welch's t-test (does not assume equal variances)
        _, p_value = stats.ttest_ind(group1_data, group2_data, equal_var=False)
    else:
        # Mann-Whitney U test
        _, p_value = stats.mannwhitneyu(group1_data, group2_data, alternative='two-sided')
    return float(p_value)
def format_continuous_variable(data, variable, group_col, distribution='normal'):
    """
    Format continuous variable for table display.

    Parameters:
        data: DataFrame
        variable: Column name of the continuous variable
        group_col: Grouping variable
        distribution: 'normal' formats as "mean ± SD" and compares groups
            with var_type 'continuous_normal'; any other value formats as
            "median [Q1-Q3]" and uses 'continuous_nonnormal'

    Returns:
        Dictionary with one formatted string per group (keyed by the group
        label) plus a 'p_value' entry. The p-value is '' when it is NaN,
        '<0.001' below 0.001, three decimals below 0.01 and two decimals
        otherwise.
    """
    groups = data[group_col].unique()
    is_normal = distribution == 'normal'
    results = {}
    for group in groups:
        values = data.loc[data[group_col] == group, variable].dropna()
        if is_normal:
            # Mean ± SD
            results[group] = f"{values.mean():.1f} ± {values.std():.1f}"
        else:
            # Median [IQR]
            median = values.median()
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            results[group] = f"{median:.1f} [{q1:.1f}-{q3:.1f}]"

    var_type = 'continuous_normal' if is_normal else 'continuous_nonnormal'
    p_value = calculate_p_value(data, variable, group_col, var_type)

    # Very small p-values must read "<0.001" rather than a misleading "0.000";
    # values below 0.01 keep a third decimal so they do not collapse to "0.00".
    if pd.isna(p_value):
        results['p_value'] = ''
    elif p_value < 0.001:
        results['p_value'] = '<0.001'
    elif p_value < 0.01:
        results['p_value'] = f"{p_value:.3f}"
    else:
        results['p_value'] = f"{p_value:.2f}"
    return results
def format_categorical_variable(data, variable, group_col):
    """
    Format categorical variable for table display.

    Parameters:
        data: DataFrame
        variable: Column name of the categorical variable
        group_col: Grouping variable

    Returns:
        List of dictionaries, one per category (in order of first
        appearance), each holding 'category' and a "count (pct%)" string per
        group. Percentages use the group's non-missing values of `variable`
        as denominator. The overall p-value is attached to the first
        category only, as 'p_value'. An empty list is returned when the
        variable has no non-missing values.
    """
    groups = data[group_col].unique()
    categories = data[variable].dropna().unique()

    results = []
    for category in categories:
        row = {'category': category}
        for group in groups:
            group_values = data.loc[data[group_col] == group, variable]
            count = (group_values == category).sum()
            total = group_values.notna().sum()
            percentage = (count / total * 100) if total > 0 else 0
            row[group] = f"{count} ({percentage:.0f}%)"
        results.append(row)

    # Nothing to annotate (and nothing to test) without any category.
    if not results:
        return results

    # Calculate p-value for overall categorical variable
    p_value = calculate_p_value(data, variable, group_col, 'categorical')
    # Very small p-values must read "<0.001" rather than a misleading "0.000";
    # values below 0.01 keep a third decimal so they do not collapse to "0.00".
    if pd.isna(p_value):
        formatted = ''
    elif p_value < 0.001:
        formatted = '<0.001'
    elif p_value < 0.01:
        formatted = f"{p_value:.3f}"
    else:
        formatted = f"{p_value:.2f}"
    results[0]['p_value'] = formatted
    return results
def generate_baseline_table(data, group_col='group', output_file='table1_baseline.csv'):
    """
    Build Table 1 (baseline characteristics), write it as CSV and return it.

    The first row repeats the column labels with per-group sample sizes.
    Rows are then added for 'age' (median [IQR]), 'sex' and 'ecog_ps' when
    those columns exist in `data`. Extend the variable list for your cohort.

    Parameters:
        data: DataFrame
        group_col: Grouping variable
        output_file: Output CSV path

    Returns:
        DataFrame holding the table rows
    """
    groups = data[group_col].unique()

    def section_row(title):
        # Title line of a categorical section: label only, empty cells.
        section = {'Characteristic': title}
        for g in groups:
            section[g] = ''
        section['p_value'] = ''
        return section

    # Label row with group sizes
    header = {'Characteristic': 'Characteristic'}
    for group in groups:
        n_group = len(data[data[group_col] == group])
        header[group] = f"{group} (n={n_group})"
    header['p_value'] = 'p-value'
    table_rows = [header]

    # Continuous: age, summarised as median [IQR]
    if 'age' in data.columns:
        age_stats = format_continuous_variable(data, 'age', group_col, distribution='nonnormal')
        age_row = {'Characteristic': 'Age, years (median [IQR])'}
        for group in groups:
            age_row[group] = age_stats[group]
        age_row['p_value'] = age_stats['p_value']
        table_rows.append(age_row)

    # Categorical: one title row, then one indented row per category
    categorical_sections = (
        ('sex', 'Sex, n (%)'),
        ('ecog_ps', 'ECOG PS, n (%)'),
    )
    for column, title in categorical_sections:
        if column not in data.columns:
            continue
        table_rows.append(section_row(title))
        for level in format_categorical_variable(data, column, group_col):
            entry = {'Characteristic': f" {level['category']}"}
            for group in groups:
                entry[group] = level[group]
            entry['p_value'] = level.get('p_value', '')
            table_rows.append(entry)

    # Persist and hand back the assembled table
    df_table = pd.DataFrame(table_rows)
    df_table.to_csv(output_file, index=False)
    print(f"Baseline characteristics table saved to: {output_file}")
    return df_table
def generate_efficacy_table(data, group_col='group', output_file='table2_efficacy.csv'):
    """
    Generate efficacy outcomes table, write it as CSV and return it.

    Expected columns:
        - best_response: CR, PR, SD, PD

    When 'best_response' is present the table contains, in order: the
    objective response rate (ORR = CR + PR) with exact 95% CI and a
    chi-square p-value, one row per response category, and the disease
    control rate (DCR = CR + PR + SD) with exact 95% CI and p-value.
    Denominators are all patients in the group, including those with a
    missing best_response.

    Parameters:
        data: DataFrame
        group_col: Grouping variable (rows with a missing label are ignored)
        output_file: Output CSV path

    Returns:
        DataFrame holding the table rows (first row repeats the labels with
        per-group sample sizes)
    """
    # A missing label is not a group; it would also never compare equal to
    # itself when used for row selection.
    groups = data[group_col].dropna().unique()
    group_frames = {group: data[data[group_col] == group] for group in groups}

    header = {'Outcome': 'Outcome'}
    for group, frame in group_frames.items():
        header[group] = f"{group} (n={len(frame)})"
    header['p_value'] = 'p-value'
    table_rows = [header]

    def rate_row(label, responses):
        # One "n (%) [95% CI]" row for the pooled `responses`, plus p-value.
        row = {'Outcome': label}
        for group, frame in group_frames.items():
            count = frame['best_response'].isin(responses).sum()
            total = len(frame)
            rate = count / total * 100
            # Exact binomial CI (Clopper-Pearson)
            ci_lower, ci_upper = _binomial_ci(count, total)
            row[group] = f"{count} ({rate:.0f}%) [{ci_lower:.0f}-{ci_upper:.0f}]"

        contingency = pd.crosstab(data['best_response'].isin(responses), data[group_col])
        if contingency.size == 0:
            p_value = np.nan  # no data to compare
        else:
            _, p_value, _, _ = stats.chi2_contingency(contingency)

        # An undefined p-value must not be reported as "<0.001".
        if pd.isna(p_value):
            row['p_value'] = ''
        elif p_value < 0.001:
            row['p_value'] = "<0.001"
        else:
            row['p_value'] = f"{p_value:.3f}"
        return row

    if 'best_response' in data.columns:
        # Objective Response Rate (ORR = CR + PR)
        table_rows.append(rate_row('ORR, n (%) [95% CI]', ['CR', 'PR']))

        # Individual response categories
        for response in ['CR', 'PR', 'SD', 'PD']:
            row = {'Outcome': f" {response}"}
            for group, frame in group_frames.items():
                count = (frame['best_response'] == response).sum()
                pct = count / len(frame) * 100
                row[group] = f"{count} ({pct:.0f}%)"
            row['p_value'] = ''
            table_rows.append(row)

        # Disease Control Rate (DCR = CR + PR + SD)
        table_rows.append(rate_row('DCR, n (%) [95% CI]', ['CR', 'PR', 'SD']))

    # Save table
    df_table = pd.DataFrame(table_rows)
    df_table.to_csv(output_file, index=False)
    print(f"Efficacy table saved to: {output_file}")
    return df_table
def generate_safety_table(data, ae_columns, group_col='group', output_file='table3_safety.csv'):
    """
    Generate adverse events table, write it as CSV and return it.

    Parameters:
        data: DataFrame with AE data
        ae_columns: List of AE column names (each should have values 0-5 for
            CTCAE grades); names not present in `data` are skipped
        group_col: Grouping variable
        output_file: Output CSV path

    Returns:
        DataFrame whose first row carries the display labels ('Any Grade' /
        'Grade 3-4') and whose remaining rows hold one adverse event each,
        with "count (pct%)" per group. Percentages use the group's
        non-missing grades as denominator.
    """
    groups = data[group_col].unique()

    # Label row, interleaved per group so it lines up with the data rows.
    header = {'Adverse Event': 'Adverse Event'}
    for group in groups:
        header[f'{group}_any'] = 'Any Grade'
        header[f'{group}_g34'] = 'Grade 3-4'
    table_rows = [header]

    for ae in ae_columns:
        if ae not in data.columns:
            continue
        row = {'Adverse Event': ae.replace('_', ' ').title()}
        for group in groups:
            grades = data.loc[data[group_col] == group, ae].dropna()
            total = len(grades)

            # Any grade (Grade 1-5)
            any_grade = (grades > 0).sum()
            any_pct = any_grade / total * 100 if total > 0 else 0
            row[f'{group}_any'] = f"{any_grade} ({any_pct:.0f}%)"

            # Grade 3-4 only: grade 5 must not be counted under this label.
            grade_34 = grades.between(3, 4).sum()
            g34_pct = grade_34 / total * 100 if total > 0 else 0
            row[f'{group}_g34'] = f"{grade_34} ({g34_pct:.0f}%)"
        table_rows.append(row)

    # Save table
    df_table = pd.DataFrame(table_rows)
    df_table.to_csv(output_file, index=False)
    print(f"Safety table saved to: {output_file}")
    return df_table
def generate_latex_table(df, caption, label='table'):
    """
    Convert DataFrame to LaTeX table code.

    Column names and cell values are escaped so that characters such as
    '%', '_' and '&' (all produced by the table generators in this file)
    appear literally instead of being interpreted by LaTeX. Missing values
    become empty cells. A first-column value starting with a space is
    treated as a sub-category and indented with \\quad.

    Parameters:
        df: DataFrame to render (every column is left-aligned)
        caption: Caption text; inserted verbatim, so it may contain LaTeX
            markup and must already be valid LaTeX
        label: Label suffix; the emitted label is "tab:<label>"

    Returns:
        String with LaTeX table code. The code uses \\toprule/\\midrule/
        \\bottomrule and the [H] placement specifier.
        NOTE(review): these presumably require the booktabs and float
        packages in the consuming document - confirm.
    """
    lines = [
        "\\begin{table}[H]",
        "\\centering",
        "\\small",
        "\\begin{tabular}{" + "l" * len(df.columns) + "}",
        "\\toprule",
        " & ".join(f"\\textbf{{{_escape_latex(col)}}}" for col in df.columns) + " \\\\",
        "\\midrule",
    ]

    for _, row in df.iterrows():
        raw_cells = ['' if pd.isna(val) else str(val) for val in row]
        cells = [_escape_latex(cell) for cell in raw_cells]
        # Handle indentation for subcategories (values starting with spaces);
        # decided on the raw text, before escaping.
        if raw_cells and raw_cells[0].startswith(' '):
            cells[0] = '\\quad ' + _escape_latex(raw_cells[0].strip())
        lines.append(" & ".join(cells) + " \\\\")

    lines.extend([
        "\\bottomrule",
        "\\end{tabular}",
        f"\\caption{{{caption}}}",
        f"\\label{{tab:{label}}}",
        "\\end{table}",
    ])
    return "\n".join(lines) + "\n"


def _escape_latex(text):
    """Return str(text) with LaTeX text-mode special characters escaped."""
    replacements = {
        '\\': '\\textbackslash{}',
        '&': '\\&',
        '%': '\\%',
        '$': '\\$',
        '#': '\\#',
        '_': '\\_',
        '{': '\\{',
        '}': '\\}',
        '~': '\\textasciitilde{}',
        '^': '\\textasciicircum{}',
        '<': '\\textless{}',
        '>': '\\textgreater{}',
    }
    # translate() works in a single pass, so inserted backslashes and braces
    # are never escaped a second time.
    return str(text).translate(str.maketrans(replacements))
def _binomial_ci(successes, trials, confidence=0.95):
"""
Calculate exact binomial confidence interval (Clopper-Pearson method).
Returns:
Lower and upper bounds as percentages
"""
if trials == 0:
return 0.0, 0.0
alpha = 1 - confidence
# Use beta distribution
from scipy.stats import beta
if successes == 0:
lower = 0.0
else:
lower = beta.ppf(alpha/2, successes, trials - successes + 1)
if successes == trials:
upper = 1.0
else:
upper = beta.ppf(1 - alpha/2, successes + 1, trials - successes)
return lower * 100, upper * 100
def create_example_data():
    """
    Build a reproducible 100-patient example cohort for testing.

    Seeds NumPy's global random state with 42, then draws the columns in a
    fixed order so the generated values are the same on every call.

    Returns:
        DataFrame with patient_id, group, demographics, best_response and
        three *_grade adverse-event columns
    """
    np.random.seed(42)
    n_patients = 100
    draw = np.random.choice

    # Columns are filled in this exact order: every draw advances the shared
    # random state, so reordering would change the data.
    columns = {}
    columns['patient_id'] = [f'PT{idx:03d}' for idx in range(1, n_patients + 1)]
    columns['group'] = draw(['Biomarker+', 'Biomarker-'], n_patients)
    columns['age'] = np.random.normal(62, 10, n_patients)
    columns['sex'] = draw(['Male', 'Female'], n_patients)
    columns['ecog_ps'] = draw(['0-1', '2'], n_patients, p=[0.8, 0.2])
    columns['stage'] = draw(['III', 'IV'], n_patients, p=[0.3, 0.7])
    columns['best_response'] = draw(
        ['CR', 'PR', 'SD', 'PD'], n_patients, p=[0.05, 0.35, 0.40, 0.20]
    )
    columns['fatigue_grade'] = draw([0, 1, 2, 3], n_patients, p=[0.3, 0.4, 0.2, 0.1])
    columns['nausea_grade'] = draw([0, 1, 2, 3], n_patients, p=[0.4, 0.35, 0.20, 0.05])
    columns['neutropenia_grade'] = draw(
        [0, 1, 2, 3, 4], n_patients, p=[0.5, 0.2, 0.15, 0.10, 0.05]
    )
    return pd.DataFrame(columns)
def main():
    """
    Command-line entry point: load (or synthesise) cohort data and write the
    baseline, efficacy and safety tables as CSV and LaTeX files.

    The efficacy table is produced only when a 'best_response' column
    exists; the safety table only when at least one column name ends in
    '_grade'.
    """
    parser = argparse.ArgumentParser(description='Generate clinical cohort tables')
    parser.add_argument(
        'input_file', type=str, nargs='?', default=None,
        help='CSV file with cohort data (if not provided, uses example data)',
    )
    parser.add_argument(
        '-o', '--output-dir', type=str, default='tables',
        help='Output directory (default: tables)',
    )
    parser.add_argument(
        '--group-col', type=str, default='group',
        help='Column name for grouping variable',
    )
    parser.add_argument(
        '--example', action='store_true',
        help='Generate tables using example data',
    )
    args = parser.parse_args()

    # Make sure the destination exists before anything is written
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    def write_latex(table, caption, label, filename):
        # Render `table` as LaTeX and store it next to the CSV output.
        (output_dir / filename).write_text(
            generate_latex_table(table, caption=caption, label=label)
        )

    # Example data is used on request or when no input file was given
    use_example = args.example or args.input_file is None
    if use_example:
        print("Generating example dataset...")
        data = create_example_data()
    else:
        print(f"Loading data from {args.input_file}...")
        data = pd.read_csv(args.input_file)

    print(f"Dataset: {len(data)} patients, {len(data[args.group_col].unique())} groups")
    print(f"Groups: {data[args.group_col].value_counts().to_dict()}")

    # Table 1: baseline characteristics
    print("\nGenerating baseline characteristics table...")
    baseline_table = generate_baseline_table(
        data,
        group_col=args.group_col,
        output_file=output_dir / 'table1_baseline.csv',
    )
    write_latex(
        baseline_table,
        "Baseline patient demographics and clinical characteristics",
        "baseline",
        'table1_baseline.tex',
    )
    print(f"LaTeX code saved to: {output_dir}/table1_baseline.tex")

    # Table 2: efficacy outcomes
    if 'best_response' in data.columns:
        print("\nGenerating efficacy outcomes table...")
        efficacy_table = generate_efficacy_table(
            data,
            group_col=args.group_col,
            output_file=output_dir / 'table2_efficacy.csv',
        )
        write_latex(
            efficacy_table,
            "Treatment efficacy outcomes by group",
            "efficacy",
            'table2_efficacy.tex',
        )

    # Table 3: safety (AE columns are recognised by their '_grade' suffix)
    ae_columns = [name for name in data.columns if name.endswith('_grade')]
    if ae_columns:
        print("\nGenerating safety table...")
        safety_table = generate_safety_table(
            data,
            ae_columns=ae_columns,
            group_col=args.group_col,
            output_file=output_dir / 'table3_safety.csv',
        )
        write_latex(
            safety_table,
            "Treatment-emergent adverse events by group (CTCAE v5.0)",
            "safety",
            'table3_safety.tex',
        )

    print(f"\nAll tables generated successfully in {output_dir}/")
    print("Files created:")
    for summary_line in (
        " - table1_baseline.csv / .tex",
        " - table2_efficacy.csv / .tex (if response data available)",
        " - table3_safety.csv / .tex (if AE data available)",
    ):
        print(summary_line)


if __name__ == '__main__':
    main()
# Example usage:
# python create_cohort_tables.py cohort_data.csv -o tables/
# python create_cohort_tables.py --example # Generate example tables
#
# Input CSV format:
# patient_id,group,age,sex,ecog_ps,stage,best_response,fatigue_grade,nausea_grade,...
# PT001,Biomarker+,65,Male,0-1,IV,PR,1,0,...
# PT002,Biomarker-,58,Female,0-1,III,SD,2,1,...
# ...