#!/usr/bin/env python3 """ Generate Clinical Cohort Tables for Baseline Characteristics and Outcomes Creates publication-ready tables with: - Baseline demographics (Table 1 style) - Efficacy outcomes - Safety/adverse events - Statistical comparisons between groups Dependencies: pandas, numpy, scipy """ import pandas as pd import numpy as np from scipy import stats from pathlib import Path import argparse def calculate_p_value(data, variable, group_col='group', var_type='categorical'): """ Calculate appropriate p-value for group comparison. Parameters: data: DataFrame variable: Column name to compare group_col: Grouping variable var_type: 'categorical', 'continuous_normal', 'continuous_nonnormal' Returns: p-value (float) """ groups = data[group_col].unique() if len(groups) != 2: return np.nan # Only handle 2-group comparisons group1_data = data[data[group_col] == groups[0]][variable].dropna() group2_data = data[data[group_col] == groups[1]][variable].dropna() if var_type == 'categorical': # Chi-square or Fisher's exact test contingency = pd.crosstab(data[variable], data[group_col]) # Check if Fisher's exact is needed (expected count < 5) if contingency.min().min() < 5: # Fisher's exact (2x2 only) if contingency.shape == (2, 2): _, p_value = stats.fisher_exact(contingency) else: # Use chi-square but note limitation _, p_value, _, _ = stats.chi2_contingency(contingency) else: _, p_value, _, _ = stats.chi2_contingency(contingency) elif var_type == 'continuous_normal': # Independent t-test _, p_value = stats.ttest_ind(group1_data, group2_data, equal_var=False) elif var_type == 'continuous_nonnormal': # Mann-Whitney U test _, p_value = stats.mannwhitneyu(group1_data, group2_data, alternative='two-sided') else: raise ValueError("var_type must be 'categorical', 'continuous_normal', or 'continuous_nonnormal'") return p_value def format_continuous_variable(data, variable, group_col, distribution='normal'): """ Format continuous variable for table display. Returns: Dictionary with formatted strings for each group and p-value """ groups = data[group_col].unique() results = {} for group in groups: group_data = data[data[group_col] == group][variable].dropna() if distribution == 'normal': # Mean ± SD mean = group_data.mean() std = group_data.std() results[group] = f"{mean:.1f} ± {std:.1f}" else: # Median [IQR] median = group_data.median() q1 = group_data.quantile(0.25) q3 = group_data.quantile(0.75) results[group] = f"{median:.1f} [{q1:.1f}-{q3:.1f}]" # Calculate p-value var_type = 'continuous_normal' if distribution == 'normal' else 'continuous_nonnormal' p_value = calculate_p_value(data, variable, group_col, var_type) results['p_value'] = f"{p_value:.3f}" if p_value < 0.001 else f"{p_value:.2f}" if p_value < 1.0 else "—" return results def format_categorical_variable(data, variable, group_col): """ Format categorical variable for table display. Returns: List of dictionaries for each category with counts and percentages """ groups = data[group_col].unique() categories = data[variable].dropna().unique() results = [] for category in categories: row = {'category': category} for group in groups: group_data = data[data[group_col] == group] count = (group_data[variable] == category).sum() total = group_data[variable].notna().sum() percentage = (count / total * 100) if total > 0 else 0 row[group] = f"{count} ({percentage:.0f}%)" results.append(row) # Calculate p-value for overall categorical variable p_value = calculate_p_value(data, variable, group_col, 'categorical') results[0]['p_value'] = f"{p_value:.3f}" if p_value < 0.001 else f"{p_value:.2f}" if p_value < 1.0 else "—" return results def generate_baseline_table(data, group_col='group', output_file='table1_baseline.csv'): """ Generate Table 1: Baseline characteristics. Customize the variables list for your specific cohort. """ groups = data[group_col].unique() # Initialize results list table_rows = [] # Header row header = { 'Characteristic': 'Characteristic', **{group: f"{group} (n={len(data[data[group_col]==group])})" for group in groups}, 'p_value': 'p-value' } table_rows.append(header) # Age (continuous) if 'age' in data.columns: age_results = format_continuous_variable(data, 'age', group_col, distribution='nonnormal') row = {'Characteristic': 'Age, years (median [IQR])'} for group in groups: row[group] = age_results[group] row['p_value'] = age_results['p_value'] table_rows.append(row) # Sex (categorical) if 'sex' in data.columns: table_rows.append({'Characteristic': 'Sex, n (%)', **{g: '' for g in groups}, 'p_value': ''}) sex_results = format_categorical_variable(data, 'sex', group_col) for sex_row in sex_results: row = {'Characteristic': f" {sex_row['category']}"} for group in groups: row[group] = sex_row[group] row['p_value'] = sex_row.get('p_value', '') table_rows.append(row) # ECOG Performance Status (categorical) if 'ecog_ps' in data.columns: table_rows.append({'Characteristic': 'ECOG PS, n (%)', **{g: '' for g in groups}, 'p_value': ''}) ecog_results = format_categorical_variable(data, 'ecog_ps', group_col) for ecog_row in ecog_results: row = {'Characteristic': f" {ecog_row['category']}"} for group in groups: row[group] = ecog_row[group] row['p_value'] = ecog_row.get('p_value', '') table_rows.append(row) # Convert to DataFrame and save df_table = pd.DataFrame(table_rows) df_table.to_csv(output_file, index=False) print(f"Baseline characteristics table saved to: {output_file}") return df_table def generate_efficacy_table(data, group_col='group', output_file='table2_efficacy.csv'): """ Generate efficacy outcomes table. Expected columns: - best_response: CR, PR, SD, PD - Additional binary outcomes (response, disease_control, etc.) """ groups = data[group_col].unique() table_rows = [] # Header header = { 'Outcome': 'Outcome', **{group: f"{group} (n={len(data[data[group_col]==group])})" for group in groups}, 'p_value': 'p-value' } table_rows.append(header) # Objective Response Rate (ORR = CR + PR) if 'best_response' in data.columns: for group in groups: group_data = data[data[group_col] == group] cr_pr = ((group_data['best_response'] == 'CR') | (group_data['best_response'] == 'PR')).sum() total = len(group_data) orr = cr_pr / total * 100 # Calculate exact binomial CI (Clopper-Pearson) ci_lower, ci_upper = _binomial_ci(cr_pr, total) if group == groups[0]: orr_row = {'Outcome': 'ORR, n (%) [95% CI]'} orr_row[group] = f"{cr_pr} ({orr:.0f}%) [{ci_lower:.0f}-{ci_upper:.0f}]" # P-value for ORR difference contingency = pd.crosstab( data['best_response'].isin(['CR', 'PR']), data[group_col] ) _, p_value, _, _ = stats.chi2_contingency(contingency) orr_row['p_value'] = f"{p_value:.3f}" if p_value >= 0.001 else "<0.001" table_rows.append(orr_row) # Individual response categories for response in ['CR', 'PR', 'SD', 'PD']: row = {'Outcome': f" {response}"} for group in groups: group_data = data[data[group_col] == group] count = (group_data['best_response'] == response).sum() total = len(group_data) pct = count / total * 100 row[group] = f"{count} ({pct:.0f}%)" row['p_value'] = '' table_rows.append(row) # Disease Control Rate (DCR = CR + PR + SD) if 'best_response' in data.columns: dcr_row = {'Outcome': 'DCR, n (%) [95% CI]'} for group in groups: group_data = data[data[group_col] == group] dcr_count = group_data['best_response'].isin(['CR', 'PR', 'SD']).sum() total = len(group_data) dcr = dcr_count / total * 100 ci_lower, ci_upper = _binomial_ci(dcr_count, total) dcr_row[group] = f"{dcr_count} ({dcr:.0f}%) [{ci_lower:.0f}-{ci_upper:.0f}]" # P-value contingency = pd.crosstab( data['best_response'].isin(['CR', 'PR', 'SD']), data[group_col] ) _, p_value, _, _ = stats.chi2_contingency(contingency) dcr_row['p_value'] = f"{p_value:.3f}" if p_value >= 0.001 else "<0.001" table_rows.append(dcr_row) # Save table df_table = pd.DataFrame(table_rows) df_table.to_csv(output_file, index=False) print(f"Efficacy table saved to: {output_file}") return df_table def generate_safety_table(data, ae_columns, group_col='group', output_file='table3_safety.csv'): """ Generate adverse events table. Parameters: data: DataFrame with AE data ae_columns: List of AE column names (each should have values 0-5 for CTCAE grades) group_col: Grouping variable output_file: Output CSV path """ groups = data[group_col].unique() table_rows = [] # Header header = { 'Adverse Event': 'Adverse Event', **{f'{group}_any': f'Any Grade' for group in groups}, **{f'{group}_g34': f'Grade 3-4' for group in groups} } for ae in ae_columns: if ae not in data.columns: continue row = {'Adverse Event': ae.replace('_', ' ').title()} for group in groups: group_data = data[data[group_col] == group][ae].dropna() total = len(group_data) # Any grade (Grade 1-5) any_grade = (group_data > 0).sum() any_pct = any_grade / total * 100 if total > 0 else 0 row[f'{group}_any'] = f"{any_grade} ({any_pct:.0f}%)" # Grade 3-4 grade_34 = (group_data >= 3).sum() g34_pct = grade_34 / total * 100 if total > 0 else 0 row[f'{group}_g34'] = f"{grade_34} ({g34_pct:.0f}%)" table_rows.append(row) # Save table df_table = pd.DataFrame(table_rows) df_table.to_csv(output_file, index=False) print(f"Safety table saved to: {output_file}") return df_table def generate_latex_table(df, caption, label='table'): """ Convert DataFrame to LaTeX table code. Returns: String with LaTeX table code """ latex_code = "\\begin{table}[H]\n" latex_code += "\\centering\n" latex_code += "\\small\n" latex_code += "\\begin{tabular}{" + "l" * len(df.columns) + "}\n" latex_code += "\\toprule\n" # Header header_row = " & ".join([f"\\textbf{{{col}}}" for col in df.columns]) latex_code += header_row + " \\\\\n" latex_code += "\\midrule\n" # Data rows for _, row in df.iterrows(): # Handle indentation for subcategories (lines starting with spaces) first_col = str(row.iloc[0]) if first_col.startswith(' '): first_col = '\\quad ' + first_col.strip() data_row = [first_col] + [str(val) if pd.notna(val) else '—' for val in row.iloc[1:]] latex_code += " & ".join(data_row) + " \\\\\n" latex_code += "\\bottomrule\n" latex_code += "\\end{tabular}\n" latex_code += f"\\caption{{{caption}}}\n" latex_code += f"\\label{{tab:{label}}}\n" latex_code += "\\end{table}\n" return latex_code def _binomial_ci(successes, trials, confidence=0.95): """ Calculate exact binomial confidence interval (Clopper-Pearson method). Returns: Lower and upper bounds as percentages """ if trials == 0: return 0.0, 0.0 alpha = 1 - confidence # Use beta distribution from scipy.stats import beta if successes == 0: lower = 0.0 else: lower = beta.ppf(alpha/2, successes, trials - successes + 1) if successes == trials: upper = 1.0 else: upper = beta.ppf(1 - alpha/2, successes + 1, trials - successes) return lower * 100, upper * 100 def create_example_data(): """Create example dataset for testing.""" np.random.seed(42) n = 100 data = pd.DataFrame({ 'patient_id': [f'PT{i:03d}' for i in range(1, n+1)], 'group': np.random.choice(['Biomarker+', 'Biomarker-'], n), 'age': np.random.normal(62, 10, n), 'sex': np.random.choice(['Male', 'Female'], n), 'ecog_ps': np.random.choice(['0-1', '2'], n, p=[0.8, 0.2]), 'stage': np.random.choice(['III', 'IV'], n, p=[0.3, 0.7]), 'best_response': np.random.choice(['CR', 'PR', 'SD', 'PD'], n, p=[0.05, 0.35, 0.40, 0.20]), 'fatigue_grade': np.random.choice([0, 1, 2, 3], n, p=[0.3, 0.4, 0.2, 0.1]), 'nausea_grade': np.random.choice([0, 1, 2, 3], n, p=[0.4, 0.35, 0.20, 0.05]), 'neutropenia_grade': np.random.choice([0, 1, 2, 3, 4], n, p=[0.5, 0.2, 0.15, 0.10, 0.05]), }) return data def main(): parser = argparse.ArgumentParser(description='Generate clinical cohort tables') parser.add_argument('input_file', type=str, nargs='?', default=None, help='CSV file with cohort data (if not provided, uses example data)') parser.add_argument('-o', '--output-dir', type=str, default='tables', help='Output directory (default: tables)') parser.add_argument('--group-col', type=str, default='group', help='Column name for grouping variable') parser.add_argument('--example', action='store_true', help='Generate tables using example data') args = parser.parse_args() # Create output directory output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) # Load or create data if args.example or args.input_file is None: print("Generating example dataset...") data = create_example_data() else: print(f"Loading data from {args.input_file}...") data = pd.read_csv(args.input_file) print(f"Dataset: {len(data)} patients, {len(data[args.group_col].unique())} groups") print(f"Groups: {data[args.group_col].value_counts().to_dict()}") # Generate Table 1: Baseline characteristics print("\nGenerating baseline characteristics table...") baseline_table = generate_baseline_table( data, group_col=args.group_col, output_file=output_dir / 'table1_baseline.csv' ) # Generate LaTeX code for baseline table latex_code = generate_latex_table( baseline_table, caption="Baseline patient demographics and clinical characteristics", label="baseline" ) with open(output_dir / 'table1_baseline.tex', 'w') as f: f.write(latex_code) print(f"LaTeX code saved to: {output_dir}/table1_baseline.tex") # Generate Table 2: Efficacy outcomes if 'best_response' in data.columns: print("\nGenerating efficacy outcomes table...") efficacy_table = generate_efficacy_table( data, group_col=args.group_col, output_file=output_dir / 'table2_efficacy.csv' ) latex_code = generate_latex_table( efficacy_table, caption="Treatment efficacy outcomes by group", label="efficacy" ) with open(output_dir / 'table2_efficacy.tex', 'w') as f: f.write(latex_code) # Generate Table 3: Safety (identify AE columns) ae_columns = [col for col in data.columns if col.endswith('_grade')] if ae_columns: print("\nGenerating safety table...") safety_table = generate_safety_table( data, ae_columns=ae_columns, group_col=args.group_col, output_file=output_dir / 'table3_safety.csv' ) latex_code = generate_latex_table( safety_table, caption="Treatment-emergent adverse events by group (CTCAE v5.0)", label="safety" ) with open(output_dir / 'table3_safety.tex', 'w') as f: f.write(latex_code) print(f"\nAll tables generated successfully in {output_dir}/") print("Files created:") print(" - table1_baseline.csv / .tex") print(" - table2_efficacy.csv / .tex (if response data available)") print(" - table3_safety.csv / .tex (if AE data available)") if __name__ == '__main__': main() # Example usage: # python create_cohort_tables.py cohort_data.csv -o tables/ # python create_cohort_tables.py --example # Generate example tables # # Input CSV format: # patient_id,group,age,sex,ecog_ps,stage,best_response,fatigue_grade,nausea_grade,... # PT001,Biomarker+,65,Male,0-1,IV,PR,1,0,... # PT002,Biomarker-,58,Female,0-1,III,SD,2,1,... # ...