Initial commit

2025-11-30 08:30:14 +08:00
commit 1dd5bee3b4
335 changed files with 147360 additions and 0 deletions
--- a/skills/clinical-decision-support/scripts/create_cohort_tables.py
+++ b/skills/clinical-decision-support/scripts/create_cohort_tables.py
@@ -0,0 +1,524 @@
+#!/usr/bin/env python3
+"""
+Generate Clinical Cohort Tables for Baseline Characteristics and Outcomes
+
+Creates publication-ready tables with:
+- Baseline demographics (Table 1 style)
+- Efficacy outcomes
+- Safety/adverse events
+- Statistical comparisons between groups
+
+Dependencies: pandas, numpy, scipy
+"""
+
+import pandas as pd
+import numpy as np
+from scipy import stats
+from pathlib import Path
+import argparse
+
+
+def calculate_p_value(data, variable, group_col='group', var_type='categorical'):
+    """
+    Calculate appropriate p-value for group comparison.
+    
+    Parameters:
+        data: DataFrame
+        variable: Column name to compare
+        group_col: Grouping variable
+        var_type: 'categorical', 'continuous_normal', 'continuous_nonnormal'
+    
+    Returns:
+        p-value (float)
+    """
+    
+    groups = data[group_col].unique()
+    
+    if len(groups) != 2:
+        return np.nan  # Only handle 2-group comparisons
+    
+    group1_data = data[data[group_col] == groups[0]][variable].dropna()
+    group2_data = data[data[group_col] == groups[1]][variable].dropna()
+    
+    if var_type == 'categorical':
+        # Chi-square or Fisher's exact test
+        contingency = pd.crosstab(data[variable], data[group_col])
+        
+        # Check if Fisher's exact is needed (expected count < 5)
+        if contingency.min().min() < 5:
+            # Fisher's exact (2x2 only)
+            if contingency.shape == (2, 2):
+                _, p_value = stats.fisher_exact(contingency)
+            else:
+                # Use chi-square but note limitation
+                _, p_value, _, _ = stats.chi2_contingency(contingency)
+        else:
+            _, p_value, _, _ = stats.chi2_contingency(contingency)
+    
+    elif var_type == 'continuous_normal':
+        # Independent t-test
+        _, p_value = stats.ttest_ind(group1_data, group2_data, equal_var=False)
+    
+    elif var_type == 'continuous_nonnormal':
+        # Mann-Whitney U test
+        _, p_value = stats.mannwhitneyu(group1_data, group2_data, alternative='two-sided')
+    
+    else:
+        raise ValueError("var_type must be 'categorical', 'continuous_normal', or 'continuous_nonnormal'")
+    
+    return p_value
+
+
+def format_continuous_variable(data, variable, group_col, distribution='normal'):
+    """
+    Format continuous variable for table display.
+    
+    Returns:
+        Dictionary with formatted strings for each group and p-value
+    """
+    
+    groups = data[group_col].unique()
+    results = {}
+    
+    for group in groups:
+        group_data = data[data[group_col] == group][variable].dropna()
+        
+        if distribution == 'normal':
+            # Mean ± SD
+            mean = group_data.mean()
+            std = group_data.std()
+            results[group] = f"{mean:.1f} ± {std:.1f}"
+        else:
+            # Median [IQR]
+            median = group_data.median()
+            q1 = group_data.quantile(0.25)
+            q3 = group_data.quantile(0.75)
+            results[group] = f"{median:.1f} [{q1:.1f}-{q3:.1f}]"
+    
+    # Calculate p-value
+    var_type = 'continuous_normal' if distribution == 'normal' else 'continuous_nonnormal'
+    p_value = calculate_p_value(data, variable, group_col, var_type)
+    results['p_value'] = f"{p_value:.3f}" if p_value < 0.001 else f"{p_value:.2f}" if p_value < 1.0 else "—"
+    
+    return results
+
+
+def format_categorical_variable(data, variable, group_col):
+    """
+    Format categorical variable for table display.
+    
+    Returns:
+        List of dictionaries for each category with counts and percentages
+    """
+    
+    groups = data[group_col].unique()
+    categories = data[variable].dropna().unique()
+    
+    results = []
+    
+    for category in categories:
+        row = {'category': category}
+        
+        for group in groups:
+            group_data = data[data[group_col] == group]
+            count = (group_data[variable] == category).sum()
+            total = group_data[variable].notna().sum()
+            percentage = (count / total * 100) if total > 0 else 0
+            row[group] = f"{count} ({percentage:.0f}%)"
+        
+        results.append(row)
+    
+    # Calculate p-value for overall categorical variable
+    p_value = calculate_p_value(data, variable, group_col, 'categorical')
+    results[0]['p_value'] = f"{p_value:.3f}" if p_value < 0.001 else f"{p_value:.2f}" if p_value < 1.0 else "—"
+    
+    return results
+
+
+def generate_baseline_table(data, group_col='group', output_file='table1_baseline.csv'):
+    """
+    Generate Table 1: Baseline characteristics.
+    
+    Customize the variables list for your specific cohort.
+    """
+    
+    groups = data[group_col].unique()
+    
+    # Initialize results list
+    table_rows = []
+    
+    # Header row
+    header = {
+        'Characteristic': 'Characteristic',
+        **{group: f"{group} (n={len(data[data[group_col]==group])})" for group in groups},
+        'p_value': 'p-value'
+    }
+    table_rows.append(header)
+    
+    # Age (continuous)
+    if 'age' in data.columns:
+        age_results = format_continuous_variable(data, 'age', group_col, distribution='nonnormal')
+        row = {'Characteristic': 'Age, years (median [IQR])'}
+        for group in groups:
+            row[group] = age_results[group]
+        row['p_value'] = age_results['p_value']
+        table_rows.append(row)
+    
+    # Sex (categorical)
+    if 'sex' in data.columns:
+        table_rows.append({'Characteristic': 'Sex, n (%)', **{g: '' for g in groups}, 'p_value': ''})
+        sex_results = format_categorical_variable(data, 'sex', group_col)
+        for sex_row in sex_results:
+            row = {'Characteristic': f"  {sex_row['category']}"}
+            for group in groups:
+                row[group] = sex_row[group]
+            row['p_value'] = sex_row.get('p_value', '')
+            table_rows.append(row)
+    
+    # ECOG Performance Status (categorical)
+    if 'ecog_ps' in data.columns:
+        table_rows.append({'Characteristic': 'ECOG PS, n (%)', **{g: '' for g in groups}, 'p_value': ''})
+        ecog_results = format_categorical_variable(data, 'ecog_ps', group_col)
+        for ecog_row in ecog_results:
+            row = {'Characteristic': f"  {ecog_row['category']}"}
+            for group in groups:
+                row[group] = ecog_row[group]
+            row['p_value'] = ecog_row.get('p_value', '')
+            table_rows.append(row)
+    
+    # Convert to DataFrame and save
+    df_table = pd.DataFrame(table_rows)
+    df_table.to_csv(output_file, index=False)
+    print(f"Baseline characteristics table saved to: {output_file}")
+    
+    return df_table
+
+
+def generate_efficacy_table(data, group_col='group', output_file='table2_efficacy.csv'):
+    """
+    Generate efficacy outcomes table.
+    
+    Expected columns:
+    - best_response: CR, PR, SD, PD
+    - Additional binary outcomes (response, disease_control, etc.)
+    """
+    
+    groups = data[group_col].unique()
+    table_rows = []
+    
+    # Header
+    header = {
+        'Outcome': 'Outcome',
+        **{group: f"{group} (n={len(data[data[group_col]==group])})" for group in groups},
+        'p_value': 'p-value'
+    }
+    table_rows.append(header)
+    
+    # Objective Response Rate (ORR = CR + PR)
+    if 'best_response' in data.columns:
+        for group in groups:
+            group_data = data[data[group_col] == group]
+            cr_pr = ((group_data['best_response'] == 'CR') | (group_data['best_response'] == 'PR')).sum()
+            total = len(group_data)
+            orr = cr_pr / total * 100
+            
+            # Calculate exact binomial CI (Clopper-Pearson)
+            ci_lower, ci_upper = _binomial_ci(cr_pr, total)
+            
+            if group == groups[0]:
+                orr_row = {'Outcome': 'ORR, n (%) [95% CI]'}
+            
+            orr_row[group] = f"{cr_pr} ({orr:.0f}%) [{ci_lower:.0f}-{ci_upper:.0f}]"
+        
+        # P-value for ORR difference
+        contingency = pd.crosstab(
+            data['best_response'].isin(['CR', 'PR']),
+            data[group_col]
+        )
+        _, p_value, _, _ = stats.chi2_contingency(contingency)
+        orr_row['p_value'] = f"{p_value:.3f}" if p_value >= 0.001 else "<0.001"
+        table_rows.append(orr_row)
+        
+        # Individual response categories
+        for response in ['CR', 'PR', 'SD', 'PD']:
+            row = {'Outcome': f"  {response}"}
+            for group in groups:
+                group_data = data[data[group_col] == group]
+                count = (group_data['best_response'] == response).sum()
+                total = len(group_data)
+                pct = count / total * 100
+                row[group] = f"{count} ({pct:.0f}%)"
+            row['p_value'] = ''
+            table_rows.append(row)
+    
+    # Disease Control Rate (DCR = CR + PR + SD)
+    if 'best_response' in data.columns:
+        dcr_row = {'Outcome': 'DCR, n (%) [95% CI]'}
+        for group in groups:
+            group_data = data[data[group_col] == group]
+            dcr_count = group_data['best_response'].isin(['CR', 'PR', 'SD']).sum()
+            total = len(group_data)
+            dcr = dcr_count / total * 100
+            ci_lower, ci_upper = _binomial_ci(dcr_count, total)
+            dcr_row[group] = f"{dcr_count} ({dcr:.0f}%) [{ci_lower:.0f}-{ci_upper:.0f}]"
+        
+        # P-value
+        contingency = pd.crosstab(
+            data['best_response'].isin(['CR', 'PR', 'SD']),
+            data[group_col]
+        )
+        _, p_value, _, _ = stats.chi2_contingency(contingency)
+        dcr_row['p_value'] = f"{p_value:.3f}" if p_value >= 0.001 else "<0.001"
+        table_rows.append(dcr_row)
+    
+    # Save table
+    df_table = pd.DataFrame(table_rows)
+    df_table.to_csv(output_file, index=False)
+    print(f"Efficacy table saved to: {output_file}")
+    
+    return df_table
+
+
+def generate_safety_table(data, ae_columns, group_col='group', output_file='table3_safety.csv'):
+    """
+    Generate adverse events table.
+    
+    Parameters:
+        data: DataFrame with AE data
+        ae_columns: List of AE column names (each should have values 0-5 for CTCAE grades)
+        group_col: Grouping variable
+        output_file: Output CSV path
+    """
+    
+    groups = data[group_col].unique()
+    table_rows = []
+    
+    # Header
+    header = {
+        'Adverse Event': 'Adverse Event',
+        **{f'{group}_any': f'Any Grade' for group in groups},
+        **{f'{group}_g34': f'Grade 3-4' for group in groups}
+    }
+    
+    for ae in ae_columns:
+        if ae not in data.columns:
+            continue
+        
+        row = {'Adverse Event': ae.replace('_', ' ').title()}
+        
+        for group in groups:
+            group_data = data[data[group_col] == group][ae].dropna()
+            total = len(group_data)
+            
+            # Any grade (Grade 1-5)
+            any_grade = (group_data > 0).sum()
+            any_pct = any_grade / total * 100 if total > 0 else 0
+            row[f'{group}_any'] = f"{any_grade} ({any_pct:.0f}%)"
+            
+            # Grade 3-4
+            grade_34 = (group_data >= 3).sum()
+            g34_pct = grade_34 / total * 100 if total > 0 else 0
+            row[f'{group}_g34'] = f"{grade_34} ({g34_pct:.0f}%)"
+        
+        table_rows.append(row)
+    
+    # Save table
+    df_table = pd.DataFrame(table_rows)
+    df_table.to_csv(output_file, index=False)
+    print(f"Safety table saved to: {output_file}")
+    
+    return df_table
+
+
+def generate_latex_table(df, caption, label='table'):
+    """
+    Convert DataFrame to LaTeX table code.
+    
+    Returns:
+        String with LaTeX table code
+    """
+    
+    latex_code = "\\begin{table}[H]\n"
+    latex_code += "\\centering\n"
+    latex_code += "\\small\n"
+    latex_code += "\\begin{tabular}{" + "l" * len(df.columns) + "}\n"
+    latex_code += "\\toprule\n"
+    
+    # Header
+    header_row = " & ".join([f"\\textbf{{{col}}}" for col in df.columns])
+    latex_code += header_row + " \\\\\n"
+    latex_code += "\\midrule\n"
+    
+    # Data rows
+    for _, row in df.iterrows():
+        # Handle indentation for subcategories (lines starting with spaces)
+        first_col = str(row.iloc[0])
+        if first_col.startswith('  '):
+            first_col = '\\quad ' + first_col.strip()
+        
+        data_row = [first_col] + [str(val) if pd.notna(val) else '—' for val in row.iloc[1:]]
+        latex_code += " & ".join(data_row) + " \\\\\n"
+    
+    latex_code += "\\bottomrule\n"
+    latex_code += "\\end{tabular}\n"
+    latex_code += f"\\caption{{{caption}}}\n"
+    latex_code += f"\\label{{tab:{label}}}\n"
+    latex_code += "\\end{table}\n"
+    
+    return latex_code
+
+
+def _binomial_ci(successes, trials, confidence=0.95):
+    """
+    Calculate exact binomial confidence interval (Clopper-Pearson method).
+    
+    Returns:
+        Lower and upper bounds as percentages
+    """
+    
+    if trials == 0:
+        return 0.0, 0.0
+    
+    alpha = 1 - confidence
+    
+    # Use beta distribution
+    from scipy.stats import beta
+    
+    if successes == 0:
+        lower = 0.0
+    else:
+        lower = beta.ppf(alpha/2, successes, trials - successes + 1)
+    
+    if successes == trials:
+        upper = 1.0
+    else:
+        upper = beta.ppf(1 - alpha/2, successes + 1, trials - successes)
+    
+    return lower * 100, upper * 100
+
+
+def create_example_data():
+    """Create example dataset for testing."""
+    
+    np.random.seed(42)
+    n = 100
+    
+    data = pd.DataFrame({
+        'patient_id': [f'PT{i:03d}' for i in range(1, n+1)],
+        'group': np.random.choice(['Biomarker+', 'Biomarker-'], n),
+        'age': np.random.normal(62, 10, n),
+        'sex': np.random.choice(['Male', 'Female'], n),
+        'ecog_ps': np.random.choice(['0-1', '2'], n, p=[0.8, 0.2]),
+        'stage': np.random.choice(['III', 'IV'], n, p=[0.3, 0.7]),
+        'best_response': np.random.choice(['CR', 'PR', 'SD', 'PD'], n, p=[0.05, 0.35, 0.40, 0.20]),
+        'fatigue_grade': np.random.choice([0, 1, 2, 3], n, p=[0.3, 0.4, 0.2, 0.1]),
+        'nausea_grade': np.random.choice([0, 1, 2, 3], n, p=[0.4, 0.35, 0.20, 0.05]),
+        'neutropenia_grade': np.random.choice([0, 1, 2, 3, 4], n, p=[0.5, 0.2, 0.15, 0.10, 0.05]),
+    })
+    
+    return data
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Generate clinical cohort tables')
+    parser.add_argument('input_file', type=str, nargs='?', default=None,
+                       help='CSV file with cohort data (if not provided, uses example data)')
+    parser.add_argument('-o', '--output-dir', type=str, default='tables',
+                       help='Output directory (default: tables)')
+    parser.add_argument('--group-col', type=str, default='group',
+                       help='Column name for grouping variable')
+    parser.add_argument('--example', action='store_true',
+                       help='Generate tables using example data')
+    
+    args = parser.parse_args()
+    
+    # Create output directory
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    
+    # Load or create data
+    if args.example or args.input_file is None:
+        print("Generating example dataset...")
+        data = create_example_data()
+    else:
+        print(f"Loading data from {args.input_file}...")
+        data = pd.read_csv(args.input_file)
+    
+    print(f"Dataset: {len(data)} patients, {len(data[args.group_col].unique())} groups")
+    print(f"Groups: {data[args.group_col].value_counts().to_dict()}")
+    
+    # Generate Table 1: Baseline characteristics
+    print("\nGenerating baseline characteristics table...")
+    baseline_table = generate_baseline_table(
+        data, 
+        group_col=args.group_col,
+        output_file=output_dir / 'table1_baseline.csv'
+    )
+    
+    # Generate LaTeX code for baseline table
+    latex_code = generate_latex_table(
+        baseline_table,
+        caption="Baseline patient demographics and clinical characteristics",
+        label="baseline"
+    )
+    with open(output_dir / 'table1_baseline.tex', 'w') as f:
+        f.write(latex_code)
+    print(f"LaTeX code saved to: {output_dir}/table1_baseline.tex")
+    
+    # Generate Table 2: Efficacy outcomes
+    if 'best_response' in data.columns:
+        print("\nGenerating efficacy outcomes table...")
+        efficacy_table = generate_efficacy_table(
+            data,
+            group_col=args.group_col,
+            output_file=output_dir / 'table2_efficacy.csv'
+        )
+        
+        latex_code = generate_latex_table(
+            efficacy_table,
+            caption="Treatment efficacy outcomes by group",
+            label="efficacy"
+        )
+        with open(output_dir / 'table2_efficacy.tex', 'w') as f:
+            f.write(latex_code)
+    
+    # Generate Table 3: Safety (identify AE columns)
+    ae_columns = [col for col in data.columns if col.endswith('_grade')]
+    if ae_columns:
+        print("\nGenerating safety table...")
+        safety_table = generate_safety_table(
+            data,
+            ae_columns=ae_columns,
+            group_col=args.group_col,
+            output_file=output_dir / 'table3_safety.csv'
+        )
+        
+        latex_code = generate_latex_table(
+            safety_table,
+            caption="Treatment-emergent adverse events by group (CTCAE v5.0)",
+            label="safety"
+        )
+        with open(output_dir / 'table3_safety.tex', 'w') as f:
+            f.write(latex_code)
+    
+    print(f"\nAll tables generated successfully in {output_dir}/")
+    print("Files created:")
+    print("  - table1_baseline.csv / .tex")
+    print("  - table2_efficacy.csv / .tex (if response data available)")
+    print("  - table3_safety.csv / .tex (if AE data available)")
+
+
+if __name__ == '__main__':
+    main()
+
+
+# Example usage:
+# python create_cohort_tables.py cohort_data.csv -o tables/
+# python create_cohort_tables.py --example  # Generate example tables
+#
+# Input CSV format:
+# patient_id,group,age,sex,ecog_ps,stage,best_response,fatigue_grade,nausea_grade,...
+# PT001,Biomarker+,65,Male,0-1,IV,PR,1,0,...
+# PT002,Biomarker-,58,Female,0-1,III,SD,2,1,...
+# ...
+