#!/usr/bin/env python3 """ deepTools Workflow Generator Generates bash script templates for common deepTools workflows. """ import argparse import sys WORKFLOWS = { 'chipseq_qc': { 'name': 'ChIP-seq Quality Control', 'description': 'Complete QC workflow for ChIP-seq experiments', }, 'chipseq_analysis': { 'name': 'ChIP-seq Complete Analysis', 'description': 'Full ChIP-seq analysis from BAM to heatmaps', }, 'rnaseq_coverage': { 'name': 'RNA-seq Coverage Tracks', 'description': 'Generate strand-specific RNA-seq coverage', }, 'atacseq': { 'name': 'ATAC-seq Analysis', 'description': 'ATAC-seq workflow with Tn5 correction', }, } def generate_chipseq_qc_workflow(output_file, params): """Generate ChIP-seq QC workflow script.""" script = f"""#!/bin/bash # deepTools ChIP-seq Quality Control Workflow # Generated by deepTools workflow generator # Configuration INPUT_BAM="{params.get('input_bam', 'Input.bam')}" CHIP_BAM=("{params.get('chip_bams', 'ChIP1.bam ChIP2.bam')}") GENOME_SIZE={params.get('genome_size', '2913022398')} THREADS={params.get('threads', '8')} OUTPUT_DIR="{params.get('output_dir', 'deeptools_qc')}" # Create output directory mkdir -p $OUTPUT_DIR echo "=== Starting ChIP-seq QC workflow ===" # Step 1: Correlation analysis echo "Step 1: Computing correlation matrix..." multiBamSummary bins \\ --bamfiles $INPUT_BAM ${{CHIP_BAM[@]}} \\ -o $OUTPUT_DIR/readCounts.npz \\ --numberOfProcessors $THREADS echo "Step 2: Generating correlation heatmap..." plotCorrelation \\ -in $OUTPUT_DIR/readCounts.npz \\ --corMethod pearson \\ --whatToShow heatmap \\ --plotFile $OUTPUT_DIR/correlation_heatmap.png \\ --plotNumbers echo "Step 3: Generating PCA plot..." plotPCA \\ -in $OUTPUT_DIR/readCounts.npz \\ -o $OUTPUT_DIR/PCA_plot.png \\ -T "PCA of ChIP-seq samples" # Step 2: Coverage assessment echo "Step 4: Assessing coverage..." plotCoverage \\ --bamfiles $INPUT_BAM ${{CHIP_BAM[@]}} \\ --plotFile $OUTPUT_DIR/coverage.png \\ --ignoreDuplicates \\ --numberOfProcessors $THREADS # Step 3: Fragment size (for paired-end data) echo "Step 5: Analyzing fragment sizes..." bamPEFragmentSize \\ --bamfiles $INPUT_BAM ${{CHIP_BAM[@]}} \\ --histogram $OUTPUT_DIR/fragmentSizes.png \\ --plotTitle "Fragment Size Distribution" # Step 4: ChIP signal strength echo "Step 6: Evaluating ChIP enrichment..." plotFingerprint \\ --bamfiles $INPUT_BAM ${{CHIP_BAM[@]}} \\ --plotFile $OUTPUT_DIR/fingerprint.png \\ --extendReads 200 \\ --ignoreDuplicates \\ --numberOfProcessors $THREADS \\ --outQualityMetrics $OUTPUT_DIR/fingerprint_metrics.txt echo "=== ChIP-seq QC workflow complete ===" echo "Results are in: $OUTPUT_DIR" """ with open(output_file, 'w') as f: f.write(script) return f"✓ Generated ChIP-seq QC workflow: {output_file}" def generate_chipseq_analysis_workflow(output_file, params): """Generate complete ChIP-seq analysis workflow script.""" script = f"""#!/bin/bash # deepTools ChIP-seq Complete Analysis Workflow # Generated by deepTools workflow generator # Configuration INPUT_BAM="{params.get('input_bam', 'Input.bam')}" CHIP_BAM="{params.get('chip_bam', 'ChIP.bam')}" GENES_BED="{params.get('genes_bed', 'genes.bed')}" PEAKS_BED="{params.get('peaks_bed', 'peaks.bed')}" GENOME_SIZE={params.get('genome_size', '2913022398')} THREADS={params.get('threads', '8')} OUTPUT_DIR="{params.get('output_dir', 'chipseq_analysis')}" # Create output directory mkdir -p $OUTPUT_DIR echo "=== Starting ChIP-seq analysis workflow ===" # Step 1: Generate normalized coverage tracks echo "Step 1: Generating coverage tracks..." bamCoverage \\ --bam $INPUT_BAM \\ --outFileName $OUTPUT_DIR/Input_coverage.bw \\ --normalizeUsing RPGC \\ --effectiveGenomeSize $GENOME_SIZE \\ --binSize 10 \\ --extendReads 200 \\ --ignoreDuplicates \\ --numberOfProcessors $THREADS bamCoverage \\ --bam $CHIP_BAM \\ --outFileName $OUTPUT_DIR/ChIP_coverage.bw \\ --normalizeUsing RPGC \\ --effectiveGenomeSize $GENOME_SIZE \\ --binSize 10 \\ --extendReads 200 \\ --ignoreDuplicates \\ --numberOfProcessors $THREADS # Step 2: Create log2 ratio track echo "Step 2: Creating log2 ratio track..." bamCompare \\ --bamfile1 $CHIP_BAM \\ --bamfile2 $INPUT_BAM \\ --outFileName $OUTPUT_DIR/ChIP_vs_Input_log2ratio.bw \\ --operation log2 \\ --scaleFactorsMethod readCount \\ --binSize 10 \\ --extendReads 200 \\ --ignoreDuplicates \\ --numberOfProcessors $THREADS # Step 3: Compute matrix around TSS echo "Step 3: Computing matrix around TSS..." computeMatrix reference-point \\ --referencePoint TSS \\ --scoreFileName $OUTPUT_DIR/ChIP_coverage.bw \\ --regionsFileName $GENES_BED \\ --beforeRegionStartLength 3000 \\ --afterRegionStartLength 3000 \\ --binSize 10 \\ --sortRegions descend \\ --sortUsing mean \\ --outFileName $OUTPUT_DIR/matrix_TSS.gz \\ --numberOfProcessors $THREADS # Step 4: Generate heatmap echo "Step 4: Generating heatmap..." plotHeatmap \\ --matrixFile $OUTPUT_DIR/matrix_TSS.gz \\ --outFileName $OUTPUT_DIR/heatmap_TSS.png \\ --colorMap RdBu \\ --whatToShow 'plot, heatmap and colorbar' \\ --yAxisLabel "Genes" \\ --xAxisLabel "Distance from TSS (bp)" \\ --refPointLabel "TSS" \\ --heatmapHeight 15 \\ --kmeans 3 # Step 5: Generate profile plot echo "Step 5: Generating profile plot..." plotProfile \\ --matrixFile $OUTPUT_DIR/matrix_TSS.gz \\ --outFileName $OUTPUT_DIR/profile_TSS.png \\ --plotType lines \\ --perGroup \\ --colors blue \\ --plotTitle "ChIP-seq signal around TSS" \\ --yAxisLabel "Average signal" \\ --refPointLabel "TSS" # Step 6: Enrichment at peaks (if peaks provided) if [ -f "$PEAKS_BED" ]; then echo "Step 6: Calculating enrichment at peaks..." plotEnrichment \\ --bamfiles $INPUT_BAM $CHIP_BAM \\ --BED $PEAKS_BED \\ --labels Input ChIP \\ --plotFile $OUTPUT_DIR/enrichment.png \\ --outRawCounts $OUTPUT_DIR/enrichment_counts.tab \\ --extendReads 200 \\ --ignoreDuplicates fi echo "=== ChIP-seq analysis complete ===" echo "Results are in: $OUTPUT_DIR" """ with open(output_file, 'w') as f: f.write(script) return f"✓ Generated ChIP-seq analysis workflow: {output_file}" def generate_rnaseq_coverage_workflow(output_file, params): """Generate RNA-seq coverage workflow script.""" script = f"""#!/bin/bash # deepTools RNA-seq Coverage Workflow # Generated by deepTools workflow generator # Configuration RNASEQ_BAM="{params.get('rnaseq_bam', 'rnaseq.bam')}" THREADS={params.get('threads', '8')} OUTPUT_DIR="{params.get('output_dir', 'rnaseq_coverage')}" # Create output directory mkdir -p $OUTPUT_DIR echo "=== Starting RNA-seq coverage workflow ===" # Generate strand-specific coverage tracks echo "Step 1: Generating forward strand coverage..." bamCoverage \\ --bam $RNASEQ_BAM \\ --outFileName $OUTPUT_DIR/forward_coverage.bw \\ --filterRNAstrand forward \\ --normalizeUsing CPM \\ --binSize 1 \\ --numberOfProcessors $THREADS echo "Step 2: Generating reverse strand coverage..." bamCoverage \\ --bam $RNASEQ_BAM \\ --outFileName $OUTPUT_DIR/reverse_coverage.bw \\ --filterRNAstrand reverse \\ --normalizeUsing CPM \\ --binSize 1 \\ --numberOfProcessors $THREADS echo "=== RNA-seq coverage workflow complete ===" echo "Results are in: $OUTPUT_DIR" echo "" echo "Note: These bigWig files can be loaded into genome browsers" echo "for strand-specific visualization of RNA-seq data." """ with open(output_file, 'w') as f: f.write(script) return f"✓ Generated RNA-seq coverage workflow: {output_file}" def generate_atacseq_workflow(output_file, params): """Generate ATAC-seq workflow script.""" script = f"""#!/bin/bash # deepTools ATAC-seq Analysis Workflow # Generated by deepTools workflow generator # Configuration ATAC_BAM="{params.get('atac_bam', 'atacseq.bam')}" PEAKS_BED="{params.get('peaks_bed', 'peaks.bed')}" GENOME_SIZE={params.get('genome_size', '2913022398')} THREADS={params.get('threads', '8')} OUTPUT_DIR="{params.get('output_dir', 'atacseq_analysis')}" # Create output directory mkdir -p $OUTPUT_DIR echo "=== Starting ATAC-seq analysis workflow ===" # Step 1: Shift reads for Tn5 correction echo "Step 1: Applying Tn5 offset correction..." alignmentSieve \\ --bam $ATAC_BAM \\ --outFile $OUTPUT_DIR/atacseq_shifted.bam \\ --ATACshift \\ --minFragmentLength 38 \\ --maxFragmentLength 2000 \\ --ignoreDuplicates # Index the shifted BAM samtools index $OUTPUT_DIR/atacseq_shifted.bam # Step 2: Generate coverage track echo "Step 2: Generating coverage track..." bamCoverage \\ --bam $OUTPUT_DIR/atacseq_shifted.bam \\ --outFileName $OUTPUT_DIR/atacseq_coverage.bw \\ --normalizeUsing RPGC \\ --effectiveGenomeSize $GENOME_SIZE \\ --binSize 1 \\ --numberOfProcessors $THREADS # Step 3: Fragment size analysis echo "Step 3: Analyzing fragment sizes..." bamPEFragmentSize \\ --bamfiles $ATAC_BAM \\ --histogram $OUTPUT_DIR/fragmentSizes.png \\ --maxFragmentLength 1000 # Step 4: Compute matrix at peaks (if peaks provided) if [ -f "$PEAKS_BED" ]; then echo "Step 4: Computing matrix at peaks..." computeMatrix reference-point \\ --referencePoint center \\ --scoreFileName $OUTPUT_DIR/atacseq_coverage.bw \\ --regionsFileName $PEAKS_BED \\ --beforeRegionStartLength 2000 \\ --afterRegionStartLength 2000 \\ --binSize 10 \\ --outFileName $OUTPUT_DIR/matrix_peaks.gz \\ --numberOfProcessors $THREADS echo "Step 5: Generating heatmap..." plotHeatmap \\ --matrixFile $OUTPUT_DIR/matrix_peaks.gz \\ --outFileName $OUTPUT_DIR/heatmap_peaks.png \\ --colorMap YlOrRd \\ --refPointLabel "Peak Center" \\ --heatmapHeight 15 fi echo "=== ATAC-seq analysis complete ===" echo "Results are in: $OUTPUT_DIR" echo "" echo "Expected fragment size pattern:" echo " ~50bp: nucleosome-free regions" echo " ~200bp: mono-nucleosome" echo " ~400bp: di-nucleosome" """ with open(output_file, 'w') as f: f.write(script) return f"✓ Generated ATAC-seq workflow: {output_file}" def main(): parser = argparse.ArgumentParser( description="Generate deepTools workflow scripts", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=f""" Available workflows: {chr(10).join(f" {key}: {value['name']}" for key, value in WORKFLOWS.items())} Examples: # Generate ChIP-seq QC workflow python workflow_generator.py chipseq_qc -o chipseq_qc.sh # Generate ChIP-seq analysis with custom parameters python workflow_generator.py chipseq_analysis -o analysis.sh \\ --chip-bam H3K4me3.bam --input-bam Input.bam # List all available workflows python workflow_generator.py --list """ ) parser.add_argument('workflow', nargs='?', choices=list(WORKFLOWS.keys()), help='Workflow type to generate') parser.add_argument('-o', '--output', default='deeptools_workflow.sh', help='Output script filename (default: deeptools_workflow.sh)') parser.add_argument('--list', action='store_true', help='List all available workflows') # Common parameters parser.add_argument('--threads', type=int, default=8, help='Number of threads (default: 8)') parser.add_argument('--genome-size', type=int, default=2913022398, help='Effective genome size (default: 2913022398 for hg38)') parser.add_argument('--output-dir', default=None, help='Output directory for results') # Workflow-specific parameters parser.add_argument('--input-bam', help='Input/control BAM file') parser.add_argument('--chip-bam', help='ChIP BAM file') parser.add_argument('--chip-bams', help='Multiple ChIP BAM files (space-separated)') parser.add_argument('--rnaseq-bam', help='RNA-seq BAM file') parser.add_argument('--atac-bam', help='ATAC-seq BAM file') parser.add_argument('--genes-bed', help='Genes BED file') parser.add_argument('--peaks-bed', help='Peaks BED file') args = parser.parse_args() # List workflows if args.list: print("\nAvailable deepTools workflows:\n") for key, value in WORKFLOWS.items(): print(f" {key}") print(f" {value['name']}") print(f" {value['description']}\n") sys.exit(0) # Check if workflow was specified if not args.workflow: parser.print_help() sys.exit(1) # Prepare parameters params = { 'threads': args.threads, 'genome_size': args.genome_size, 'output_dir': args.output_dir or f"{args.workflow}_output", 'input_bam': args.input_bam, 'chip_bam': args.chip_bam, 'chip_bams': args.chip_bams, 'rnaseq_bam': args.rnaseq_bam, 'atac_bam': args.atac_bam, 'genes_bed': args.genes_bed, 'peaks_bed': args.peaks_bed, } # Generate workflow if args.workflow == 'chipseq_qc': message = generate_chipseq_qc_workflow(args.output, params) elif args.workflow == 'chipseq_analysis': message = generate_chipseq_analysis_workflow(args.output, params) elif args.workflow == 'rnaseq_coverage': message = generate_rnaseq_coverage_workflow(args.output, params) elif args.workflow == 'atacseq': message = generate_atacseq_workflow(args.output, params) print(message) print(f"\nTo run the workflow:") print(f" chmod +x {args.output}") print(f" ./{args.output}") print(f"\nNote: Edit the script to customize file paths and parameters.") if __name__ == "__main__": main()