Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/deeptools/scripts/validate_files.py
+++ b/skills/deeptools/scripts/validate_files.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+"""
+deepTools File Validation Script
+
+Validates BAM, bigWig, and BED files for deepTools analysis.
+Checks for file existence, proper indexing, and basic format requirements.
+"""
+
+import os
+import sys
+import argparse
+from pathlib import Path
+
+
+def check_file_exists(filepath):
+    """Check that ``filepath`` is an existing, readable regular file.
+
+    Args:
+        filepath: Path of the file to check.
+
+    Returns:
+        Tuple of (success: bool, message: str).
+    """
+    if not os.path.exists(filepath):
+        return False, f"File not found: {filepath}"
+    # A directory satisfies os.path.exists() but can never be a usable input
+    # file, so reject it here instead of letting a later check misreport it.
+    if not os.path.isfile(filepath):
+        return False, f"Not a regular file: {filepath}"
+    if not os.access(filepath, os.R_OK):
+        return False, f"File not readable: {filepath}"
+    return True, f"✓ File exists: {filepath}"
+
+
+def check_bam_index(bam_file):
+    """Check if BAM file has an index (<file>.bam.bai or <file>.bai).
+
+    Args:
+        bam_file: Path of the BAM file whose index is looked up.
+
+    Returns:
+        Tuple of (success: bool, message: str). On failure the message
+        includes the ``samtools index`` command that creates the index.
+    """
+    candidates = [bam_file + ".bai"]
+
+    # Only swap the extension when the path really ends in ".bam". A blanket
+    # str.replace() would also rewrite ".bam" inside directory names and, for
+    # paths without the suffix, return the BAM path unchanged -- which exists
+    # and would be reported as its own index.
+    stem, ext = os.path.splitext(bam_file)
+    if ext.lower() == ".bam":
+        candidates.append(stem + ".bai")
+
+    for index_path in candidates:
+        if os.path.exists(index_path):
+            return True, f"✓ BAM index found: {index_path}"
+    return False, f"✗ BAM index missing for: {bam_file}\n  Run: samtools index {bam_file}"
+
+
+def check_bigwig_file(bw_file):
+    """Basic check for bigWig file: plausible size and bigWig magic number.
+
+    Args:
+        bw_file: Path of the bigWig file to check.
+
+    Returns:
+        Tuple of (success: bool, message: str).
+    """
+    # The first four bytes of a bigWig file hold this magic number, stored in
+    # the byte order of the machine that wrote the file.
+    bigwig_magic = 0x888FFC26
+
+    try:
+        file_size = os.path.getsize(bw_file)
+        with open(bw_file, 'rb') as f:
+            header = f.read(4)
+    except OSError as e:
+        return False, f"✗ Error reading bigWig file: {bw_file}\n  Error: {e}"
+
+    # Check file size (bigWig files should have reasonable size)
+    if file_size < 100:
+        return False, f"✗ bigWig file suspiciously small: {bw_file} ({file_size} bytes)"
+
+    # Size alone accepts any file; the magic number rules out other formats.
+    if bigwig_magic not in (int.from_bytes(header, 'little'), int.from_bytes(header, 'big')):
+        return False, f"✗ Not a bigWig file (magic number mismatch): {bw_file}"
+
+    return True, f"✓ bigWig file appears valid: {bw_file} ({file_size} bytes)"
+
+
+def check_bed_file(bed_file):
+    """Basic validation of BED file format.
+
+    Blank lines, lines starting with '#', and UCSC "track"/"browser" header
+    lines are skipped. The first 10 remaining lines are checked for at least
+    3 tab-separated columns with integer start/end and start < end.
+
+    Args:
+        bed_file: Path of the BED file to check.
+
+    Returns:
+        Tuple of (success: bool, message: str). Line numbers in error
+        messages are 1-based positions in the file itself.
+    """
+    max_checked = 10
+    region_count = 0
+
+    try:
+        with open(bed_file, 'r') as f:
+            # Enumerate the raw file so reported line numbers match what the
+            # user sees in an editor, even after comments or blank lines.
+            for line_no, raw_line in enumerate(f, 1):
+                line = raw_line.strip()
+                if not line or raw_line.startswith('#'):
+                    continue
+                # UCSC header lines are not regions; compare the whole first
+                # word so a chromosome named e.g. "track1" is still checked.
+                if line.split(None, 1)[0] in ('track', 'browser'):
+                    continue
+
+                region_count += 1
+                if region_count > max_checked:
+                    # Only the first few regions are validated; keep counting.
+                    continue
+
+                fields = line.split('\t')
+                if len(fields) < 3:
+                    return False, f"✗ BED file format error at line {line_no}: expected at least 3 columns\n  Line: {line}"
+
+                # Check if start and end are integers
+                try:
+                    start = int(fields[1])
+                    end = int(fields[2])
+                except ValueError:
+                    return False, f"✗ BED file format error at line {line_no}: start and end must be integers\n  Line: {line}"
+
+                if start >= end:
+                    return False, f"✗ BED file error at line {line_no}: start >= end ({start} >= {end})"
+    except (OSError, UnicodeError) as e:
+        return False, f"✗ Error reading BED file: {bed_file}\n  Error: {e}"
+
+    if region_count == 0:
+        return False, f"✗ BED file is empty: {bed_file}"
+
+    return True, f"✓ BED file format appears valid: {bed_file} ({region_count} regions)"
+
+
+def validate_files(bam_files=None, bigwig_files=None, bed_files=None):
+    """
+    Run the existence check and the type-specific check on every given file.
+
+    File types are processed in the order BAM, bigWig, BED; a type whose
+    list is empty or None is skipped entirely (no section header is emitted).
+
+    Args:
+        bam_files: List of BAM file paths
+        bigwig_files: List of bigWig file paths
+        bed_files: List of BED file paths
+
+    Returns:
+        Tuple of (success: bool, messages: list)
+    """
+    report = []
+    failures = 0
+
+    def run_group(banner, paths, type_check):
+        """Append the results for one file type to ``report``; return the failure count."""
+        failed = 0
+        report.append(banner)
+        for path in paths:
+            exists, note = check_file_exists(path)
+            report.append(note)
+            if not exists:
+                # The type-specific check needs a file that passed this check.
+                failed += 1
+                continue
+
+            passed, note = type_check(path)
+            report.append(note)
+            if not passed:
+                failed += 1
+        return failed
+
+    if bam_files:
+        failures += run_group("\n=== Validating BAM Files ===", bam_files, check_bam_index)
+    if bigwig_files:
+        failures += run_group("\n=== Validating bigWig Files ===", bigwig_files, check_bigwig_file)
+    if bed_files:
+        failures += run_group("\n=== Validating BED Files ===", bed_files, check_bed_file)
+
+    return failures == 0, report
+
+
+def main():
+    """Command-line entry point.
+
+    Parses --bam / --bigwig (--bw) / --bed, runs validate_files() on the
+    given paths, prints every message followed by a summary line, and exits
+    with status 0 when all checks pass and 1 otherwise (including when no
+    files were given, in which case the help text is printed).
+    """
+    usage_examples = """
+Examples:
+  # Validate BAM files
+  python validate_files.py --bam sample1.bam sample2.bam
+
+  # Validate all file types
+  python validate_files.py --bam input.bam chip.bam --bed peaks.bed --bigwig signal.bw
+
+  # Validate from a directory
+  python validate_files.py --bam *.bam --bed *.bed
+        """
+    cli = argparse.ArgumentParser(
+        description="Validate files for deepTools analysis",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=usage_examples,
+    )
+    cli.add_argument('--bam', nargs='+', help='BAM files to validate')
+    cli.add_argument('--bigwig', '--bw', nargs='+', help='bigWig files to validate')
+    cli.add_argument('--bed', nargs='+', help='BED files to validate')
+    options = cli.parse_args()
+
+    # Nothing to validate: show usage and signal an error.
+    if not (options.bam or options.bigwig or options.bed):
+        cli.print_help()
+        sys.exit(1)
+
+    all_passed, report = validate_files(
+        bam_files=options.bam,
+        bigwig_files=options.bigwig,
+        bed_files=options.bed,
+    )
+
+    for entry in report:
+        print(entry)
+
+    # Summary line and matching exit status.
+    print("\n" + "="*50)
+    if all_passed:
+        summary, exit_code = "✓ All validations passed!", 0
+    else:
+        summary, exit_code = "✗ Some validations failed. Please fix the issues above.", 1
+    print(summary)
+    sys.exit(exit_code)
+
+
+# Run the command-line interface only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()
--- a/skills/deeptools/scripts/workflow_generator.py
+++ b/skills/deeptools/scripts/workflow_generator.py
@@ -0,0 +1,454 @@
+#!/usr/bin/env python3
+"""
+deepTools Workflow Generator
+
+Generates bash script templates for common deepTools workflows.
+"""
+
+import argparse
+import sys
+
+
+# Registry of the workflows this script can generate. The keys are the
+# identifiers accepted as the positional `workflow` argument; 'name' is shown
+# in the --help epilog and by --list, 'description' by --list only.
+WORKFLOWS = {
+    'chipseq_qc': {
+        'name': 'ChIP-seq Quality Control',
+        'description': 'Complete QC workflow for ChIP-seq experiments',
+    },
+    'chipseq_analysis': {
+        'name': 'ChIP-seq Complete Analysis',
+        'description': 'Full ChIP-seq analysis from BAM to heatmaps',
+    },
+    'rnaseq_coverage': {
+        'name': 'RNA-seq Coverage Tracks',
+        'description': 'Generate strand-specific RNA-seq coverage',
+    },
+    'atacseq': {
+        'name': 'ATAC-seq Analysis',
+        'description': 'ATAC-seq workflow with Tn5 correction',
+    },
+}
+
+
+def generate_chipseq_qc_workflow(output_file, params):
+    """Generate ChIP-seq QC workflow script.
+
+    Args:
+        output_file: Path of the bash script to write (overwritten if present).
+        params: Mapping of optional settings: 'input_bam', 'chip_bams' (a
+            whitespace-separated string or a sequence of paths), 'genome_size',
+            'threads' and 'output_dir'. Keys that are missing or set to None
+            fall back to the built-in defaults.
+
+    Returns:
+        Confirmation message naming the generated file.
+    """
+    def setting(key, default):
+        # Callers may pass None for options that were not supplied; a plain
+        # params.get(key, default) would then render the literal text "None".
+        value = params.get(key)
+        return default if value is None else value
+
+    chip_bams = setting('chip_bams', 'ChIP1.bam ChIP2.bam')
+    if isinstance(chip_bams, str):
+        chip_bams = chip_bams.split()
+    # One quoted word per file, so the bash array has one element per BAM
+    # instead of a single element holding the whole space-separated string.
+    chip_bam_words = ' '.join(f'"{bam}"' for bam in chip_bams)
+
+    script = f"""#!/bin/bash
+# deepTools ChIP-seq Quality Control Workflow
+# Generated by deepTools workflow generator
+
+# Configuration
+INPUT_BAM="{setting('input_bam', 'Input.bam')}"
+CHIP_BAM=({chip_bam_words})
+GENOME_SIZE={setting('genome_size', '2913022398')}
+THREADS={setting('threads', '8')}
+OUTPUT_DIR="{setting('output_dir', 'deeptools_qc')}"
+
+# Create output directory
+mkdir -p "$OUTPUT_DIR"
+
+echo "=== Starting ChIP-seq QC workflow ==="
+
+# Steps 1-3: Correlation analysis
+echo "Step 1: Computing correlation matrix..."
+multiBamSummary bins \\
+    --bamfiles "$INPUT_BAM" "${{CHIP_BAM[@]}}" \\
+    -o "$OUTPUT_DIR/readCounts.npz" \\
+    --numberOfProcessors $THREADS
+
+echo "Step 2: Generating correlation heatmap..."
+plotCorrelation \\
+    -in "$OUTPUT_DIR/readCounts.npz" \\
+    --corMethod pearson \\
+    --whatToShow heatmap \\
+    --plotFile "$OUTPUT_DIR/correlation_heatmap.png" \\
+    --plotNumbers
+
+echo "Step 3: Generating PCA plot..."
+plotPCA \\
+    -in "$OUTPUT_DIR/readCounts.npz" \\
+    -o "$OUTPUT_DIR/PCA_plot.png" \\
+    -T "PCA of ChIP-seq samples"
+
+# Step 4: Coverage assessment
+echo "Step 4: Assessing coverage..."
+plotCoverage \\
+    --bamfiles "$INPUT_BAM" "${{CHIP_BAM[@]}}" \\
+    --plotFile "$OUTPUT_DIR/coverage.png" \\
+    --ignoreDuplicates \\
+    --numberOfProcessors $THREADS
+
+# Step 5: Fragment size (for paired-end data)
+echo "Step 5: Analyzing fragment sizes..."
+bamPEFragmentSize \\
+    --bamfiles "$INPUT_BAM" "${{CHIP_BAM[@]}}" \\
+    --histogram "$OUTPUT_DIR/fragmentSizes.png" \\
+    --plotTitle "Fragment Size Distribution"
+
+# Step 6: ChIP signal strength
+echo "Step 6: Evaluating ChIP enrichment..."
+plotFingerprint \\
+    --bamfiles "$INPUT_BAM" "${{CHIP_BAM[@]}}" \\
+    --plotFile "$OUTPUT_DIR/fingerprint.png" \\
+    --extendReads 200 \\
+    --ignoreDuplicates \\
+    --numberOfProcessors $THREADS \\
+    --outQualityMetrics "$OUTPUT_DIR/fingerprint_metrics.txt"
+
+echo "=== ChIP-seq QC workflow complete ==="
+echo "Results are in: $OUTPUT_DIR"
+"""
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write(script)
+
+    return f"✓ Generated ChIP-seq QC workflow: {output_file}"
+
+
+def generate_chipseq_analysis_workflow(output_file, params):
+    """Generate complete ChIP-seq analysis workflow script.
+
+    Args:
+        output_file: Path of the bash script to write (overwritten if present).
+        params: Mapping of optional settings: 'input_bam', 'chip_bam',
+            'genes_bed', 'peaks_bed', 'genome_size', 'threads' and
+            'output_dir'. Keys that are missing or set to None fall back to
+            the built-in defaults.
+
+    Returns:
+        Confirmation message naming the generated file.
+    """
+    def setting(key, default):
+        # Callers may pass None for options that were not supplied; a plain
+        # params.get(key, default) would then render the literal text "None".
+        value = params.get(key)
+        return default if value is None else value
+
+    # Path variables are expanded inside double quotes throughout the script
+    # so that file or directory names containing spaces keep working.
+    script = f"""#!/bin/bash
+# deepTools ChIP-seq Complete Analysis Workflow
+# Generated by deepTools workflow generator
+
+# Configuration
+INPUT_BAM="{setting('input_bam', 'Input.bam')}"
+CHIP_BAM="{setting('chip_bam', 'ChIP.bam')}"
+GENES_BED="{setting('genes_bed', 'genes.bed')}"
+PEAKS_BED="{setting('peaks_bed', 'peaks.bed')}"
+GENOME_SIZE={setting('genome_size', '2913022398')}
+THREADS={setting('threads', '8')}
+OUTPUT_DIR="{setting('output_dir', 'chipseq_analysis')}"
+
+# Create output directory
+mkdir -p "$OUTPUT_DIR"
+
+echo "=== Starting ChIP-seq analysis workflow ==="
+
+# Step 1: Generate normalized coverage tracks
+echo "Step 1: Generating coverage tracks..."
+
+bamCoverage \\
+    --bam "$INPUT_BAM" \\
+    --outFileName "$OUTPUT_DIR/Input_coverage.bw" \\
+    --normalizeUsing RPGC \\
+    --effectiveGenomeSize $GENOME_SIZE \\
+    --binSize 10 \\
+    --extendReads 200 \\
+    --ignoreDuplicates \\
+    --numberOfProcessors $THREADS
+
+bamCoverage \\
+    --bam "$CHIP_BAM" \\
+    --outFileName "$OUTPUT_DIR/ChIP_coverage.bw" \\
+    --normalizeUsing RPGC \\
+    --effectiveGenomeSize $GENOME_SIZE \\
+    --binSize 10 \\
+    --extendReads 200 \\
+    --ignoreDuplicates \\
+    --numberOfProcessors $THREADS
+
+# Step 2: Create log2 ratio track
+echo "Step 2: Creating log2 ratio track..."
+bamCompare \\
+    --bamfile1 "$CHIP_BAM" \\
+    --bamfile2 "$INPUT_BAM" \\
+    --outFileName "$OUTPUT_DIR/ChIP_vs_Input_log2ratio.bw" \\
+    --operation log2 \\
+    --scaleFactorsMethod readCount \\
+    --binSize 10 \\
+    --extendReads 200 \\
+    --ignoreDuplicates \\
+    --numberOfProcessors $THREADS
+
+# Step 3: Compute matrix around TSS
+echo "Step 3: Computing matrix around TSS..."
+computeMatrix reference-point \\
+    --referencePoint TSS \\
+    --scoreFileName "$OUTPUT_DIR/ChIP_coverage.bw" \\
+    --regionsFileName "$GENES_BED" \\
+    --beforeRegionStartLength 3000 \\
+    --afterRegionStartLength 3000 \\
+    --binSize 10 \\
+    --sortRegions descend \\
+    --sortUsing mean \\
+    --outFileName "$OUTPUT_DIR/matrix_TSS.gz" \\
+    --numberOfProcessors $THREADS
+
+# Step 4: Generate heatmap
+echo "Step 4: Generating heatmap..."
+plotHeatmap \\
+    --matrixFile "$OUTPUT_DIR/matrix_TSS.gz" \\
+    --outFileName "$OUTPUT_DIR/heatmap_TSS.png" \\
+    --colorMap RdBu \\
+    --whatToShow 'plot, heatmap and colorbar' \\
+    --yAxisLabel "Genes" \\
+    --xAxisLabel "Distance from TSS (bp)" \\
+    --refPointLabel "TSS" \\
+    --heatmapHeight 15 \\
+    --kmeans 3
+
+# Step 5: Generate profile plot
+echo "Step 5: Generating profile plot..."
+plotProfile \\
+    --matrixFile "$OUTPUT_DIR/matrix_TSS.gz" \\
+    --outFileName "$OUTPUT_DIR/profile_TSS.png" \\
+    --plotType lines \\
+    --perGroup \\
+    --colors blue \\
+    --plotTitle "ChIP-seq signal around TSS" \\
+    --yAxisLabel "Average signal" \\
+    --refPointLabel "TSS"
+
+# Step 6: Enrichment at peaks (if peaks provided)
+if [ -f "$PEAKS_BED" ]; then
+    echo "Step 6: Calculating enrichment at peaks..."
+    plotEnrichment \\
+        --bamfiles "$INPUT_BAM" "$CHIP_BAM" \\
+        --BED "$PEAKS_BED" \\
+        --labels Input ChIP \\
+        --plotFile "$OUTPUT_DIR/enrichment.png" \\
+        --outRawCounts "$OUTPUT_DIR/enrichment_counts.tab" \\
+        --extendReads 200 \\
+        --ignoreDuplicates
+fi
+
+echo "=== ChIP-seq analysis complete ==="
+echo "Results are in: $OUTPUT_DIR"
+"""
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write(script)
+
+    return f"✓ Generated ChIP-seq analysis workflow: {output_file}"
+
+
+def generate_rnaseq_coverage_workflow(output_file, params):
+    """Generate RNA-seq coverage workflow script.
+
+    Args:
+        output_file: Path of the bash script to write (overwritten if present).
+        params: Mapping of optional settings: 'rnaseq_bam', 'threads' and
+            'output_dir'. Keys that are missing or set to None fall back to
+            the built-in defaults.
+
+    Returns:
+        Confirmation message naming the generated file.
+    """
+    def setting(key, default):
+        # Callers may pass None for options that were not supplied; a plain
+        # params.get(key, default) would then render the literal text "None".
+        value = params.get(key)
+        return default if value is None else value
+
+    # Path variables are expanded inside double quotes throughout the script
+    # so that file or directory names containing spaces keep working.
+    script = f"""#!/bin/bash
+# deepTools RNA-seq Coverage Workflow
+# Generated by deepTools workflow generator
+
+# Configuration
+RNASEQ_BAM="{setting('rnaseq_bam', 'rnaseq.bam')}"
+THREADS={setting('threads', '8')}
+OUTPUT_DIR="{setting('output_dir', 'rnaseq_coverage')}"
+
+# Create output directory
+mkdir -p "$OUTPUT_DIR"
+
+echo "=== Starting RNA-seq coverage workflow ==="
+
+# Generate strand-specific coverage tracks
+echo "Step 1: Generating forward strand coverage..."
+bamCoverage \\
+    --bam "$RNASEQ_BAM" \\
+    --outFileName "$OUTPUT_DIR/forward_coverage.bw" \\
+    --filterRNAstrand forward \\
+    --normalizeUsing CPM \\
+    --binSize 1 \\
+    --numberOfProcessors $THREADS
+
+echo "Step 2: Generating reverse strand coverage..."
+bamCoverage \\
+    --bam "$RNASEQ_BAM" \\
+    --outFileName "$OUTPUT_DIR/reverse_coverage.bw" \\
+    --filterRNAstrand reverse \\
+    --normalizeUsing CPM \\
+    --binSize 1 \\
+    --numberOfProcessors $THREADS
+
+echo "=== RNA-seq coverage workflow complete ==="
+echo "Results are in: $OUTPUT_DIR"
+echo ""
+echo "Note: These bigWig files can be loaded into genome browsers"
+echo "for strand-specific visualization of RNA-seq data."
+"""
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write(script)
+
+    return f"✓ Generated RNA-seq coverage workflow: {output_file}"
+
+
+def generate_atacseq_workflow(output_file, params):
+    """Generate ATAC-seq workflow script.
+
+    Args:
+        output_file: Path of the bash script to write (overwritten if present).
+        params: Mapping of optional settings: 'atac_bam', 'peaks_bed',
+            'genome_size', 'threads' and 'output_dir'. Keys that are missing
+            or set to None fall back to the built-in defaults.
+
+    Returns:
+        Confirmation message naming the generated file.
+    """
+    def setting(key, default):
+        # Callers may pass None for options that were not supplied; a plain
+        # params.get(key, default) would then render the literal text "None".
+        value = params.get(key)
+        return default if value is None else value
+
+    # Path variables are expanded inside double quotes throughout the script
+    # so that file or directory names containing spaces keep working.
+    # alignmentSieve output is written to a temporary file and coordinate-
+    # sorted before indexing, because `samtools index` needs sorted input.
+    script = f"""#!/bin/bash
+# deepTools ATAC-seq Analysis Workflow
+# Generated by deepTools workflow generator
+
+# Configuration
+ATAC_BAM="{setting('atac_bam', 'atacseq.bam')}"
+PEAKS_BED="{setting('peaks_bed', 'peaks.bed')}"
+GENOME_SIZE={setting('genome_size', '2913022398')}
+THREADS={setting('threads', '8')}
+OUTPUT_DIR="{setting('output_dir', 'atacseq_analysis')}"
+
+# Create output directory
+mkdir -p "$OUTPUT_DIR"
+
+echo "=== Starting ATAC-seq analysis workflow ==="
+
+# Step 1: Shift reads for Tn5 correction
+echo "Step 1: Applying Tn5 offset correction..."
+alignmentSieve \\
+    --bam "$ATAC_BAM" \\
+    --outFile "$OUTPUT_DIR/atacseq_shifted.unsorted.bam" \\
+    --ATACshift \\
+    --minFragmentLength 38 \\
+    --maxFragmentLength 2000 \\
+    --ignoreDuplicates
+
+# Shifting can change read order, so coordinate-sort before indexing
+samtools sort -@ $THREADS \\
+    -o "$OUTPUT_DIR/atacseq_shifted.bam" \\
+    "$OUTPUT_DIR/atacseq_shifted.unsorted.bam" \\
+    && rm -f "$OUTPUT_DIR/atacseq_shifted.unsorted.bam"
+
+# Index the shifted BAM
+samtools index "$OUTPUT_DIR/atacseq_shifted.bam"
+
+# Step 2: Generate coverage track
+echo "Step 2: Generating coverage track..."
+bamCoverage \\
+    --bam "$OUTPUT_DIR/atacseq_shifted.bam" \\
+    --outFileName "$OUTPUT_DIR/atacseq_coverage.bw" \\
+    --normalizeUsing RPGC \\
+    --effectiveGenomeSize $GENOME_SIZE \\
+    --binSize 1 \\
+    --numberOfProcessors $THREADS
+
+# Step 3: Fragment size analysis
+echo "Step 3: Analyzing fragment sizes..."
+bamPEFragmentSize \\
+    --bamfiles "$ATAC_BAM" \\
+    --histogram "$OUTPUT_DIR/fragmentSizes.png" \\
+    --maxFragmentLength 1000
+
+# Step 4: Compute matrix at peaks (if peaks provided)
+if [ -f "$PEAKS_BED" ]; then
+    echo "Step 4: Computing matrix at peaks..."
+    computeMatrix reference-point \\
+        --referencePoint center \\
+        --scoreFileName "$OUTPUT_DIR/atacseq_coverage.bw" \\
+        --regionsFileName "$PEAKS_BED" \\
+        --beforeRegionStartLength 2000 \\
+        --afterRegionStartLength 2000 \\
+        --binSize 10 \\
+        --outFileName "$OUTPUT_DIR/matrix_peaks.gz" \\
+        --numberOfProcessors $THREADS
+
+    echo "Step 5: Generating heatmap..."
+    plotHeatmap \\
+        --matrixFile "$OUTPUT_DIR/matrix_peaks.gz" \\
+        --outFileName "$OUTPUT_DIR/heatmap_peaks.png" \\
+        --colorMap YlOrRd \\
+        --refPointLabel "Peak Center" \\
+        --heatmapHeight 15
+fi
+
+echo "=== ATAC-seq analysis complete ==="
+echo "Results are in: $OUTPUT_DIR"
+echo ""
+echo "Expected fragment size pattern:"
+echo "  ~50bp: nucleosome-free regions"
+echo "  ~200bp: mono-nucleosome"
+echo "  ~400bp: di-nucleosome"
+"""
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write(script)
+
+    return f"✓ Generated ATAC-seq workflow: {output_file}"
+
+
+def main():
+    """Command-line entry point: parse options and write the requested workflow script.
+
+    With --list, prints the available workflows and exits with status 0.
+    Without a workflow argument, prints the help text and exits with status 1.
+    Otherwise writes the generated bash script to the --output path and
+    prints instructions for running it.
+    """
+    parser = argparse.ArgumentParser(
+        description="Generate deepTools workflow scripts",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=f"""
+Available workflows:
+{chr(10).join(f"  {key}: {value['name']}" for key, value in WORKFLOWS.items())}
+
+Examples:
+  # Generate ChIP-seq QC workflow
+  python workflow_generator.py chipseq_qc -o chipseq_qc.sh
+
+  # Generate ChIP-seq analysis with custom parameters
+  python workflow_generator.py chipseq_analysis -o analysis.sh \\
+      --chip-bam H3K4me3.bam --input-bam Input.bam
+
+  # List all available workflows
+  python workflow_generator.py --list
+        """
+    )
+
+    parser.add_argument('workflow', nargs='?', choices=list(WORKFLOWS.keys()),
+                        help='Workflow type to generate')
+    parser.add_argument('-o', '--output', default='deeptools_workflow.sh',
+                        help='Output script filename (default: deeptools_workflow.sh)')
+    parser.add_argument('--list', action='store_true',
+                        help='List all available workflows')
+
+    # Common parameters
+    parser.add_argument('--threads', type=int, default=8,
+                        help='Number of threads (default: 8)')
+    parser.add_argument('--genome-size', type=int, default=2913022398,
+                        help='Effective genome size (default: 2913022398 for hg38)')
+    parser.add_argument('--output-dir', default=None,
+                        help='Output directory for results')
+
+    # Workflow-specific parameters
+    parser.add_argument('--input-bam', help='Input/control BAM file')
+    parser.add_argument('--chip-bam', help='ChIP BAM file')
+    parser.add_argument('--chip-bams', help='Multiple ChIP BAM files (space-separated)')
+    parser.add_argument('--rnaseq-bam', help='RNA-seq BAM file')
+    parser.add_argument('--atac-bam', help='ATAC-seq BAM file')
+    parser.add_argument('--genes-bed', help='Genes BED file')
+    parser.add_argument('--peaks-bed', help='Peaks BED file')
+
+    args = parser.parse_args()
+
+    # List workflows
+    if args.list:
+        print("\nAvailable deepTools workflows:\n")
+        for key, value in WORKFLOWS.items():
+            print(f"  {key}")
+            print(f"    {value['name']}")
+            print(f"    {value['description']}\n")
+        sys.exit(0)
+
+    # Check if workflow was specified
+    if not args.workflow:
+        parser.print_help()
+        sys.exit(1)
+
+    # Prepare parameters. Options the user did not supply are None; drop them
+    # so the generators' params.get(key, default) fallbacks apply instead of
+    # the literal text "None" being written into the script.
+    supplied = {
+        'threads': args.threads,
+        'genome_size': args.genome_size,
+        'output_dir': args.output_dir or f"{args.workflow}_output",
+        'input_bam': args.input_bam,
+        'chip_bam': args.chip_bam,
+        'chip_bams': args.chip_bams,
+        'rnaseq_bam': args.rnaseq_bam,
+        'atac_bam': args.atac_bam,
+        'genes_bed': args.genes_bed,
+        'peaks_bed': args.peaks_bed,
+    }
+    params = {key: value for key, value in supplied.items() if value is not None}
+
+    # Generate workflow. argparse's `choices` restricts args.workflow to the
+    # WORKFLOWS keys, which this table mirrors.
+    generators = {
+        'chipseq_qc': generate_chipseq_qc_workflow,
+        'chipseq_analysis': generate_chipseq_analysis_workflow,
+        'rnaseq_coverage': generate_rnaseq_coverage_workflow,
+        'atacseq': generate_atacseq_workflow,
+    }
+    message = generators[args.workflow](args.output, params)
+
+    # Only a bare filename needs the "./" prefix; prepending it to a path
+    # that has a directory part would turn an absolute path into ".//abs/...".
+    run_command = args.output if '/' in args.output else f"./{args.output}"
+
+    print(message)
+    print("\nTo run the workflow:")
+    print(f"  chmod +x {args.output}")
+    print(f"  {run_command}")
+    print("\nNote: Edit the script to customize file paths and parameters.")
+
+
+# Run the command-line interface only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()