Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions

View File

@@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""
deepTools File Validation Script
Validates BAM, bigWig, and BED files for deepTools analysis.
Checks for file existence, proper indexing, and basic format requirements.
"""
import os
import sys
import argparse
from pathlib import Path
def check_file_exists(filepath):
    """Check that a path points to an existing, readable regular file.

    Args:
        filepath: Path of the file to check.

    Returns:
        Tuple of (success: bool, message: str).
    """
    if not os.path.exists(filepath):
        return False, f"File not found: {filepath}"
    # A directory "exists" and is usually readable, but it can never be a
    # valid BAM/bigWig/BED input, so reject it before the format checks run.
    if not os.path.isfile(filepath):
        return False, f"Not a regular file: {filepath}"
    if not os.access(filepath, os.R_OK):
        return False, f"File not readable: {filepath}"
    return True, f"✓ File exists: {filepath}"
def check_bam_index(bam_file):
    """Check if a BAM file has an index next to it.

    Two naming schemes are accepted: ``<name>.bam.bai`` (the samtools
    default) and ``<name>.bai`` (extension replaced).

    Args:
        bam_file: Path of the BAM file whose index is looked up.

    Returns:
        Tuple of (success: bool, message: str). On failure the message
        includes the ``samtools index`` command that creates the index.
    """
    bam_path = os.fspath(bam_file)
    candidates = [bam_path + ".bai"]

    # Only derive the "<name>.bai" form from a real ".bam" extension. A blind
    # str.replace() would also rewrite ".bam" inside directory names, and for
    # paths without ".bam" it would return the BAM path itself, making the
    # file count as its own index.
    root, ext = os.path.splitext(bam_path)
    if ext.lower() == ".bam":
        candidates.append(root + ".bai")

    for candidate in candidates:
        if os.path.exists(candidate):
            return True, f"✓ BAM index found: {candidate}"
    return False, f"✗ BAM index missing for: {bam_file}\n Run: samtools index {bam_file}"
def check_bigwig_file(bw_file):
    """Basic check for a bigWig file: plausible size and magic number.

    Args:
        bw_file: Path of the bigWig file to check.

    Returns:
        Tuple of (success: bool, message: str).
    """
    # Every bigWig file starts with this 32-bit signature; it may be stored in
    # either byte order depending on the machine that wrote the file.
    bigwig_magic = 0x888FFC26

    try:
        file_size = os.path.getsize(bw_file)
        # Files this small cannot hold a real header plus any data.
        if file_size < 100:
            return False, f"✗ bigWig file suspiciously small: {bw_file} ({file_size} bytes)"
        with open(bw_file, 'rb') as f:
            signature = f.read(4)
    except OSError as e:
        return False, f"✗ Error reading bigWig file: {bw_file}\n Error: {e}"

    observed = (int.from_bytes(signature, 'little'), int.from_bytes(signature, 'big'))
    if bigwig_magic not in observed:
        return False, f"✗ Not a bigWig file (bad magic number): {bw_file}"
    return True, f"✓ bigWig file appears valid: {bw_file} ({file_size} bytes)"
def check_bed_file(bed_file):
    """Basic validation of BED file format.

    Blank lines, ``#`` comments and ``track``/``browser`` header lines are
    skipped. The first ten remaining lines are checked for at least three
    tab-separated columns with integer coordinates where 0 <= start < end.
    All remaining lines are only counted.

    Args:
        bed_file: Path of the BED file to check.

    Returns:
        Tuple of (success: bool, message: str). Error messages report the
        line number as it appears in the file.
    """
    max_checked = 10  # only the first few regions get a format check
    n_regions = 0
    try:
        # Stream the file so that large BED files are never held in memory.
        with open(bed_file, 'r') as f:
            for line_no, raw_line in enumerate(f, 1):
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue
                # "track"/"browser" lines are legal BED headers, not regions.
                if line.split(None, 1)[0] in ('track', 'browser'):
                    continue

                n_regions += 1
                if n_regions > max_checked:
                    continue

                fields = line.split('\t')
                if len(fields) < 3:
                    return False, f"✗ BED file format error at line {line_no}: expected at least 3 columns\n Line: {line}"
                try:
                    start = int(fields[1])
                    end = int(fields[2])
                except ValueError:
                    return False, f"✗ BED file format error at line {line_no}: start and end must be integers\n Line: {line}"
                if start < 0:
                    return False, f"✗ BED file error at line {line_no}: start must be non-negative ({start})"
                if start >= end:
                    return False, f"✗ BED file error at line {line_no}: start >= end ({start} >= {end})"
    except (OSError, UnicodeDecodeError) as e:
        return False, f"✗ Error reading BED file: {bed_file}\n Error: {str(e)}"

    if n_regions == 0:
        return False, f"✗ BED file is empty: {bed_file}"
    return True, f"✓ BED file format appears valid: {bed_file} ({n_regions} regions)"
def _validate_group(paths, heading, format_check, messages):
    """Run the existence check and one format check on every path of a group.

    Appends the heading and every check message to ``messages`` and returns
    True only if all checks of the group passed.
    """
    messages.append(heading)
    group_ok = True
    for path in paths:
        exists, note = check_file_exists(path)
        messages.append(note)
        if not exists:
            # The format check is skipped for missing/unreadable files.
            group_ok = False
            continue
        passed, note = format_check(path)
        messages.append(note)
        if not passed:
            group_ok = False
    return group_ok


def validate_files(bam_files=None, bigwig_files=None, bed_files=None):
    """
    Validate all provided files.

    Each non-empty group is reported under its own heading, in the order
    BAM, bigWig, BED. BAM files are checked for an index, bigWig files get
    the basic bigWig check, BED files get the BED format check.

    Args:
        bam_files: List of BAM file paths
        bigwig_files: List of bigWig file paths
        bed_files: List of BED file paths
    Returns:
        Tuple of (success: bool, messages: list)
    """
    messages = []
    outcomes = []

    if bam_files:
        outcomes.append(
            _validate_group(bam_files, "\n=== Validating BAM Files ===",
                            check_bam_index, messages)
        )
    if bigwig_files:
        outcomes.append(
            _validate_group(bigwig_files, "\n=== Validating bigWig Files ===",
                            check_bigwig_file, messages)
        )
    if bed_files:
        outcomes.append(
            _validate_group(bed_files, "\n=== Validating BED Files ===",
                            check_bed_file, messages)
        )

    # all() of an empty list is True: nothing to validate means nothing failed.
    return all(outcomes), messages
def main():
    """Command-line entry point: parse arguments, validate, print a report.

    Exits with status 0 when every validation passed and with status 1 when
    any validation failed or no files were given at all (help is printed in
    the latter case).
    """
    usage_examples = """
Examples:
# Validate BAM files
python validate_files.py --bam sample1.bam sample2.bam
# Validate all file types
python validate_files.py --bam input.bam chip.bam --bed peaks.bed --bigwig signal.bw
# Validate from a directory
python validate_files.py --bam *.bam --bed *.bed
"""
    parser = argparse.ArgumentParser(
        description="Validate files for deepTools analysis",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    parser.add_argument('--bam', nargs='+', help='BAM files to validate')
    parser.add_argument('--bigwig', '--bw', nargs='+', help='bigWig files to validate')
    parser.add_argument('--bed', nargs='+', help='BED files to validate')
    args = parser.parse_args()

    # Without at least one file there is nothing to do: show usage and fail.
    if not (args.bam or args.bigwig or args.bed):
        parser.print_help()
        sys.exit(1)

    passed, report = validate_files(
        bam_files=args.bam,
        bigwig_files=args.bigwig,
        bed_files=args.bed,
    )

    for entry in report:
        print(entry)

    # Summary line and exit status
    print("\n" + "="*50)
    if not passed:
        print("✗ Some validations failed. Please fix the issues above.")
        sys.exit(1)
    print("✓ All validations passed!")
    sys.exit(0)
# Run the validator CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,454 @@
#!/usr/bin/env python3
"""
deepTools Workflow Generator
Generates bash script templates for common deepTools workflows.
"""
import argparse
import sys
# Supported workflows as (key, display name, description) rows. The row order
# is the order in which the workflows are listed to the user.
_WORKFLOW_ROWS = (
    ('chipseq_qc',
     'ChIP-seq Quality Control',
     'Complete QC workflow for ChIP-seq experiments'),
    ('chipseq_analysis',
     'ChIP-seq Complete Analysis',
     'Full ChIP-seq analysis from BAM to heatmaps'),
    ('rnaseq_coverage',
     'RNA-seq Coverage Tracks',
     'Generate strand-specific RNA-seq coverage'),
    ('atacseq',
     'ATAC-seq Analysis',
     'ATAC-seq workflow with Tn5 correction'),
)

# Maps each workflow key to its 'name' and 'description'.
WORKFLOWS = {
    key: {'name': name, 'description': description}
    for key, name, description in _WORKFLOW_ROWS
}
def generate_chipseq_qc_workflow(output_file, params):
    """Generate ChIP-seq QC workflow script.

    Args:
        output_file: Path of the bash script to write.
        params: Mapping of settings. Keys read here: 'input_bam',
            'chip_bams' (whitespace-separated string or a list of paths),
            'genome_size', 'threads', 'output_dir'. A key that is missing
            or set to None falls back to the built-in default.

    Returns:
        Confirmation message naming the generated file.
    """
    def setting(key, default):
        # The CLI passes every key, using None for options that were not
        # given; dict.get(key, default) would then render the text "None".
        value = params.get(key)
        return default if value is None else value

    chip_bams = setting('chip_bams', 'ChIP1.bam ChIP2.bam')
    if isinstance(chip_bams, str):
        chip_bams = chip_bams.split()
    # One quoted array element per BAM file instead of one element holding
    # the whole space-separated string.
    chip_array = ' '.join(f'"{bam}"' for bam in chip_bams)

    script = f"""#!/bin/bash
# deepTools ChIP-seq Quality Control Workflow
# Generated by deepTools workflow generator
# Configuration
INPUT_BAM="{setting('input_bam', 'Input.bam')}"
CHIP_BAM=({chip_array})
GENOME_SIZE={setting('genome_size', '2913022398')}
THREADS={setting('threads', '8')}
OUTPUT_DIR="{setting('output_dir', 'deeptools_qc')}"
# Create output directory
mkdir -p $OUTPUT_DIR
echo "=== Starting ChIP-seq QC workflow ==="
# Step 1: Correlation analysis
echo "Step 1: Computing correlation matrix..."
multiBamSummary bins \\
--bamfiles $INPUT_BAM "${{CHIP_BAM[@]}}" \\
-o $OUTPUT_DIR/readCounts.npz \\
--numberOfProcessors $THREADS
echo "Step 2: Generating correlation heatmap..."
plotCorrelation \\
-in $OUTPUT_DIR/readCounts.npz \\
--corMethod pearson \\
--whatToShow heatmap \\
--plotFile $OUTPUT_DIR/correlation_heatmap.png \\
--plotNumbers
echo "Step 3: Generating PCA plot..."
plotPCA \\
-in $OUTPUT_DIR/readCounts.npz \\
-o $OUTPUT_DIR/PCA_plot.png \\
-T "PCA of ChIP-seq samples"
# Step 2: Coverage assessment
echo "Step 4: Assessing coverage..."
plotCoverage \\
--bamfiles $INPUT_BAM "${{CHIP_BAM[@]}}" \\
--plotFile $OUTPUT_DIR/coverage.png \\
--ignoreDuplicates \\
--numberOfProcessors $THREADS
# Step 3: Fragment size (for paired-end data)
echo "Step 5: Analyzing fragment sizes..."
bamPEFragmentSize \\
--bamfiles $INPUT_BAM "${{CHIP_BAM[@]}}" \\
--histogram $OUTPUT_DIR/fragmentSizes.png \\
--plotTitle "Fragment Size Distribution"
# Step 4: ChIP signal strength
echo "Step 6: Evaluating ChIP enrichment..."
plotFingerprint \\
--bamfiles $INPUT_BAM "${{CHIP_BAM[@]}}" \\
--plotFile $OUTPUT_DIR/fingerprint.png \\
--extendReads 200 \\
--ignoreDuplicates \\
--numberOfProcessors $THREADS \\
--outQualityMetrics $OUTPUT_DIR/fingerprint_metrics.txt
echo "=== ChIP-seq QC workflow complete ==="
echo "Results are in: $OUTPUT_DIR"
"""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(script)
    return f"✓ Generated ChIP-seq QC workflow: {output_file}"
def generate_chipseq_analysis_workflow(output_file, params):
    """Generate complete ChIP-seq analysis workflow script.

    Args:
        output_file: Path of the bash script to write.
        params: Mapping of settings. Keys read here: 'input_bam',
            'chip_bam', 'genes_bed', 'peaks_bed', 'genome_size', 'threads',
            'output_dir'. A key that is missing or set to None falls back
            to the built-in default.

    Returns:
        Confirmation message naming the generated file.
    """
    def setting(key, default):
        # The CLI passes every key, using None for options that were not
        # given; dict.get(key, default) would then render the text "None".
        value = params.get(key)
        return default if value is None else value

    script = f"""#!/bin/bash
# deepTools ChIP-seq Complete Analysis Workflow
# Generated by deepTools workflow generator
# Configuration
INPUT_BAM="{setting('input_bam', 'Input.bam')}"
CHIP_BAM="{setting('chip_bam', 'ChIP.bam')}"
GENES_BED="{setting('genes_bed', 'genes.bed')}"
PEAKS_BED="{setting('peaks_bed', 'peaks.bed')}"
GENOME_SIZE={setting('genome_size', '2913022398')}
THREADS={setting('threads', '8')}
OUTPUT_DIR="{setting('output_dir', 'chipseq_analysis')}"
# Create output directory
mkdir -p $OUTPUT_DIR
echo "=== Starting ChIP-seq analysis workflow ==="
# Step 1: Generate normalized coverage tracks
echo "Step 1: Generating coverage tracks..."
bamCoverage \\
--bam $INPUT_BAM \\
--outFileName $OUTPUT_DIR/Input_coverage.bw \\
--normalizeUsing RPGC \\
--effectiveGenomeSize $GENOME_SIZE \\
--binSize 10 \\
--extendReads 200 \\
--ignoreDuplicates \\
--numberOfProcessors $THREADS
bamCoverage \\
--bam $CHIP_BAM \\
--outFileName $OUTPUT_DIR/ChIP_coverage.bw \\
--normalizeUsing RPGC \\
--effectiveGenomeSize $GENOME_SIZE \\
--binSize 10 \\
--extendReads 200 \\
--ignoreDuplicates \\
--numberOfProcessors $THREADS
# Step 2: Create log2 ratio track
echo "Step 2: Creating log2 ratio track..."
bamCompare \\
--bamfile1 $CHIP_BAM \\
--bamfile2 $INPUT_BAM \\
--outFileName $OUTPUT_DIR/ChIP_vs_Input_log2ratio.bw \\
--operation log2 \\
--scaleFactorsMethod readCount \\
--binSize 10 \\
--extendReads 200 \\
--ignoreDuplicates \\
--numberOfProcessors $THREADS
# Step 3: Compute matrix around TSS
echo "Step 3: Computing matrix around TSS..."
computeMatrix reference-point \\
--referencePoint TSS \\
--scoreFileName $OUTPUT_DIR/ChIP_coverage.bw \\
--regionsFileName $GENES_BED \\
--beforeRegionStartLength 3000 \\
--afterRegionStartLength 3000 \\
--binSize 10 \\
--sortRegions descend \\
--sortUsing mean \\
--outFileName $OUTPUT_DIR/matrix_TSS.gz \\
--numberOfProcessors $THREADS
# Step 4: Generate heatmap
echo "Step 4: Generating heatmap..."
plotHeatmap \\
--matrixFile $OUTPUT_DIR/matrix_TSS.gz \\
--outFileName $OUTPUT_DIR/heatmap_TSS.png \\
--colorMap RdBu \\
--whatToShow 'plot, heatmap and colorbar' \\
--yAxisLabel "Genes" \\
--xAxisLabel "Distance from TSS (bp)" \\
--refPointLabel "TSS" \\
--heatmapHeight 15 \\
--kmeans 3
# Step 5: Generate profile plot
echo "Step 5: Generating profile plot..."
plotProfile \\
--matrixFile $OUTPUT_DIR/matrix_TSS.gz \\
--outFileName $OUTPUT_DIR/profile_TSS.png \\
--plotType lines \\
--perGroup \\
--colors blue \\
--plotTitle "ChIP-seq signal around TSS" \\
--yAxisLabel "Average signal" \\
--refPointLabel "TSS"
# Step 6: Enrichment at peaks (if peaks provided)
if [ -f "$PEAKS_BED" ]; then
echo "Step 6: Calculating enrichment at peaks..."
plotEnrichment \\
--bamfiles $INPUT_BAM $CHIP_BAM \\
--BED $PEAKS_BED \\
--labels Input ChIP \\
--plotFile $OUTPUT_DIR/enrichment.png \\
--outRawCounts $OUTPUT_DIR/enrichment_counts.tab \\
--extendReads 200 \\
--ignoreDuplicates
fi
echo "=== ChIP-seq analysis complete ==="
echo "Results are in: $OUTPUT_DIR"
"""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(script)
    return f"✓ Generated ChIP-seq analysis workflow: {output_file}"
def generate_rnaseq_coverage_workflow(output_file, params):
    """Generate RNA-seq coverage workflow script.

    Args:
        output_file: Path of the bash script to write.
        params: Mapping of settings. Keys read here: 'rnaseq_bam',
            'threads', 'output_dir'. A key that is missing or set to None
            falls back to the built-in default.

    Returns:
        Confirmation message naming the generated file.
    """
    def setting(key, default):
        # The CLI passes every key, using None for options that were not
        # given; dict.get(key, default) would then render the text "None".
        value = params.get(key)
        return default if value is None else value

    script = f"""#!/bin/bash
# deepTools RNA-seq Coverage Workflow
# Generated by deepTools workflow generator
# Configuration
RNASEQ_BAM="{setting('rnaseq_bam', 'rnaseq.bam')}"
THREADS={setting('threads', '8')}
OUTPUT_DIR="{setting('output_dir', 'rnaseq_coverage')}"
# Create output directory
mkdir -p $OUTPUT_DIR
echo "=== Starting RNA-seq coverage workflow ==="
# Generate strand-specific coverage tracks
echo "Step 1: Generating forward strand coverage..."
bamCoverage \\
--bam $RNASEQ_BAM \\
--outFileName $OUTPUT_DIR/forward_coverage.bw \\
--filterRNAstrand forward \\
--normalizeUsing CPM \\
--binSize 1 \\
--numberOfProcessors $THREADS
echo "Step 2: Generating reverse strand coverage..."
bamCoverage \\
--bam $RNASEQ_BAM \\
--outFileName $OUTPUT_DIR/reverse_coverage.bw \\
--filterRNAstrand reverse \\
--normalizeUsing CPM \\
--binSize 1 \\
--numberOfProcessors $THREADS
echo "=== RNA-seq coverage workflow complete ==="
echo "Results are in: $OUTPUT_DIR"
echo ""
echo "Note: These bigWig files can be loaded into genome browsers"
echo "for strand-specific visualization of RNA-seq data."
"""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(script)
    return f"✓ Generated RNA-seq coverage workflow: {output_file}"
def generate_atacseq_workflow(output_file, params):
    """Generate ATAC-seq workflow script.

    The generated script coordinate-sorts the Tn5-shifted BAM before
    indexing it.

    Args:
        output_file: Path of the bash script to write.
        params: Mapping of settings. Keys read here: 'atac_bam',
            'peaks_bed', 'genome_size', 'threads', 'output_dir'. A key that
            is missing or set to None falls back to the built-in default.

    Returns:
        Confirmation message naming the generated file.
    """
    def setting(key, default):
        # The CLI passes every key, using None for options that were not
        # given; dict.get(key, default) would then render the text "None".
        value = params.get(key)
        return default if value is None else value

    script = f"""#!/bin/bash
# deepTools ATAC-seq Analysis Workflow
# Generated by deepTools workflow generator
# Configuration
ATAC_BAM="{setting('atac_bam', 'atacseq.bam')}"
PEAKS_BED="{setting('peaks_bed', 'peaks.bed')}"
GENOME_SIZE={setting('genome_size', '2913022398')}
THREADS={setting('threads', '8')}
OUTPUT_DIR="{setting('output_dir', 'atacseq_analysis')}"
# Create output directory
mkdir -p $OUTPUT_DIR
echo "=== Starting ATAC-seq analysis workflow ==="
# Step 1: Shift reads for Tn5 correction
echo "Step 1: Applying Tn5 offset correction..."
alignmentSieve \\
--bam $ATAC_BAM \\
--outFile $OUTPUT_DIR/atacseq_shifted.unsorted.bam \\
--ATACshift \\
--minFragmentLength 38 \\
--maxFragmentLength 2000 \\
--ignoreDuplicates
# Shifting moves read positions, so the output may no longer be
# coordinate-sorted; sort it before indexing
samtools sort -@ $THREADS \\
-o $OUTPUT_DIR/atacseq_shifted.bam \\
$OUTPUT_DIR/atacseq_shifted.unsorted.bam \\
&& rm -f $OUTPUT_DIR/atacseq_shifted.unsorted.bam
# Index the shifted BAM
samtools index $OUTPUT_DIR/atacseq_shifted.bam
# Step 2: Generate coverage track
echo "Step 2: Generating coverage track..."
bamCoverage \\
--bam $OUTPUT_DIR/atacseq_shifted.bam \\
--outFileName $OUTPUT_DIR/atacseq_coverage.bw \\
--normalizeUsing RPGC \\
--effectiveGenomeSize $GENOME_SIZE \\
--binSize 1 \\
--numberOfProcessors $THREADS
# Step 3: Fragment size analysis
echo "Step 3: Analyzing fragment sizes..."
bamPEFragmentSize \\
--bamfiles $ATAC_BAM \\
--histogram $OUTPUT_DIR/fragmentSizes.png \\
--maxFragmentLength 1000
# Step 4: Compute matrix at peaks (if peaks provided)
if [ -f "$PEAKS_BED" ]; then
echo "Step 4: Computing matrix at peaks..."
computeMatrix reference-point \\
--referencePoint center \\
--scoreFileName $OUTPUT_DIR/atacseq_coverage.bw \\
--regionsFileName $PEAKS_BED \\
--beforeRegionStartLength 2000 \\
--afterRegionStartLength 2000 \\
--binSize 10 \\
--outFileName $OUTPUT_DIR/matrix_peaks.gz \\
--numberOfProcessors $THREADS
echo "Step 5: Generating heatmap..."
plotHeatmap \\
--matrixFile $OUTPUT_DIR/matrix_peaks.gz \\
--outFileName $OUTPUT_DIR/heatmap_peaks.png \\
--colorMap YlOrRd \\
--refPointLabel "Peak Center" \\
--heatmapHeight 15
fi
echo "=== ATAC-seq analysis complete ==="
echo "Results are in: $OUTPUT_DIR"
echo ""
echo "Expected fragment size pattern:"
echo " ~50bp: nucleosome-free regions"
echo " ~200bp: mono-nucleosome"
echo " ~400bp: di-nucleosome"
"""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(script)
    return f"✓ Generated ATAC-seq workflow: {output_file}"
def main():
    """Command-line entry point of the workflow generator.

    Parses the command line, then either lists the available workflows
    (``--list``) or writes the selected workflow script and prints how to
    run it. Exits with status 1 (after printing help) when no workflow is
    given.
    """
    import os  # local import: only needed for the run hint below

    workflow_lines = "\n".join(
        f" {key}: {value['name']}" for key, value in WORKFLOWS.items()
    )
    parser = argparse.ArgumentParser(
        description="Generate deepTools workflow scripts",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=f"""
Available workflows:
{workflow_lines}
Examples:
# Generate ChIP-seq QC workflow
python workflow_generator.py chipseq_qc -o chipseq_qc.sh
# Generate ChIP-seq analysis with custom parameters
python workflow_generator.py chipseq_analysis -o analysis.sh \\
--chip-bam H3K4me3.bam --input-bam Input.bam
# List all available workflows
python workflow_generator.py --list
"""
    )
    parser.add_argument('workflow', nargs='?', choices=list(WORKFLOWS.keys()),
                        help='Workflow type to generate')
    parser.add_argument('-o', '--output', default='deeptools_workflow.sh',
                        help='Output script filename (default: deeptools_workflow.sh)')
    parser.add_argument('--list', action='store_true',
                        help='List all available workflows')
    # Common parameters
    parser.add_argument('--threads', type=int, default=8,
                        help='Number of threads (default: 8)')
    parser.add_argument('--genome-size', type=int, default=2913022398,
                        help='Effective genome size (default: 2913022398 for hg38)')
    parser.add_argument('--output-dir', default=None,
                        help='Output directory for results')
    # Workflow-specific parameters
    parser.add_argument('--input-bam', help='Input/control BAM file')
    parser.add_argument('--chip-bam', help='ChIP BAM file')
    parser.add_argument('--chip-bams', help='Multiple ChIP BAM files (space-separated)')
    parser.add_argument('--rnaseq-bam', help='RNA-seq BAM file')
    parser.add_argument('--atac-bam', help='ATAC-seq BAM file')
    parser.add_argument('--genes-bed', help='Genes BED file')
    parser.add_argument('--peaks-bed', help='Peaks BED file')
    args = parser.parse_args()

    # List workflows
    if args.list:
        print("\nAvailable deepTools workflows:\n")
        for key, value in WORKFLOWS.items():
            print(f" {key}")
            print(f" {value['name']}")
            print(f" {value['description']}\n")
        sys.exit(0)

    # Check if workflow was specified
    if not args.workflow:
        parser.print_help()
        sys.exit(1)

    # Prepare parameters
    requested = {
        'threads': args.threads,
        'genome_size': args.genome_size,
        'output_dir': args.output_dir or f"{args.workflow}_output",
        'input_bam': args.input_bam,
        'chip_bam': args.chip_bam,
        'chip_bams': args.chip_bams,
        'rnaseq_bam': args.rnaseq_bam,
        'atac_bam': args.atac_bam,
        'genes_bed': args.genes_bed,
        'peaks_bed': args.peaks_bed,
    }
    # Drop options that were not given. A key holding None would defeat the
    # generators' params.get(key, default) lookups and put the literal text
    # "None" into the generated script.
    params = {key: value for key, value in requested.items() if value is not None}

    # Generate workflow (argparse `choices` guarantees the key exists)
    generators = {
        'chipseq_qc': generate_chipseq_qc_workflow,
        'chipseq_analysis': generate_chipseq_analysis_workflow,
        'rnaseq_coverage': generate_rnaseq_coverage_workflow,
        'atacseq': generate_atacseq_workflow,
    }
    message = generators[args.workflow](args.output, params)
    print(message)

    # "./" must not be prepended to an absolute path: ".//tmp/x.sh" would
    # point into the current directory instead of /tmp.
    run_path = args.output if os.path.isabs(args.output) else f"./{args.output}"
    print("\nTo run the workflow:")
    print(f" chmod +x {args.output}")
    print(f" {run_path}")
    print("\nNote: Edit the script to customize file paths and parameters.")
# Run the generator CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()