Files
gh-k-dense-ai-claude-scient…/skills/deeptools/scripts/validate_files.py
2025-11-30 08:30:10 +08:00

196 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""
deepTools File Validation Script
Validates BAM, bigWig, and BED files for deepTools analysis.
Checks for file existence, proper indexing, and basic format requirements.
"""
import os
import sys
import argparse
from pathlib import Path
def check_file_exists(filepath):
    """Check that a path exists, is a regular file, and is readable.

    Args:
        filepath: Path of the file to check.

    Returns:
        Tuple of (success: bool, message: str).
    """
    if not os.path.exists(filepath):
        return False, f"File not found: {filepath}"
    # A directory passes both the existence and the readability test, yet
    # none of the format checks can work on it, so reject it explicitly.
    if not os.path.isfile(filepath):
        return False, f"Not a regular file: {filepath}"
    if not os.access(filepath, os.R_OK):
        return False, f"File not readable: {filepath}"
    return True, f"✓ File exists: {filepath}"
def check_bam_index(bam_file):
    """Check if a BAM file has an index next to it.

    Two naming conventions are accepted: ``<name>.bam.bai`` and
    ``<name>.bai``.

    Args:
        bam_file: Path of the BAM file, as a string.

    Returns:
        Tuple of (success: bool, message: str). On failure the message
        includes the ``samtools index`` command that creates the index.
    """
    suffix = ".bam"
    candidates = [bam_file + ".bai"]
    # Swap only a trailing ".bam". A blanket str.replace would also rewrite
    # ".bam" inside directory names, and for a path without ".bam" it would
    # leave the path unchanged, making the BAM file count as its own index.
    if bam_file.endswith(suffix):
        candidates.append(bam_file[:-len(suffix)] + ".bai")

    for index_path in candidates:
        if os.path.exists(index_path):
            return True, f"✓ BAM index found: {index_path}"
    return False, f"✗ BAM index missing for: {bam_file}\n Run: samtools index {bam_file}"
def check_bigwig_file(bw_file):
    """Basic check for a bigWig file: plausible size and correct magic number.

    Args:
        bw_file: Path of the bigWig file.

    Returns:
        Tuple of (success: bool, message: str).
    """
    # bigWig files start with the 32-bit magic number 0x888FFC26, stored in
    # the byte order of the machine that wrote the file.
    magic_numbers = (b"\x26\xfc\x8f\x88", b"\x88\x8f\xfc\x26")
    try:
        file_size = os.path.getsize(bw_file)
        # Check file size (bigWig files should have reasonable size)
        if file_size < 100:
            return False, f"✗ bigWig file suspiciously small: {bw_file} ({file_size} bytes)"
        with open(bw_file, 'rb') as f:
            header = f.read(4)
    except OSError as e:
        # Missing/unreadable path or a directory: report instead of crashing.
        return False, f"✗ Error reading bigWig file: {bw_file}\n Error: {str(e)}"

    if header not in magic_numbers:
        return False, f"✗ Not a bigWig file (bad magic number): {bw_file}"
    return True, f"✓ bigWig file appears valid: {bw_file} ({file_size} bytes)"
def check_bed_file(bed_file):
    """Basic validation of BED file format.

    Blank lines, ``#`` comments, and UCSC ``track``/``browser`` header lines
    are skipped. The first few remaining records must have at least three
    tab-separated columns, with integer start and end and start < end.
    All records are counted for the summary message.

    Args:
        bed_file: Path of the BED file.

    Returns:
        Tuple of (success: bool, message: str). Line numbers in error
        messages are 1-based positions in the file itself.
    """
    max_checked = 10  # only the first few records are format-checked
    region_count = 0
    try:
        with open(bed_file, 'r') as f:
            # Stream the file so large BED files are never held in memory.
            for line_no, raw_line in enumerate(f, 1):
                line = raw_line.strip()
                if not line or raw_line.startswith('#'):
                    continue
                # "track ..." and "browser ..." lines are legal BED headers,
                # not records, so they must not be column-checked.
                if line.split(None, 1)[0] in ('track', 'browser'):
                    continue

                region_count += 1
                if region_count > max_checked:
                    continue  # keep counting regions for the summary

                fields = line.split('\t')
                if len(fields) < 3:
                    return False, f"✗ BED file format error at line {line_no}: expected at least 3 columns\n Line: {line}"
                # Check if start and end are integers
                try:
                    start = int(fields[1])
                    end = int(fields[2])
                except ValueError:
                    return False, f"✗ BED file format error at line {line_no}: start and end must be integers\n Line: {line}"
                if start >= end:
                    return False, f"✗ BED file error at line {line_no}: start >= end ({start} >= {end})"
    except (OSError, UnicodeError) as e:
        return False, f"✗ Error reading BED file: {bed_file}\n Error: {str(e)}"

    if region_count == 0:
        return False, f"✗ BED file is empty: {bed_file}"
    return True, f"✓ BED file format appears valid: {bed_file} ({region_count} regions)"
def validate_files(bam_files=None, bigwig_files=None, bed_files=None):
    """
    Run the existence check and the type-specific check on every given file.

    Each file is first passed to check_file_exists; only files that pass are
    handed to the check for their type (BAM index, bigWig, or BED format).
    A section header is emitted for each file type that has files.

    Args:
        bam_files: List of BAM file paths
        bigwig_files: List of bigWig file paths
        bed_files: List of BED file paths

    Returns:
        Tuple of (success: bool, messages: list)
    """
    # (section label, paths, type-specific check), in reporting order.
    groups = (
        ("BAM", bam_files, check_bam_index),
        ("bigWig", bigwig_files, check_bigwig_file),
        ("BED", bed_files, check_bed_file),
    )

    report = []
    passed = True
    for label, paths, type_check in groups:
        if not paths:
            continue
        report.append(f"\n=== Validating {label} Files ===")
        for path in paths:
            ok, note = check_file_exists(path)
            report.append(note)
            if ok:
                # The type-specific check only makes sense on an existing file.
                ok, note = type_check(path)
                report.append(note)
            if not ok:
                passed = False
    return passed, report
def main():
    """Command-line entry point.

    Parses --bam, --bigwig/--bw and --bed, validates the listed files, prints
    every message followed by a summary, and exits with status 0 when all
    validations pass and 1 otherwise (including when no files are given).
    """
    usage_examples = """
Examples:
# Validate BAM files
python validate_files.py --bam sample1.bam sample2.bam
# Validate all file types
python validate_files.py --bam input.bam chip.bam --bed peaks.bed --bigwig signal.bw
# Validate from a directory
python validate_files.py --bam *.bam --bed *.bed
"""
    cli = argparse.ArgumentParser(
        description="Validate files for deepTools analysis",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument('--bam', nargs='+', help='BAM files to validate')
    cli.add_argument('--bigwig', '--bw', nargs='+', help='bigWig files to validate')
    cli.add_argument('--bed', nargs='+', help='BED files to validate')
    options = cli.parse_args()

    # Nothing to validate: show usage and signal failure.
    if not (options.bam or options.bigwig or options.bed):
        cli.print_help()
        sys.exit(1)

    all_ok, report = validate_files(
        bam_files=options.bam,
        bigwig_files=options.bigwig,
        bed_files=options.bed,
    )

    for entry in report:
        print(entry)

    # Summary line and exit status reflect the overall result.
    print("\n" + "=" * 50)
    if all_ok:
        print("✓ All validations passed!")
    else:
        print("✗ Some validations failed. Please fix the issues above.")
    sys.exit(0 if all_ok else 1)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()