Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions

View File

@@ -0,0 +1,520 @@
# Common Bioinformatics Workflows with Pysam
## Overview
This document provides practical examples of common bioinformatics workflows using pysam, demonstrating how to combine different file types and operations.
## Quality Control Workflows
### Calculate BAM Statistics
```python
import pysam
def calculate_bam_stats(bam_file):
"""Calculate basic statistics for BAM file."""
samfile = pysam.AlignmentFile(bam_file, "rb")
stats = {
"total_reads": 0,
"mapped_reads": 0,
"unmapped_reads": 0,
"paired_reads": 0,
"proper_pairs": 0,
"duplicates": 0,
"total_bases": 0,
"mapped_bases": 0
}
for read in samfile.fetch(until_eof=True):
stats["total_reads"] += 1
if read.is_unmapped:
stats["unmapped_reads"] += 1
else:
stats["mapped_reads"] += 1
stats["mapped_bases"] += read.query_alignment_length
if read.is_paired:
stats["paired_reads"] += 1
if read.is_proper_pair:
stats["proper_pairs"] += 1
if read.is_duplicate:
stats["duplicates"] += 1
stats["total_bases"] += read.query_length
samfile.close()
# Calculate derived statistics
stats["mapping_rate"] = stats["mapped_reads"] / stats["total_reads"] if stats["total_reads"] > 0 else 0
stats["duplication_rate"] = stats["duplicates"] / stats["total_reads"] if stats["total_reads"] > 0 else 0
return stats
```
### Check Reference Consistency
```python
def check_bam_reference_consistency(bam_file, fasta_file):
"""Verify that BAM reads match reference genome."""
samfile = pysam.AlignmentFile(bam_file, "rb")
fasta = pysam.FastaFile(fasta_file)
mismatches = 0
total_checked = 0
for read in samfile.fetch():
if read.is_unmapped:
continue
# Get reference sequence for aligned region
ref_seq = fasta.fetch(
read.reference_name,
read.reference_start,
read.reference_end
)
# Get read sequence aligned to reference
aligned_pairs = read.get_aligned_pairs(with_seq=True)
for query_pos, ref_pos, ref_base in aligned_pairs:
if query_pos is not None and ref_pos is not None and ref_base is not None:
read_base = read.query_sequence[query_pos]
if read_base.upper() != ref_base.upper():
mismatches += 1
total_checked += 1
if total_checked >= 10000: # Sample first 10k positions
break
samfile.close()
fasta.close()
error_rate = mismatches / total_checked if total_checked > 0 else 0
return {
"positions_checked": total_checked,
"mismatches": mismatches,
"error_rate": error_rate
}
```
## Coverage Analysis
### Calculate Per-Base Coverage
```python
def calculate_coverage(bam_file, chrom, start, end):
"""Calculate coverage for each position in region."""
samfile = pysam.AlignmentFile(bam_file, "rb")
# Initialize coverage array
length = end - start
coverage = [0] * length
# Count coverage at each position
for pileupcolumn in samfile.pileup(chrom, start, end):
if start <= pileupcolumn.pos < end:
coverage[pileupcolumn.pos - start] = pileupcolumn.nsegments
samfile.close()
return coverage
```
### Identify Low Coverage Regions
```python
def find_low_coverage_regions(bam_file, chrom, start, end, min_coverage=10):
"""Find regions with coverage below threshold."""
samfile = pysam.AlignmentFile(bam_file, "rb")
low_coverage_regions = []
in_low_region = False
region_start = None
for pileupcolumn in samfile.pileup(chrom, start, end):
pos = pileupcolumn.pos
if pos < start or pos >= end:
continue
coverage = pileupcolumn.nsegments
if coverage < min_coverage:
if not in_low_region:
region_start = pos
in_low_region = True
else:
if in_low_region:
low_coverage_regions.append((region_start, pos))
in_low_region = False
# Close last region if still open
if in_low_region:
low_coverage_regions.append((region_start, end))
samfile.close()
return low_coverage_regions
```
### Calculate Coverage Statistics
```python
def coverage_statistics(bam_file, chrom, start, end):
"""Calculate coverage statistics for region."""
samfile = pysam.AlignmentFile(bam_file, "rb")
coverages = []
for pileupcolumn in samfile.pileup(chrom, start, end):
if start <= pileupcolumn.pos < end:
coverages.append(pileupcolumn.nsegments)
samfile.close()
if not coverages:
return None
coverages.sort()
n = len(coverages)
return {
"mean": sum(coverages) / n,
"median": coverages[n // 2],
"min": coverages[0],
"max": coverages[-1],
"positions": n
}
```
## Variant Analysis
### Extract Variants in Regions
```python
def extract_variants_in_genes(vcf_file, bed_file):
"""Extract variants overlapping gene regions."""
vcf = pysam.VariantFile(vcf_file)
bed = pysam.TabixFile(bed_file)
variants_by_gene = {}
for gene in bed.fetch(parser=pysam.asBed()):
gene_name = gene.name
variants_by_gene[gene_name] = []
# Find variants in gene region
for variant in vcf.fetch(gene.contig, gene.start, gene.end):
variant_info = {
"chrom": variant.chrom,
"pos": variant.pos,
"ref": variant.ref,
"alt": variant.alts,
"qual": variant.qual
}
variants_by_gene[gene_name].append(variant_info)
vcf.close()
bed.close()
return variants_by_gene
```
### Annotate Variants with Coverage
```python
def annotate_variants_with_coverage(vcf_file, bam_file, output_file):
"""Add coverage information to variants."""
vcf = pysam.VariantFile(vcf_file)
samfile = pysam.AlignmentFile(bam_file, "rb")
# Add DP to header if not present
if "DP" not in vcf.header.info:
vcf.header.info.add("DP", "1", "Integer", "Total Depth from BAM")
outvcf = pysam.VariantFile(output_file, "w", header=vcf.header)
for variant in vcf:
# Get coverage at variant position
coverage = samfile.count(
variant.chrom,
variant.pos - 1, # Convert to 0-based
variant.pos
)
# Add to INFO field
variant.info["DP"] = coverage
outvcf.write(variant)
vcf.close()
samfile.close()
outvcf.close()
```
### Filter Variants by Read Support
```python
def filter_variants_by_support(vcf_file, bam_file, output_file, min_alt_reads=3):
"""Filter variants requiring minimum alternate allele support."""
vcf = pysam.VariantFile(vcf_file)
samfile = pysam.AlignmentFile(bam_file, "rb")
outvcf = pysam.VariantFile(output_file, "w", header=vcf.header)
for variant in vcf:
# Count reads supporting each allele
allele_counts = {variant.ref: 0}
for alt in variant.alts:
allele_counts[alt] = 0
# Pileup at variant position
for pileupcolumn in samfile.pileup(
variant.chrom,
variant.pos - 1,
variant.pos
):
if pileupcolumn.pos == variant.pos - 1: # 0-based
for pileupread in pileupcolumn.pileups:
if not pileupread.is_del and not pileupread.is_refskip:
base = pileupread.alignment.query_sequence[
pileupread.query_position
]
if base in allele_counts:
allele_counts[base] += 1
# Check if any alt allele has sufficient support
has_support = any(
allele_counts.get(alt, 0) >= min_alt_reads
for alt in variant.alts
)
if has_support:
outvcf.write(variant)
vcf.close()
samfile.close()
outvcf.close()
```
## Sequence Extraction
### Extract Sequences Around Variants
```python
def extract_variant_contexts(vcf_file, fasta_file, output_file, window=50):
"""Extract reference sequences around variants."""
vcf = pysam.VariantFile(vcf_file)
fasta = pysam.FastaFile(fasta_file)
with open(output_file, 'w') as out:
for variant in vcf:
# Get sequence context
start = max(0, variant.pos - window - 1) # Convert to 0-based
end = variant.pos + window
context = fasta.fetch(variant.chrom, start, end)
# Mark variant position
var_pos_in_context = variant.pos - 1 - start
out.write(f">{variant.chrom}:{variant.pos} {variant.ref}>{variant.alts}\n")
out.write(context[:var_pos_in_context].lower())
out.write(context[var_pos_in_context:var_pos_in_context+len(variant.ref)].upper())
out.write(context[var_pos_in_context+len(variant.ref):].lower())
out.write("\n")
vcf.close()
fasta.close()
```
### Extract Gene Sequences
```python
def extract_gene_sequences(bed_file, fasta_file, output_fasta):
"""Extract sequences for genes from BED file."""
bed = pysam.TabixFile(bed_file)
fasta = pysam.FastaFile(fasta_file)
with open(output_fasta, 'w') as out:
for gene in bed.fetch(parser=pysam.asBed()):
sequence = fasta.fetch(gene.contig, gene.start, gene.end)
# Handle strand
if hasattr(gene, 'strand') and gene.strand == '-':
# Reverse complement
complement = str.maketrans("ATGCatgcNn", "TACGtacgNn")
sequence = sequence.translate(complement)[::-1]
out.write(f">{gene.name} {gene.contig}:{gene.start}-{gene.end}\n")
# Write sequence in 60-character lines
for i in range(0, len(sequence), 60):
out.write(sequence[i:i+60] + "\n")
bed.close()
fasta.close()
```
## Read Filtering and Subsetting
### Filter BAM by Region and Quality
```python
def filter_bam(input_bam, output_bam, chrom, start, end, min_mapq=20):
"""Filter BAM file by region and mapping quality."""
infile = pysam.AlignmentFile(input_bam, "rb")
outfile = pysam.AlignmentFile(output_bam, "wb", template=infile)
for read in infile.fetch(chrom, start, end):
if read.mapping_quality >= min_mapq and not read.is_duplicate:
outfile.write(read)
infile.close()
outfile.close()
# Create index
pysam.index(output_bam)
```
### Extract Reads for Specific Variants
```python
def extract_reads_at_variants(bam_file, vcf_file, output_bam, window=100):
"""Extract reads overlapping variant positions."""
samfile = pysam.AlignmentFile(bam_file, "rb")
vcf = pysam.VariantFile(vcf_file)
outfile = pysam.AlignmentFile(output_bam, "wb", template=samfile)
# Collect all reads (using set to avoid duplicates)
reads_to_keep = set()
for variant in vcf:
start = max(0, variant.pos - window - 1)
end = variant.pos + window
for read in samfile.fetch(variant.chrom, start, end):
reads_to_keep.add(read.query_name)
# Write all reads
samfile.close()
samfile = pysam.AlignmentFile(bam_file, "rb")
for read in samfile.fetch(until_eof=True):
if read.query_name in reads_to_keep:
outfile.write(read)
samfile.close()
vcf.close()
outfile.close()
pysam.index(output_bam)
```
## Integration Workflows
### Create Coverage Track from BAM
```python
def create_coverage_bedgraph(bam_file, output_file, chrom=None):
"""Create bedGraph coverage track from BAM."""
samfile = pysam.AlignmentFile(bam_file, "rb")
chroms = [chrom] if chrom else samfile.references
with open(output_file, 'w') as out:
out.write("track type=bedGraph name=\"Coverage\"\n")
for chrom in chroms:
current_cov = None
region_start = None
for pileupcolumn in samfile.pileup(chrom):
pos = pileupcolumn.pos
cov = pileupcolumn.nsegments
if cov != current_cov:
# Write previous region
if current_cov is not None:
out.write(f"{chrom}\t{region_start}\t{pos}\t{current_cov}\n")
# Start new region
current_cov = cov
region_start = pos
# Write final region
if current_cov is not None:
out.write(f"{chrom}\t{region_start}\t{pos+1}\t{current_cov}\n")
samfile.close()
```
### Merge Multiple VCF Files
```python
def merge_vcf_samples(vcf_files, output_file):
"""Merge multiple single-sample VCFs."""
# Open all input files
vcf_readers = [pysam.VariantFile(f) for f in vcf_files]
# Create merged header
merged_header = vcf_readers[0].header.copy()
for vcf in vcf_readers[1:]:
for sample in vcf.header.samples:
merged_header.samples.add(sample)
outvcf = pysam.VariantFile(output_file, "w", header=merged_header)
# Get all variant positions
all_variants = {}
for vcf in vcf_readers:
for variant in vcf:
key = (variant.chrom, variant.pos, variant.ref, variant.alts)
if key not in all_variants:
all_variants[key] = []
all_variants[key].append(variant)
# Write merged variants
for key, variants in sorted(all_variants.items()):
# Create merged record from first variant
merged = outvcf.new_record(
contig=variants[0].chrom,
start=variants[0].start,
stop=variants[0].stop,
alleles=variants[0].alleles
)
# Add genotypes from all samples
for variant in variants:
for sample in variant.samples:
merged.samples[sample].update(variant.samples[sample])
outvcf.write(merged)
# Close all files
for vcf in vcf_readers:
vcf.close()
outvcf.close()
```
## Performance Tips for Workflows
1. **Use indexed files** for all random access operations
2. **Process regions in parallel** when analyzing multiple independent regions
3. **Stream data when possible** - avoid loading entire files into memory
4. **Close files explicitly** to free resources
5. **Use `until_eof=True`** for sequential processing of entire files
6. **Batch operations** on the same file to minimize I/O
7. **Consider memory usage** with pileup operations on high-coverage regions
8. **Use count() instead of pileup()** when only counts are needed
## Common Integration Patterns
1. **BAM + Reference**: Verify alignments, extract aligned sequences
2. **BAM + VCF**: Validate variants, calculate allele frequencies
3. **VCF + BED**: Annotate variants with gene/region information
4. **BAM + BED**: Calculate coverage statistics for specific regions
5. **FASTA + VCF**: Extract variant context sequences
6. **Multiple BAMs**: Compare coverage or variants across samples
7. **BAM + FASTQ**: Extract unaligned reads for re-alignment