# Common Bioinformatics Workflows with Pysam

## Overview

This document provides practical examples of common bioinformatics workflows using pysam, demonstrating how to combine different file types and operations.

## Quality Control Workflows

### Calculate BAM Statistics

```python
import pysam

def calculate_bam_stats(bam_file):
    """Calculate basic statistics for BAM file."""
    samfile = pysam.AlignmentFile(bam_file, "rb")

    stats = {
        "total_reads": 0,
        "mapped_reads": 0,
        "unmapped_reads": 0,
        "paired_reads": 0,
        "proper_pairs": 0,
        "duplicates": 0,
        "total_bases": 0,
        "mapped_bases": 0
    }

    for read in samfile.fetch(until_eof=True):
        stats["total_reads"] += 1

        if read.is_unmapped:
            stats["unmapped_reads"] += 1
        else:
            stats["mapped_reads"] += 1
            stats["mapped_bases"] += read.query_alignment_length

        if read.is_paired:
            stats["paired_reads"] += 1
            if read.is_proper_pair:
                stats["proper_pairs"] += 1

        if read.is_duplicate:
            stats["duplicates"] += 1

        stats["total_bases"] += read.query_length

    samfile.close()

    # Calculate derived statistics
    stats["mapping_rate"] = stats["mapped_reads"] / stats["total_reads"] if stats["total_reads"] > 0 else 0
    stats["duplication_rate"] = stats["duplicates"] / stats["total_reads"] if stats["total_reads"] > 0 else 0

    return stats
```
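
A quick way to sanity-check the output (the file path is a placeholder):

```python
# Example usage; "sample.bam" is an illustrative path
stats = calculate_bam_stats("sample.bam")
print(f"Mapping rate: {stats['mapping_rate']:.2%}")
print(f"Duplication rate: {stats['duplication_rate']:.2%}")
```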

### Check Reference Consistency

```python
def check_bam_reference_consistency(bam_file, fasta_file):
    """Verify that BAM reads match the reference genome."""
    samfile = pysam.AlignmentFile(bam_file, "rb")
    fasta = pysam.FastaFile(fasta_file)

    mismatches = 0
    total_checked = 0

    for read in samfile.fetch():
        if read.is_unmapped or read.query_sequence is None:
            continue

        # Get reference sequence for aligned region
        ref_seq = fasta.fetch(
            read.reference_name,
            read.reference_start,
            read.reference_end
        )

        # Compare each aligned base against the fetched reference;
        # matches_only=True skips insertions, deletions, and clips
        for query_pos, ref_pos in read.get_aligned_pairs(matches_only=True):
            read_base = read.query_sequence[query_pos]
            ref_base = ref_seq[ref_pos - read.reference_start]
            if read_base.upper() != ref_base.upper():
                mismatches += 1
            total_checked += 1

        if total_checked >= 10000:  # Sample first 10k positions
            break

    samfile.close()
    fasta.close()

    error_rate = mismatches / total_checked if total_checked > 0 else 0
    return {
        "positions_checked": total_checked,
        "mismatches": mismatches,
        "error_rate": error_rate
    }
```

## Coverage Analysis

### Calculate Per-Base Coverage

```python
def calculate_coverage(bam_file, chrom, start, end):
    """Calculate coverage for each position in region."""
    samfile = pysam.AlignmentFile(bam_file, "rb")

    # Initialize coverage array
    length = end - start
    coverage = [0] * length

    # Count coverage at each position
    for pileupcolumn in samfile.pileup(chrom, start, end):
        if start <= pileupcolumn.pos < end:
            coverage[pileupcolumn.pos - start] = pileupcolumn.nsegments

    samfile.close()

    return coverage
```
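
If only total depth is needed, pysam's built-in `count_coverage()` can replace the manual pileup loop. A minimal sketch (the function name is ours); note that `count_coverage()` applies a base-quality threshold (15 by default), so its numbers can differ slightly from the pileup-based version above:

```python
def calculate_coverage_fast(bam_file, chrom, start, end):
    """Per-base depth using count_coverage (one value per position)."""
    with pysam.AlignmentFile(bam_file, "rb") as samfile:
        # count_coverage returns four arrays (A, C, G, T counts)
        # covering [start, end)
        acgt = samfile.count_coverage(chrom, start, end)
    # Total depth at each position is the sum across the four bases
    return [sum(counts) for counts in zip(*acgt)]
```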

### Identify Low Coverage Regions

```python
def find_low_coverage_regions(bam_file, chrom, start, end, min_coverage=10):
    """Find regions with coverage below threshold."""
    samfile = pysam.AlignmentFile(bam_file, "rb")

    # Build a per-base coverage array first: pileup() skips
    # zero-coverage positions entirely, so they must default to 0
    # or uncovered stretches would never be reported as low coverage.
    coverage = [0] * (end - start)
    for pileupcolumn in samfile.pileup(chrom, start, end):
        if start <= pileupcolumn.pos < end:
            coverage[pileupcolumn.pos - start] = pileupcolumn.nsegments

    samfile.close()

    low_coverage_regions = []
    in_low_region = False
    region_start = None

    for i, cov in enumerate(coverage):
        if cov < min_coverage:
            if not in_low_region:
                region_start = start + i
                in_low_region = True
        else:
            if in_low_region:
                low_coverage_regions.append((region_start, start + i))
                in_low_region = False

    # Close last region if still open
    if in_low_region:
        low_coverage_regions.append((region_start, end))

    return low_coverage_regions
```

### Calculate Coverage Statistics

```python
def coverage_statistics(bam_file, chrom, start, end):
    """Calculate coverage statistics for region."""
    samfile = pysam.AlignmentFile(bam_file, "rb")

    coverages = []

    # Note: pileup() only reports covered positions, so zero-coverage
    # positions in the region are excluded from these statistics.
    for pileupcolumn in samfile.pileup(chrom, start, end):
        if start <= pileupcolumn.pos < end:
            coverages.append(pileupcolumn.nsegments)

    samfile.close()

    if not coverages:
        return None

    coverages.sort()
    n = len(coverages)

    return {
        "mean": sum(coverages) / n,
        "median": coverages[n // 2],
        "min": coverages[0],
        "max": coverages[-1],
        "positions": n
    }
```

## Variant Analysis

### Extract Variants in Regions

```python
def extract_variants_in_genes(vcf_file, bed_file):
    """Extract variants overlapping gene regions."""
    vcf = pysam.VariantFile(vcf_file)    # fetch() requires a tabix index
    bed = pysam.TabixFile(bed_file)

    variants_by_gene = {}

    for gene in bed.fetch(parser=pysam.asBed()):
        gene_name = gene.name
        variants_by_gene[gene_name] = []

        # Find variants in gene region
        for variant in vcf.fetch(gene.contig, gene.start, gene.end):
            variant_info = {
                "chrom": variant.chrom,
                "pos": variant.pos,
                "ref": variant.ref,
                "alt": variant.alts,
                "qual": variant.qual
            }
            variants_by_gene[gene_name].append(variant_info)

    vcf.close()
    bed.close()

    return variants_by_gene
```

### Annotate Variants with Coverage

```python
def annotate_variants_with_coverage(vcf_file, bam_file, output_file):
    """Add coverage information to variants."""
    vcf = pysam.VariantFile(vcf_file)
    samfile = pysam.AlignmentFile(bam_file, "rb")

    # Add DP to header if not present
    if "DP" not in vcf.header.info:
        vcf.header.info.add("DP", "1", "Integer", "Total Depth from BAM")

    outvcf = pysam.VariantFile(output_file, "w", header=vcf.header)

    for variant in vcf:
        # Get coverage at variant position
        coverage = samfile.count(
            variant.chrom,
            variant.pos - 1,  # Convert to 0-based
            variant.pos
        )

        # Add to INFO field
        variant.info["DP"] = coverage

        outvcf.write(variant)

    vcf.close()
    samfile.close()
    outvcf.close()
```
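
One caveat with the `count()` call above: with the default `read_callback="nofilter"` it counts every overlapping read, including duplicates and secondary alignments. A minimal sketch of a stricter depth lookup using pysam's built-in `"all"` callback (the helper name `depth_at` is ours):

```python
import pysam

def depth_at(samfile, chrom, pos):
    """Depth at a 1-based position, excluding unmapped, secondary,
    QC-fail, and duplicate reads via the built-in 'all' callback."""
    return samfile.count(chrom, pos - 1, pos, read_callback="all")
```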

### Filter Variants by Read Support

```python
def filter_variants_by_support(vcf_file, bam_file, output_file, min_alt_reads=3):
    """Filter variants requiring minimum alternate allele support."""
    vcf = pysam.VariantFile(vcf_file)
    samfile = pysam.AlignmentFile(bam_file, "rb")
    outvcf = pysam.VariantFile(output_file, "w", header=vcf.header)

    for variant in vcf:
        # Count reads supporting each allele (single-base alleles only;
        # this base-by-base comparison does not detect indel support)
        allele_counts = {variant.ref: 0}
        for alt in variant.alts:
            allele_counts[alt] = 0

        # Pileup at variant position
        for pileupcolumn in samfile.pileup(
            variant.chrom,
            variant.pos - 1,
            variant.pos
        ):
            if pileupcolumn.pos == variant.pos - 1:  # 0-based
                for pileupread in pileupcolumn.pileups:
                    if not pileupread.is_del and not pileupread.is_refskip:
                        base = pileupread.alignment.query_sequence[
                            pileupread.query_position
                        ]
                        if base in allele_counts:
                            allele_counts[base] += 1

        # Check if any alt allele has sufficient support
        has_support = any(
            allele_counts.get(alt, 0) >= min_alt_reads
            for alt in variant.alts
        )

        if has_support:
            outvcf.write(variant)

    vcf.close()
    samfile.close()
    outvcf.close()
```

## Sequence Extraction

### Extract Sequences Around Variants

```python
def extract_variant_contexts(vcf_file, fasta_file, output_file, window=50):
    """Extract reference sequences around variants."""
    vcf = pysam.VariantFile(vcf_file)
    fasta = pysam.FastaFile(fasta_file)

    with open(output_file, 'w') as out:
        for variant in vcf:
            # Get sequence context
            start = max(0, variant.pos - window - 1)  # Convert to 0-based
            end = variant.pos + window

            context = fasta.fetch(variant.chrom, start, end)

            # Mark variant position
            var_pos_in_context = variant.pos - 1 - start

            # variant.alts is a tuple (or None), so join it for the header line
            alts = ",".join(variant.alts) if variant.alts else "."

            out.write(f">{variant.chrom}:{variant.pos} {variant.ref}>{alts}\n")
            out.write(context[:var_pos_in_context].lower())
            out.write(context[var_pos_in_context:var_pos_in_context + len(variant.ref)].upper())
            out.write(context[var_pos_in_context + len(variant.ref):].lower())
            out.write("\n")

    vcf.close()
    fasta.close()
```

### Extract Gene Sequences

```python
def extract_gene_sequences(bed_file, fasta_file, output_fasta):
    """Extract sequences for genes from BED file."""
    bed = pysam.TabixFile(bed_file)
    fasta = pysam.FastaFile(fasta_file)

    with open(output_fasta, 'w') as out:
        for gene in bed.fetch(parser=pysam.asBed()):
            sequence = fasta.fetch(gene.contig, gene.start, gene.end)

            # Handle strand
            if hasattr(gene, 'strand') and gene.strand == '-':
                # Reverse complement
                complement = str.maketrans("ATGCatgcNn", "TACGtacgNn")
                sequence = sequence.translate(complement)[::-1]

            out.write(f">{gene.name} {gene.contig}:{gene.start}-{gene.end}\n")

            # Write sequence in 60-character lines
            for i in range(0, len(sequence), 60):
                out.write(sequence[i:i + 60] + "\n")

    bed.close()
    fasta.close()
```

## Read Filtering and Subsetting

### Filter BAM by Region and Quality

```python
def filter_bam(input_bam, output_bam, chrom, start, end, min_mapq=20):
    """Filter BAM file by region and mapping quality."""
    infile = pysam.AlignmentFile(input_bam, "rb")
    outfile = pysam.AlignmentFile(output_bam, "wb", template=infile)

    for read in infile.fetch(chrom, start, end):
        if read.mapping_quality >= min_mapq and not read.is_duplicate:
            outfile.write(read)

    infile.close()
    outfile.close()

    # Create index
    pysam.index(output_bam)
```

### Extract Reads for Specific Variants

```python
def extract_reads_at_variants(bam_file, vcf_file, output_bam, window=100):
    """Extract reads overlapping variant positions."""
    samfile = pysam.AlignmentFile(bam_file, "rb")
    vcf = pysam.VariantFile(vcf_file)
    outfile = pysam.AlignmentFile(output_bam, "wb", template=samfile)

    # Collect all read names (using a set to avoid duplicates)
    reads_to_keep = set()

    for variant in vcf:
        start = max(0, variant.pos - window - 1)
        end = variant.pos + window

        for read in samfile.fetch(variant.chrom, start, end):
            reads_to_keep.add(read.query_name)

    # Write all reads
    samfile.close()
    samfile = pysam.AlignmentFile(bam_file, "rb")

    for read in samfile.fetch(until_eof=True):
        if read.query_name in reads_to_keep:
            outfile.write(read)

    samfile.close()
    vcf.close()
    outfile.close()

    pysam.index(output_bam)
```

## Integration Workflows

### Create Coverage Track from BAM

```python
def create_coverage_bedgraph(bam_file, output_file, chrom=None):
    """Create bedGraph coverage track from BAM."""
    samfile = pysam.AlignmentFile(bam_file, "rb")

    chroms = [chrom] if chrom else samfile.references

    with open(output_file, 'w') as out:
        out.write("track type=bedGraph name=\"Coverage\"\n")

        for chrom_name in chroms:
            current_cov = None
            region_start = None
            prev_pos = None

            for pileupcolumn in samfile.pileup(chrom_name):
                pos = pileupcolumn.pos
                cov = pileupcolumn.nsegments

                # pileup() skips zero-coverage positions, so a gap in
                # positions must also end the current region
                if cov != current_cov or (prev_pos is not None and pos != prev_pos + 1):
                    # Write previous region
                    if current_cov is not None:
                        out.write(f"{chrom_name}\t{region_start}\t{prev_pos + 1}\t{current_cov}\n")

                    # Start new region
                    current_cov = cov
                    region_start = pos

                prev_pos = pos

            # Write final region
            if current_cov is not None:
                out.write(f"{chrom_name}\t{region_start}\t{prev_pos + 1}\t{current_cov}\n")

    samfile.close()
```

### Merge Multiple VCF Files

```python
def merge_vcf_samples(vcf_files, output_file):
    """Merge multiple single-sample VCFs."""
    # Open all input files
    vcf_readers = [pysam.VariantFile(f) for f in vcf_files]

    # Create merged header containing every sample
    merged_header = vcf_readers[0].header.copy()
    for vcf in vcf_readers[1:]:
        for sample in vcf.header.samples:
            merged_header.add_sample(sample)

    outvcf = pysam.VariantFile(output_file, "w", header=merged_header)

    # Group variants by site across all inputs
    all_variants = {}
    for vcf in vcf_readers:
        for variant in vcf:
            key = (variant.chrom, variant.pos, variant.ref, variant.alts)
            if key not in all_variants:
                all_variants[key] = []
            all_variants[key].append(variant)

    # Write merged variants
    for key, variants in sorted(all_variants.items()):
        # Create merged record from first variant
        merged = outvcf.new_record(
            contig=variants[0].chrom,
            start=variants[0].start,
            stop=variants[0].stop,
            alleles=variants[0].alleles
        )

        # Add genotypes from all samples
        for variant in variants:
            for sample in variant.samples:
                merged.samples[sample].update(variant.samples[sample])

        outvcf.write(merged)

    # Close all files
    for vcf in vcf_readers:
        vcf.close()
    outvcf.close()
```

## Performance Tips for Workflows

1. **Use indexed files** for all random access operations
2. **Process regions in parallel** when analyzing multiple independent regions (see the sketch after this list)
3. **Stream data when possible** - avoid loading entire files into memory
4. **Close files explicitly** to free resources
5. **Use `until_eof=True`** for sequential processing of entire files
6. **Batch operations** on the same file to minimize I/O
7. **Consider memory usage** with pileup operations on high-coverage regions
8. **Use `count()` instead of `pileup()`** when only counts are needed (also shown below)
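
Tips 2 and 8 can be combined in practice. A minimal sketch of per-chromosome parallelism with `multiprocessing` (the function names are ours, and chromosome-level splitting is just one way to partition the work):

```python
import pysam
from multiprocessing import Pool

def count_region(args):
    """Count reads on one chromosome; count() avoids a full pileup."""
    bam_file, chrom = args
    # Each worker opens its own handle; AlignmentFile objects
    # cannot be shared across processes.
    with pysam.AlignmentFile(bam_file, "rb") as samfile:
        return chrom, samfile.count(chrom)  # requires an index

def parallel_read_counts(bam_file, processes=4):
    """Count reads per chromosome in parallel."""
    with pysam.AlignmentFile(bam_file, "rb") as samfile:
        chroms = list(samfile.references)
    with Pool(processes) as pool:
        return dict(pool.map(count_region, [(bam_file, c) for c in chroms]))
```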

## Common Integration Patterns

1. **BAM + Reference**: Verify alignments, extract aligned sequences
2. **BAM + VCF**: Validate variants, calculate allele frequencies
3. **VCF + BED**: Annotate variants with gene/region information
4. **BAM + BED**: Calculate coverage statistics for specific regions
5. **FASTA + VCF**: Extract variant context sequences
6. **Multiple BAMs**: Compare coverage or variants across samples
7. **BAM + FASTQ**: Extract unaligned reads for re-alignment (see the sketch after this list)
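
As an example of pattern 7, a minimal sketch that pulls unmapped reads out of a BAM as FASTQ for re-alignment (the function name is ours; `samtools fastq` is the usual command-line equivalent):

```python
import pysam

def extract_unmapped_to_fastq(bam_file, fastq_file):
    """Write unmapped reads from a BAM file to FASTQ."""
    samfile = pysam.AlignmentFile(bam_file, "rb")
    with open(fastq_file, "w") as out:
        # until_eof=True ensures unplaced reads at the end of the
        # file are included
        for read in samfile.fetch(until_eof=True):
            if read.is_unmapped and read.query_sequence:
                # Skip reads without stored qualities rather than invent them
                if read.query_qualities is None:
                    continue
                qual = pysam.qualities_to_qualitystring(read.query_qualities)
                out.write(f"@{read.query_name}\n{read.query_sequence}\n+\n{qual}\n")
    samfile.close()
```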