# Common Bioinformatics Workflows with Pysam

## Overview

This document provides practical examples of common bioinformatics workflows using pysam, demonstrating how to combine different file types and operations.

## Quality Control Workflows

### Calculate BAM Statistics

```python
import pysam

def calculate_bam_stats(bam_file):
    """Calculate basic statistics for a BAM file."""
    samfile = pysam.AlignmentFile(bam_file, "rb")

    stats = {
        "total_reads": 0,
        "mapped_reads": 0,
        "unmapped_reads": 0,
        "paired_reads": 0,
        "proper_pairs": 0,
        "duplicates": 0,
        "total_bases": 0,
        "mapped_bases": 0
    }

    for read in samfile.fetch(until_eof=True):
        stats["total_reads"] += 1

        if read.is_unmapped:
            stats["unmapped_reads"] += 1
        else:
            stats["mapped_reads"] += 1
            stats["mapped_bases"] += read.query_alignment_length

        if read.is_paired:
            stats["paired_reads"] += 1
            if read.is_proper_pair:
                stats["proper_pairs"] += 1

        if read.is_duplicate:
            stats["duplicates"] += 1

        stats["total_bases"] += read.query_length

    samfile.close()

    # Calculate derived statistics
    stats["mapping_rate"] = stats["mapped_reads"] / stats["total_reads"] if stats["total_reads"] > 0 else 0
    stats["duplication_rate"] = stats["duplicates"] / stats["total_reads"] if stats["total_reads"] > 0 else 0

    return stats
```

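A minimal usage sketch (the file name `sample.bam` is a placeholder; `until_eof=True` also works on BAMs without an index):

```python
# Hypothetical input file; any BAM works here
stats = calculate_bam_stats("sample.bam")
print(f"Mapped: {stats['mapped_reads']}/{stats['total_reads']} "
      f"({stats['mapping_rate']:.2%}), duplicates: {stats['duplication_rate']:.2%}")
```
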
### Check Reference Consistency

```python
def check_bam_reference_consistency(bam_file, fasta_file, max_positions=10000):
    """Verify that BAM reads match the reference genome."""
    samfile = pysam.AlignmentFile(bam_file, "rb")
    fasta = pysam.FastaFile(fasta_file)

    mismatches = 0
    total_checked = 0

    for read in samfile.fetch():
        if read.is_unmapped or read.query_sequence is None:
            continue

        # Get the reference sequence for the aligned region
        ref_seq = fasta.fetch(
            read.reference_name,
            read.reference_start,
            read.reference_end
        )

        # Compare each aligned (match/mismatch) position against the reference
        for query_pos, ref_pos in read.get_aligned_pairs(matches_only=True):
            read_base = read.query_sequence[query_pos]
            ref_base = ref_seq[ref_pos - read.reference_start]
            if read_base.upper() != ref_base.upper():
                mismatches += 1
            total_checked += 1

        if total_checked >= max_positions:  # Sample the first ~10k positions
            break

    samfile.close()
    fasta.close()

    error_rate = mismatches / total_checked if total_checked > 0 else 0
    return {
        "positions_checked": total_checked,
        "mismatches": mismatches,
        "error_rate": error_rate
    }
```

## Coverage Analysis

### Calculate Per-Base Coverage

```python
def calculate_coverage(bam_file, chrom, start, end):
    """Calculate coverage for each position in a region."""
    samfile = pysam.AlignmentFile(bam_file, "rb")

    # Initialize coverage array
    length = end - start
    coverage = [0] * length

    # Count coverage at each position; pileup skips zero-coverage
    # positions, which simply stay 0 in the array
    for pileupcolumn in samfile.pileup(chrom, start, end):
        if start <= pileupcolumn.pos < end:
            coverage[pileupcolumn.pos - start] = pileupcolumn.nsegments

    samfile.close()

    return coverage
```

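For modest regions, the same per-base depths can be computed without an explicit pileup loop using pysam's `count_coverage()`, which returns four arrays of per-nucleotide counts (A, C, G, T). Note that its `quality_threshold` defaults to 15, filtering low-quality bases; a minimal sketch that disables the filter for parity with the pileup version:

```python
def calculate_coverage_fast(bam_file, chrom, start, end):
    """Per-base coverage via count_coverage(), summing the four per-nucleotide arrays."""
    with pysam.AlignmentFile(bam_file, "rb") as samfile:
        # quality_threshold=0 disables base-quality filtering
        acgt = samfile.count_coverage(chrom, start, end, quality_threshold=0)
        return [sum(bases) for bases in zip(*acgt)]
```
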
### Identify Low Coverage Regions

```python
def find_low_coverage_regions(bam_file, chrom, start, end, min_coverage=10):
    """Find regions with coverage below a threshold."""
    samfile = pysam.AlignmentFile(bam_file, "rb")

    low_coverage_regions = []
    in_low_region = False
    region_start = None
    expected_pos = start

    # truncate=True restricts columns to [start, end); note that pileup
    # skips zero-coverage positions entirely, so gaps between columns
    # must also be treated as low coverage
    for pileupcolumn in samfile.pileup(chrom, start, end, truncate=True):
        pos = pileupcolumn.pos

        # A gap before this column means zero coverage there
        if pos > expected_pos and not in_low_region:
            region_start = expected_pos
            in_low_region = True

        coverage = pileupcolumn.nsegments

        if coverage < min_coverage:
            if not in_low_region:
                region_start = pos
                in_low_region = True
        else:
            if in_low_region:
                low_coverage_regions.append((region_start, pos))
                in_low_region = False

        expected_pos = pos + 1

    # Close the last region, including any trailing zero-coverage gap
    if in_low_region:
        low_coverage_regions.append((region_start, end))
    elif expected_pos < end:
        low_coverage_regions.append((expected_pos, end))

    samfile.close()

    return low_coverage_regions
```

### Calculate Coverage Statistics

```python
def coverage_statistics(bam_file, chrom, start, end):
    """Calculate coverage statistics for a region."""
    samfile = pysam.AlignmentFile(bam_file, "rb")

    coverages = []

    # Only positions with nonzero coverage appear in the pileup, so these
    # statistics describe covered positions only
    for pileupcolumn in samfile.pileup(chrom, start, end):
        if start <= pileupcolumn.pos < end:
            coverages.append(pileupcolumn.nsegments)

    samfile.close()

    if not coverages:
        return None

    coverages.sort()
    n = len(coverages)

    return {
        "mean": sum(coverages) / n,
        "median": coverages[n // 2],
        "min": coverages[0],
        "max": coverages[-1],
        "positions": n
    }
```

## Variant Analysis

### Extract Variants in Regions

```python
def extract_variants_in_genes(vcf_file, bed_file):
    """Extract variants overlapping gene regions."""
    vcf = pysam.VariantFile(vcf_file)
    bed = pysam.TabixFile(bed_file)

    variants_by_gene = {}

    for gene in bed.fetch(parser=pysam.asBed()):
        gene_name = gene.name
        variants_by_gene[gene_name] = []

        # Find variants in the gene region
        for variant in vcf.fetch(gene.contig, gene.start, gene.end):
            variant_info = {
                "chrom": variant.chrom,
                "pos": variant.pos,
                "ref": variant.ref,
                "alt": variant.alts,
                "qual": variant.qual
            }
            variants_by_gene[gene_name].append(variant_info)

    vcf.close()
    bed.close()

    return variants_by_gene
```

### Annotate Variants with Coverage

```python
def annotate_variants_with_coverage(vcf_file, bam_file, output_file):
    """Add coverage information to variants."""
    vcf = pysam.VariantFile(vcf_file)
    samfile = pysam.AlignmentFile(bam_file, "rb")

    # Add DP to the header if not present
    if "DP" not in vcf.header.info:
        vcf.header.info.add("DP", "1", "Integer", "Total Depth from BAM")

    outvcf = pysam.VariantFile(output_file, "w", header=vcf.header)

    for variant in vcf:
        # Get coverage at the variant position
        coverage = samfile.count(
            variant.chrom,
            variant.pos - 1,  # Convert to 0-based
            variant.pos
        )

        # Add to the INFO field
        variant.info["DP"] = coverage

        outvcf.write(variant)

    vcf.close()
    samfile.close()
    outvcf.close()
```

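If downstream steps need random access to the annotated VCF, it can be bgzip-compressed and tabix-indexed afterwards with pysam's tabix helper. A sketch (this compresses the plain-text file in place, producing `output_file + ".gz"` plus a `.tbi` index):

```python
# Compress and index the annotated VCF written above
pysam.tabix_index(output_file, preset="vcf", force=True)
```
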
### Filter Variants by Read Support

```python
def filter_variants_by_support(vcf_file, bam_file, output_file, min_alt_reads=3):
    """Filter variants, requiring minimum alternate-allele read support.

    Note: this compares single bases at the variant position, so it
    counts support correctly only for SNVs, not indels.
    """
    vcf = pysam.VariantFile(vcf_file)
    samfile = pysam.AlignmentFile(bam_file, "rb")
    outvcf = pysam.VariantFile(output_file, "w", header=vcf.header)

    for variant in vcf:
        # Count reads supporting each allele
        allele_counts = {variant.ref: 0}
        for alt in variant.alts:
            allele_counts[alt] = 0

        # Pileup restricted to the variant position (0-based)
        for pileupcolumn in samfile.pileup(
            variant.chrom,
            variant.pos - 1,
            variant.pos,
            truncate=True
        ):
            for pileupread in pileupcolumn.pileups:
                if not pileupread.is_del and not pileupread.is_refskip:
                    base = pileupread.alignment.query_sequence[
                        pileupread.query_position
                    ]
                    if base in allele_counts:
                        allele_counts[base] += 1

        # Check whether any alt allele has sufficient support
        has_support = any(
            allele_counts.get(alt, 0) >= min_alt_reads
            for alt in variant.alts
        )

        if has_support:
            outvcf.write(variant)

    vcf.close()
    samfile.close()
    outvcf.close()
```

## Sequence Extraction

### Extract Sequences Around Variants

```python
def extract_variant_contexts(vcf_file, fasta_file, output_file, window=50):
    """Extract reference sequences around variants."""
    vcf = pysam.VariantFile(vcf_file)
    fasta = pysam.FastaFile(fasta_file)

    with open(output_file, 'w') as out:
        for variant in vcf:
            # Get the sequence context
            start = max(0, variant.pos - window - 1)  # Convert to 0-based
            end = variant.pos + window

            context = fasta.fetch(variant.chrom, start, end)

            # Mark the variant position (uppercase REF, lowercase flanks)
            var_pos_in_context = variant.pos - 1 - start
            alts = ",".join(variant.alts) if variant.alts else "."

            out.write(f">{variant.chrom}:{variant.pos} {variant.ref}>{alts}\n")
            out.write(context[:var_pos_in_context].lower())
            out.write(context[var_pos_in_context:var_pos_in_context + len(variant.ref)].upper())
            out.write(context[var_pos_in_context + len(variant.ref):].lower())
            out.write("\n")

    vcf.close()
    fasta.close()
```

### Extract Gene Sequences

```python
def extract_gene_sequences(bed_file, fasta_file, output_fasta):
    """Extract sequences for genes from a BED file."""
    bed = pysam.TabixFile(bed_file)
    fasta = pysam.FastaFile(fasta_file)

    with open(output_fasta, 'w') as out:
        for gene in bed.fetch(parser=pysam.asBed()):
            sequence = fasta.fetch(gene.contig, gene.start, gene.end)

            # Handle strand
            if hasattr(gene, 'strand') and gene.strand == '-':
                # Reverse complement
                complement = str.maketrans("ATGCatgcNn", "TACGtacgNn")
                sequence = sequence.translate(complement)[::-1]

            out.write(f">{gene.name} {gene.contig}:{gene.start}-{gene.end}\n")

            # Write sequence in 60-character lines
            for i in range(0, len(sequence), 60):
                out.write(sequence[i:i+60] + "\n")

    bed.close()
    fasta.close()
```

## Read Filtering and Subsetting

### Filter BAM by Region and Quality

```python
def filter_bam(input_bam, output_bam, chrom, start, end, min_mapq=20):
    """Filter a BAM file by region and mapping quality."""
    infile = pysam.AlignmentFile(input_bam, "rb")
    outfile = pysam.AlignmentFile(output_bam, "wb", template=infile)

    for read in infile.fetch(chrom, start, end):
        if read.mapping_quality >= min_mapq and not read.is_duplicate:
            outfile.write(read)

    infile.close()
    outfile.close()

    # Create an index
    pysam.index(output_bam)
```

### Extract Reads for Specific Variants

```python
def extract_reads_at_variants(bam_file, vcf_file, output_bam, window=100):
    """Extract reads overlapping variant positions."""
    samfile = pysam.AlignmentFile(bam_file, "rb")
    vcf = pysam.VariantFile(vcf_file)
    outfile = pysam.AlignmentFile(output_bam, "wb", template=samfile)

    # Collect read names (using a set to avoid duplicates)
    reads_to_keep = set()

    for variant in vcf:
        start = max(0, variant.pos - window - 1)
        end = variant.pos + window

        for read in samfile.fetch(variant.chrom, start, end):
            reads_to_keep.add(read.query_name)

    # Second pass: write every read (including mates) with a collected name
    samfile.close()
    samfile = pysam.AlignmentFile(bam_file, "rb")

    for read in samfile.fetch(until_eof=True):
        if read.query_name in reads_to_keep:
            outfile.write(read)

    samfile.close()
    vcf.close()
    outfile.close()

    pysam.index(output_bam)
```

## Integration Workflows

### Create Coverage Track from BAM

```python
def create_coverage_bedgraph(bam_file, output_file, chrom=None):
    """Create a bedGraph coverage track from a BAM file."""
    samfile = pysam.AlignmentFile(bam_file, "rb")

    chroms = [chrom] if chrom else samfile.references

    with open(output_file, 'w') as out:
        out.write("track type=bedGraph name=\"Coverage\"\n")

        for chrom in chroms:
            current_cov = None
            region_start = None
            prev_pos = None

            for pileupcolumn in samfile.pileup(chrom):
                pos = pileupcolumn.pos
                cov = pileupcolumn.nsegments

                # Close the open region at a coverage change, or at a
                # zero-coverage gap (pileup skips uncovered positions)
                if current_cov is not None and (cov != current_cov or pos != prev_pos + 1):
                    out.write(f"{chrom}\t{region_start}\t{prev_pos + 1}\t{current_cov}\n")
                    current_cov = None

                # Start a new region
                if current_cov is None:
                    current_cov = cov
                    region_start = pos

                prev_pos = pos

            # Write the final region for this chromosome
            if current_cov is not None:
                out.write(f"{chrom}\t{region_start}\t{prev_pos + 1}\t{current_cov}\n")

    samfile.close()
```

### Merge Multiple VCF Files

```python
def merge_vcf_samples(vcf_files, output_file):
    """Merge multiple single-sample VCFs (a simplified merge on identical sites).

    Assumes sample names are unique across inputs; FORMAT fields other
    than GT and per-file INFO fields are not reconciled.
    """
    # Open all input files
    vcf_readers = [pysam.VariantFile(f) for f in vcf_files]

    # Create a merged header containing every sample
    merged_header = vcf_readers[0].header.copy()
    for vcf in vcf_readers[1:]:
        for sample in vcf.header.samples:
            merged_header.add_sample(sample)

    outvcf = pysam.VariantFile(output_file, "w", header=merged_header)

    # Group variants by site and alleles
    all_variants = {}
    for vcf in vcf_readers:
        for variant in vcf:
            key = (variant.chrom, variant.pos, variant.ref, variant.alts)
            if key not in all_variants:
                all_variants[key] = []
            all_variants[key].append(variant)

    # Write merged variants
    for key, variants in sorted(all_variants.items()):
        # Create a merged record from the first variant at each site
        merged = outvcf.new_record(
            contig=variants[0].chrom,
            start=variants[0].start,
            stop=variants[0].stop,
            alleles=variants[0].alleles
        )

        # Copy genotypes from all samples
        for variant in variants:
            for sample in variant.samples:
                merged.samples[sample]["GT"] = variant.samples[sample]["GT"]

        outvcf.write(merged)

    # Close all files
    for vcf in vcf_readers:
        vcf.close()
    outvcf.close()
```

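For production use, full VCF merging (allele reconciliation, INFO/FORMAT handling) is best left to dedicated tools. If your pysam build ships the bundled bcftools wrapper, a one-liner along these lines may suffice; a hedged sketch, with placeholder file names and assuming bgzipped, indexed inputs:

```python
# Hypothetical invocation of the bcftools wrapper bundled with pysam;
# availability and argument handling depend on the pysam build.
merged_text = pysam.bcftools.merge("-O", "v", "a.vcf.gz", "b.vcf.gz")
with open("merged.vcf", "w") as out:
    out.write(merged_text)
```
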
## Performance Tips for Workflows

1. **Use indexed files** for all random-access operations
2. **Process regions in parallel** when analyzing multiple independent regions
3. **Stream data when possible** - avoid loading entire files into memory
4. **Close files explicitly** to free resources
5. **Use `until_eof=True`** for sequential processing of entire files
6. **Batch operations** on the same file to minimize I/O
7. **Consider memory usage** with pileup operations on high-coverage regions
8. **Use `count()` instead of `pileup()`** when only read counts are needed (see the sketch below)

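As a concrete illustration of tip 8, `count()` returns the number of reads overlapping a region in a single call, where a pileup would build full per-column structures. A minimal sketch (file name and coordinates are placeholders):

```python
with pysam.AlignmentFile("sample.bam", "rb") as samfile:
    # Reads overlapping chr1:1,000,000-1,001,000 (0-based, half-open)
    n_reads = samfile.count("chr1", 1_000_000, 1_001_000)

    # An optional callback restricts which reads are counted
    n_hq = samfile.count("chr1", 1_000_000, 1_001_000,
                         read_callback=lambda r: r.mapping_quality >= 20)
```
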
## Common Integration Patterns

1. **BAM + Reference**: Verify alignments, extract aligned sequences
2. **BAM + VCF**: Validate variants, calculate allele frequencies
3. **VCF + BED**: Annotate variants with gene/region information
4. **BAM + BED**: Calculate coverage statistics for specific regions
5. **FASTA + VCF**: Extract variant context sequences
6. **Multiple BAMs**: Compare coverage or variants across samples
7. **BAM + FASTQ**: Extract unaligned reads for re-alignment (see the sketch below)

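Pattern 7 is the only one without a worked example above; a minimal sketch of extracting unmapped reads to FASTQ for re-alignment (the function name and file paths are illustrative):

```python
def unmapped_to_fastq(bam_file, fastq_out):
    """Write unmapped reads from a BAM to a FASTQ file."""
    with pysam.AlignmentFile(bam_file, "rb") as samfile, \
         open(fastq_out, "w") as out:
        for read in samfile.fetch(until_eof=True):
            if read.is_unmapped and read.query_sequence:
                quals = read.query_qualities
                # Convert the numeric quality array back to an ASCII string,
                # falling back to a dummy string if qualities are absent
                qual = (pysam.qualities_to_qualitystring(quals)
                        if quals is not None else "I" * len(read.query_sequence))
                out.write(f"@{read.query_name}\n{read.query_sequence}\n+\n{qual}\n")
```
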