Files
gh-k-dense-ai-claude-scient…/skills/gget/scripts/batch_sequence_analysis.py
2025-11-30 08:30:10 +08:00

192 lines
5.8 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Batch Sequence Analysis Script
Analyze multiple sequences: BLAST, alignment, and structure prediction
"""
import argparse
import sys
from pathlib import Path
import gget
def read_fasta(fasta_file):
"""Read sequences from FASTA file."""
sequences = []
current_id = None
current_seq = []
with open(fasta_file, "r") as f:
for line in f:
line = line.strip()
if line.startswith(">"):
if current_id:
sequences.append({"id": current_id, "seq": "".join(current_seq)})
current_id = line[1:]
current_seq = []
else:
current_seq.append(line)
if current_id:
sequences.append({"id": current_id, "seq": "".join(current_seq)})
return sequences
def analyze_sequences(
fasta_file,
blast_db="nr",
align=True,
predict_structure=False,
output_dir="output",
):
"""
Perform batch sequence analysis.
Args:
fasta_file: Path to FASTA file with sequences
blast_db: BLAST database to search (default: nr)
align: Whether to perform multiple sequence alignment
predict_structure: Whether to predict structures with AlphaFold
output_dir: Output directory for results
"""
output_path = Path(output_dir)
output_path.mkdir(exist_ok=True)
print(f"Batch Sequence Analysis")
print("=" * 60)
print(f"Input file: {fasta_file}")
print(f"Output directory: {output_dir}")
print("")
# Read sequences
print("Reading sequences...")
sequences = read_fasta(fasta_file)
print(f"Found {len(sequences)} sequences\n")
# Step 1: BLAST each sequence
print("Step 1: Running BLAST searches...")
print("-" * 60)
for i, seq_data in enumerate(sequences):
print(f"\n{i+1}. BLASTing {seq_data['id']}...")
try:
blast_results = gget.blast(
seq_data["seq"], database=blast_db, limit=10, save=False
)
output_file = output_path / f"{seq_data['id']}_blast.csv"
blast_results.to_csv(output_file, index=False)
print(f" Results saved to: {output_file}")
if len(blast_results) > 0:
print(f" Top hit: {blast_results.iloc[0]['Description']}")
print(
f" Max Score: {blast_results.iloc[0]['Max Score']}, "
f"Query Coverage: {blast_results.iloc[0]['Query Coverage']}"
)
except Exception as e:
print(f" Error: {e}")
# Step 2: Multiple sequence alignment
if align and len(sequences) > 1:
print("\n\nStep 2: Multiple sequence alignment...")
print("-" * 60)
try:
alignment = gget.muscle(fasta_file)
alignment_file = output_path / "alignment.afa"
with open(alignment_file, "w") as f:
f.write(alignment)
print(f"Alignment saved to: {alignment_file}")
except Exception as e:
print(f"Error in alignment: {e}")
else:
print("\n\nStep 2: Skipping alignment (only 1 sequence or disabled)")
# Step 3: Structure prediction (optional)
if predict_structure:
print("\n\nStep 3: Predicting structures with AlphaFold...")
print("-" * 60)
print(
"Note: This requires 'gget setup alphafold' and is computationally intensive"
)
for i, seq_data in enumerate(sequences):
print(f"\n{i+1}. Predicting structure for {seq_data['id']}...")
try:
structure_dir = output_path / f"structure_{seq_data['id']}"
# Uncomment to run AlphaFold prediction:
# gget.alphafold(seq_data['seq'], out=str(structure_dir))
# print(f" Structure saved to: {structure_dir}")
print(
" (Prediction skipped - uncomment code to run AlphaFold prediction)"
)
except Exception as e:
print(f" Error: {e}")
else:
print("\n\nStep 3: Structure prediction disabled")
# Summary
print("\n" + "=" * 60)
print("Batch analysis complete!")
print(f"\nResults saved to: {output_dir}/")
print(f" - BLAST results: *_blast.csv")
if align and len(sequences) > 1:
print(f" - Alignment: alignment.afa")
if predict_structure:
print(f" - Structures: structure_*/")
return True
def main():
parser = argparse.ArgumentParser(
description="Perform batch sequence analysis using gget"
)
parser.add_argument("fasta", help="Input FASTA file with sequences")
parser.add_argument(
"-db",
"--database",
default="nr",
help="BLAST database (default: nr for proteins, nt for nucleotides)",
)
parser.add_argument(
"--no-align", action="store_true", help="Skip multiple sequence alignment"
)
parser.add_argument(
"--predict-structure",
action="store_true",
help="Predict structures with AlphaFold (requires setup)",
)
parser.add_argument(
"-o", "--output", default="output", help="Output directory (default: output)"
)
args = parser.parse_args()
if not Path(args.fasta).exists():
print(f"Error: File not found: {args.fasta}")
sys.exit(1)
try:
success = analyze_sequences(
args.fasta,
blast_db=args.database,
align=not args.no_align,
predict_structure=args.predict_structure,
output_dir=args.output,
)
sys.exit(0 if success else 1)
except KeyboardInterrupt:
print("\n\nAnalysis interrupted by user")
sys.exit(1)
except Exception as e:
print(f"\n\nError: {e}")
import traceback
traceback.print_exc()
sys.exit(1)
if __name__ == "__main__":
main()