Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 18:02:37 +08:00
commit c1d9dee646
38 changed files with 11210 additions and 0 deletions

View File

@@ -0,0 +1,133 @@
#!/usr/bin/env python3
"""
Download genomes from NCBI using BioProject or Assembly accessions
Usage:
python download_ncbi_genomes.py --bioprojects PRJNA12345 PRJEB67890
python download_ncbi_genomes.py --assemblies GCA_123456789.1 GCF_987654321.1
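Requires: ncbi-datasets-pylib (pip install ncbi-datasets-pylib) for --bioprojects lookups,
and the NCBI 'datasets' CLI (conda install -c conda-forge ncbi-datasets-cli) for downloads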
Author: Bruno de Medeiros (Field Museum)
Based on tutorials by Paul Frandsen (BYU)
"""
import argparse
import sys
import subprocess
def download_using_cli(accessions, output_file="genomes.zip"):
    """
    Download genomes using NCBI datasets CLI

    Runs ``datasets download genome accession <accessions...> --filename
    <output_file>`` as a subprocess and reports the outcome on stdout/stderr.

    Args:
        accessions: BioProject or Assembly accessions. Any iterable of
            strings is accepted (list, tuple, generator); a single string is
            treated as one accession.
        output_file: Name of output zip file

    Returns:
        True if the command exited with status 0. False if no accessions
        were given, the command exited non-zero, or the 'datasets'
        executable could not be found.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(accessions, str):
        accessions = [accessions]
    accessions = list(accessions)

    if not accessions:
        # An empty accession list cannot produce a download; report it here
        # rather than invoking the CLI with no accessions.
        print("Error: no accessions to download", file=sys.stderr)
        return False

    cmd = [
        "datasets", "download", "genome", "accession",
        *accessions,
        "--filename", output_file,
    ]
    print(f"Running: {' '.join(cmd)}")
    print("")

    # Keep the try body limited to the one call that can raise these errors.
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error downloading genomes: {e}", file=sys.stderr)
        print(e.stderr, file=sys.stderr)
        return False
    except FileNotFoundError:
        # Raised by subprocess when the executable itself is missing.
        print("Error: 'datasets' command not found", file=sys.stderr)
        print("Install with: conda install -c conda-forge ncbi-datasets-cli", file=sys.stderr)
        return False

    print(result.stdout)
    print(f"\nDownload complete: {output_file}")
    print("Extract with: unzip " + output_file)
    return True
def get_bioproject_assemblies(bioprojects):
    """
    Look up the assemblies belonging to the given BioProjects through the
    ncbi-datasets-pylib Python API, printing each one as it is found.

    Exits the process with status 1 if ncbi-datasets-pylib cannot be imported.

    Args:
        bioprojects: List of BioProject accessions
    Returns:
        List of (assembly_accession, organism_name) tuples
    """
    # Imported lazily so the rest of the script works without the library.
    try:
        from ncbi.datasets.metadata.genome import (
            get_assembly_metadata_by_bioproject_accessions as fetch_metadata,
        )
    except ImportError:
        print("Error: ncbi-datasets-pylib not installed", file=sys.stderr)
        print("Install with: pip install ncbi-datasets-pylib", file=sys.stderr)
        sys.exit(1)

    print(f"Fetching assembly information for {len(bioprojects)} BioProject(s)...")
    print("")

    found = []
    for record in fetch_metadata(bioprojects):
        accession = record.accession
        organism = record.organism.organism_name
        print(f" {organism}: {accession}")
        found.append((accession, organism))

    print(f"\nFound {len(found)} assemblies")
    return found
def main(argv=None):
    """
    Command-line entry point: parse arguments, resolve accessions, download.

    Args:
        argv: Optional list of argument strings. When None, argparse reads
            the arguments from sys.argv[1:].

    Returns:
        None after printing the accession list in --list-only mode. In every
        other case the function does not return; it ends via sys.exit.

    Raises:
        SystemExit: status 0 when the download succeeds; status 1 when the
            download fails or the BioProjects yield no assemblies; status 2
            (raised by argparse) for invalid argument combinations.
    """
    parser = argparse.ArgumentParser(
        description="Download genomes from NCBI using BioProject or Assembly accessions"
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--bioprojects",
        nargs="+",
        help="BioProject accessions (e.g., PRJNA12345 PRJEB67890)"
    )
    group.add_argument(
        "--assemblies",
        nargs="+",
        help="Assembly accessions (e.g., GCA_123456789.1 GCF_987654321.1)"
    )
    parser.add_argument(
        "-o", "--output",
        default="genomes.zip",
        help="Output zip file name (default: genomes.zip)"
    )
    parser.add_argument(
        "--list-only",
        action="store_true",
        help="List assemblies without downloading (BioProject mode only)"
    )
    args = parser.parse_args(argv)

    # --list-only promises "no download"; with --assemblies it used to be
    # ignored and a download started anyway. Reject the combination instead.
    if args.list_only and not args.bioprojects:
        parser.error("--list-only can only be used with --bioprojects")

    if args.bioprojects:
        assemblies = get_bioproject_assemblies(args.bioprojects)
        if args.list_only:
            print("\nAssembly accessions (use with --assemblies to download):")
            for acc, _name in assemblies:
                print(acc)
            return

        # Download assemblies
        assembly_accs = [acc for acc, _name in assemblies]
        if not assembly_accs:
            # Nothing was resolved, so there is nothing to hand to the CLI.
            print("Error: no assemblies found for the given BioProject(s)", file=sys.stderr)
            sys.exit(1)
        success = download_using_cli(assembly_accs, args.output)
    else:
        # The required mutually exclusive group guarantees --assemblies here.
        success = download_using_cli(args.assemblies, args.output)

    sys.exit(0 if success else 1)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()