Initial commit

2025-11-29 18:02:37 +08:00
commit c1d9dee646
38 changed files with 11210 additions and 0 deletions
--- a/skills/phylo_from_buscos/scripts/download_ncbi_genomes.py
+++ b/skills/phylo_from_buscos/scripts/download_ncbi_genomes.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+"""
+Download genomes from NCBI using BioProject or Assembly accessions
+
+Usage:
+    python download_ncbi_genomes.py --bioprojects PRJNA12345 PRJEB67890
+    python download_ncbi_genomes.py --assemblies GCA_123456789.1 GCF_987654321.1
+
+Requires: ncbi-datasets-pylib (pip install ncbi-datasets-pylib)
+
+Author: Bruno de Medeiros (Field Museum)
+Based on tutorials by Paul Frandsen (BYU)
+"""
+
+import argparse
+import sys
+import subprocess
+
+
+def download_using_cli(accessions, output_file="genomes.zip"):
+    """
+    Download genomes using NCBI datasets CLI
+
+    Args:
+        accessions: List of BioProject or Assembly accessions
+        output_file: Name of output zip file
+    """
+    cmd = ["datasets", "download", "genome", "accession"] + accessions + ["--filename", output_file]
+
+    print(f"Running: {' '.join(cmd)}")
+    print("")
+
+    try:
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+        print(result.stdout)
+        print(f"\nDownload complete: {output_file}")
+        print("Extract with: unzip " + output_file)
+        return True
+    except subprocess.CalledProcessError as e:
+        print(f"Error downloading genomes: {e}", file=sys.stderr)
+        print(e.stderr, file=sys.stderr)
+        return False
+    except FileNotFoundError:
+        print("Error: 'datasets' command not found", file=sys.stderr)
+        print("Install with: conda install -c conda-forge ncbi-datasets-cli", file=sys.stderr)
+        return False
+
+
+def get_bioproject_assemblies(bioprojects):
+    """
+    Get assembly accessions for given BioProjects using Python API
+
+    Args:
+        bioprojects: List of BioProject accessions
+
+    Returns:
+        List of tuples (assembly_accession, organism_name)
+    """
+    try:
+        from ncbi.datasets.metadata.genome import get_assembly_metadata_by_bioproject_accessions
+    except ImportError:
+        print("Error: ncbi-datasets-pylib not installed", file=sys.stderr)
+        print("Install with: pip install ncbi-datasets-pylib", file=sys.stderr)
+        sys.exit(1)
+
+    assemblies = []
+
+    print(f"Fetching assembly information for {len(bioprojects)} BioProject(s)...")
+    print("")
+
+    for assembly in get_assembly_metadata_by_bioproject_accessions(bioprojects):
+        acc = assembly.accession
+        name = assembly.organism.organism_name
+        assemblies.append((acc, name))
+        print(f"  {name}: {acc}")
+
+    print(f"\nFound {len(assemblies)} assemblies")
+
+    return assemblies
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Download genomes from NCBI using BioProject or Assembly accessions"
+    )
+
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument(
+        "--bioprojects",
+        nargs="+",
+        help="BioProject accessions (e.g., PRJNA12345 PRJEB67890)"
+    )
+    group.add_argument(
+        "--assemblies",
+        nargs="+",
+        help="Assembly accessions (e.g., GCA_123456789.1 GCF_987654321.1)"
+    )
+
+    parser.add_argument(
+        "-o", "--output",
+        default="genomes.zip",
+        help="Output zip file name (default: genomes.zip)"
+    )
+
+    parser.add_argument(
+        "--list-only",
+        action="store_true",
+        help="List assemblies without downloading (BioProject mode only)"
+    )
+
+    args = parser.parse_args()
+
+    if args.bioprojects:
+        assemblies = get_bioproject_assemblies(args.bioprojects)
+
+        if args.list_only:
+            print("\nAssembly accessions (use with --assemblies to download):")
+            for acc, name in assemblies:
+                print(acc)
+            return
+
+        # Download assemblies
+        assembly_accs = [acc for acc, name in assemblies]
+        success = download_using_cli(assembly_accs, args.output)
+
+    elif args.assemblies:
+        success = download_using_cli(args.assemblies, args.output)
+
+    sys.exit(0 if success else 1)
+
+
+if __name__ == "__main__":
+    main()