Initial commit
This commit is contained in:
174
skills/phylo_from_buscos/scripts/query_ncbi_assemblies.py
Executable file
174
skills/phylo_from_buscos/scripts/query_ncbi_assemblies.py
Executable file
@@ -0,0 +1,174 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Query NCBI for available genome assemblies by taxon name
|
||||
|
||||
Usage:
|
||||
python query_ncbi_assemblies.py --taxon "Coleoptera"
|
||||
python query_ncbi_assemblies.py --taxon "Drosophila" --max-results 50
|
||||
python query_ncbi_assemblies.py --taxon "Apis" --refseq-only
|
||||
|
||||
Requires: ncbi-datasets-pylib (pip install ncbi-datasets-pylib)
|
||||
|
||||
Author: Bruno de Medeiros (Field Museum)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
|
||||
def query_assemblies_by_taxon(taxon, max_results=20, refseq_only=False):
|
||||
"""
|
||||
Query NCBI for genome assemblies of a given taxon
|
||||
|
||||
Args:
|
||||
taxon: Taxon name (e.g., "Coleoptera", "Drosophila melanogaster")
|
||||
max_results: Maximum number of results to return
|
||||
refseq_only: If True, only return RefSeq assemblies (GCF_*)
|
||||
|
||||
Returns:
|
||||
List of dictionaries with assembly information
|
||||
"""
|
||||
try:
|
||||
from ncbi.datasets import GenomeApi
|
||||
from ncbi.datasets.openapi import ApiClient, ApiException
|
||||
except ImportError:
|
||||
print("Error: ncbi-datasets-pylib not installed", file=sys.stderr)
|
||||
print("Install with: pip install ncbi-datasets-pylib", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
assemblies = []
|
||||
|
||||
print(f"Querying NCBI for '{taxon}' genome assemblies...")
|
||||
print(f"(Limiting to {max_results} results)")
|
||||
if refseq_only:
|
||||
print("(RefSeq assemblies only)")
|
||||
print("")
|
||||
|
||||
try:
|
||||
with ApiClient() as api_client:
|
||||
api = GenomeApi(api_client)
|
||||
|
||||
# Query genome assemblies for the taxon
|
||||
genome_summary = api.genome_summary_by_taxon(
|
||||
taxon=taxon,
|
||||
limit=str(max_results),
|
||||
filters_refseq_only=refseq_only
|
||||
)
|
||||
|
||||
if not genome_summary.reports:
|
||||
print(f"No assemblies found for taxon '{taxon}'")
|
||||
return []
|
||||
|
||||
for report in genome_summary.reports:
|
||||
assembly_info = {
|
||||
'accession': report.accession,
|
||||
'organism': report.organism.organism_name,
|
||||
'assembly_level': report.assembly_info.assembly_level,
|
||||
'assembly_name': report.assembly_info.assembly_name,
|
||||
'submission_date': report.assembly_info.release_date if hasattr(report.assembly_info, 'release_date') else 'N/A'
|
||||
}
|
||||
assemblies.append(assembly_info)
|
||||
|
||||
except ApiException as e:
|
||||
print(f"Error querying NCBI: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"Unexpected error: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
return assemblies
|
||||
|
||||
|
||||
def format_table(assemblies):
|
||||
"""
|
||||
Format assemblies as a readable table
|
||||
|
||||
Args:
|
||||
assemblies: List of assembly dictionaries
|
||||
"""
|
||||
if not assemblies:
|
||||
return
|
||||
|
||||
print(f"Found {len(assemblies)} assemblies:\n")
|
||||
|
||||
# Print header
|
||||
print(f"{'#':<4} {'Accession':<20} {'Organism':<40} {'Level':<15} {'Assembly Name':<30}")
|
||||
print("-" * 110)
|
||||
|
||||
# Print data rows
|
||||
for i, asm in enumerate(assemblies, 1):
|
||||
organism = asm['organism'][:38] + '..' if len(asm['organism']) > 40 else asm['organism']
|
||||
assembly_name = asm['assembly_name'][:28] + '..' if len(asm['assembly_name']) > 30 else asm['assembly_name']
|
||||
|
||||
print(f"{i:<4} {asm['accession']:<20} {organism:<40} {asm['assembly_level']:<15} {assembly_name:<30}")
|
||||
|
||||
print("")
|
||||
|
||||
|
||||
def save_accessions(assemblies, output_file):
|
||||
"""
|
||||
Save assembly accessions to a file
|
||||
|
||||
Args:
|
||||
assemblies: List of assembly dictionaries
|
||||
output_file: Output file path
|
||||
"""
|
||||
with open(output_file, 'w') as f:
|
||||
for asm in assemblies:
|
||||
f.write(f"{asm['accession']}\n")
|
||||
|
||||
print(f"Accessions saved to: {output_file}")
|
||||
print(f"You can download these assemblies using:")
|
||||
print(f" python download_ncbi_genomes.py --assemblies $(cat {output_file})")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Query NCBI for available genome assemblies by taxon name",
|
||||
epilog="Example: python query_ncbi_assemblies.py --taxon 'Coleoptera' --max-results 50"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--taxon",
|
||||
required=True,
|
||||
help="Taxon name (e.g., 'Coleoptera', 'Drosophila melanogaster')"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--max-results",
|
||||
type=int,
|
||||
default=20,
|
||||
help="Maximum number of results to return (default: 20)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--refseq-only",
|
||||
action="store_true",
|
||||
help="Only return RefSeq assemblies (GCF_* accessions)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--save",
|
||||
metavar="FILE",
|
||||
help="Save accessions to a file for later download"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Query NCBI
|
||||
assemblies = query_assemblies_by_taxon(
|
||||
taxon=args.taxon,
|
||||
max_results=args.max_results,
|
||||
refseq_only=args.refseq_only
|
||||
)
|
||||
|
||||
# Display results
|
||||
format_table(assemblies)
|
||||
|
||||
# Save if requested
|
||||
if args.save and assemblies:
|
||||
save_accessions(assemblies, args.save)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user