175 lines
5.0 KiB
Python
Executable File
175 lines
5.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Query NCBI for available genome assemblies by taxon name
|
|
|
|
Usage:
|
|
python query_ncbi_assemblies.py --taxon "Coleoptera"
|
|
python query_ncbi_assemblies.py --taxon "Drosophila" --max-results 50
|
|
python query_ncbi_assemblies.py --taxon "Apis" --refseq-only
|
|
|
|
Requires: ncbi-datasets-pylib (pip install ncbi-datasets-pylib)
|
|
|
|
Author: Bruno de Medeiros (Field Museum)
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
|
|
|
|
def query_assemblies_by_taxon(taxon, max_results=20, refseq_only=False):
|
|
"""
|
|
Query NCBI for genome assemblies of a given taxon
|
|
|
|
Args:
|
|
taxon: Taxon name (e.g., "Coleoptera", "Drosophila melanogaster")
|
|
max_results: Maximum number of results to return
|
|
refseq_only: If True, only return RefSeq assemblies (GCF_*)
|
|
|
|
Returns:
|
|
List of dictionaries with assembly information
|
|
"""
|
|
try:
|
|
from ncbi.datasets import GenomeApi
|
|
from ncbi.datasets.openapi import ApiClient, ApiException
|
|
except ImportError:
|
|
print("Error: ncbi-datasets-pylib not installed", file=sys.stderr)
|
|
print("Install with: pip install ncbi-datasets-pylib", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
assemblies = []
|
|
|
|
print(f"Querying NCBI for '{taxon}' genome assemblies...")
|
|
print(f"(Limiting to {max_results} results)")
|
|
if refseq_only:
|
|
print("(RefSeq assemblies only)")
|
|
print("")
|
|
|
|
try:
|
|
with ApiClient() as api_client:
|
|
api = GenomeApi(api_client)
|
|
|
|
# Query genome assemblies for the taxon
|
|
genome_summary = api.genome_summary_by_taxon(
|
|
taxon=taxon,
|
|
limit=str(max_results),
|
|
filters_refseq_only=refseq_only
|
|
)
|
|
|
|
if not genome_summary.reports:
|
|
print(f"No assemblies found for taxon '{taxon}'")
|
|
return []
|
|
|
|
for report in genome_summary.reports:
|
|
assembly_info = {
|
|
'accession': report.accession,
|
|
'organism': report.organism.organism_name,
|
|
'assembly_level': report.assembly_info.assembly_level,
|
|
'assembly_name': report.assembly_info.assembly_name,
|
|
'submission_date': report.assembly_info.release_date if hasattr(report.assembly_info, 'release_date') else 'N/A'
|
|
}
|
|
assemblies.append(assembly_info)
|
|
|
|
except ApiException as e:
|
|
print(f"Error querying NCBI: {e}", file=sys.stderr)
|
|
sys.exit(1)
|
|
except Exception as e:
|
|
print(f"Unexpected error: {e}", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
return assemblies
|
|
|
|
|
|
def format_table(assemblies):
|
|
"""
|
|
Format assemblies as a readable table
|
|
|
|
Args:
|
|
assemblies: List of assembly dictionaries
|
|
"""
|
|
if not assemblies:
|
|
return
|
|
|
|
print(f"Found {len(assemblies)} assemblies:\n")
|
|
|
|
# Print header
|
|
print(f"{'#':<4} {'Accession':<20} {'Organism':<40} {'Level':<15} {'Assembly Name':<30}")
|
|
print("-" * 110)
|
|
|
|
# Print data rows
|
|
for i, asm in enumerate(assemblies, 1):
|
|
organism = asm['organism'][:38] + '..' if len(asm['organism']) > 40 else asm['organism']
|
|
assembly_name = asm['assembly_name'][:28] + '..' if len(asm['assembly_name']) > 30 else asm['assembly_name']
|
|
|
|
print(f"{i:<4} {asm['accession']:<20} {organism:<40} {asm['assembly_level']:<15} {assembly_name:<30}")
|
|
|
|
print("")
|
|
|
|
|
|
def save_accessions(assemblies, output_file):
|
|
"""
|
|
Save assembly accessions to a file
|
|
|
|
Args:
|
|
assemblies: List of assembly dictionaries
|
|
output_file: Output file path
|
|
"""
|
|
with open(output_file, 'w') as f:
|
|
for asm in assemblies:
|
|
f.write(f"{asm['accession']}\n")
|
|
|
|
print(f"Accessions saved to: {output_file}")
|
|
print(f"You can download these assemblies using:")
|
|
print(f" python download_ncbi_genomes.py --assemblies $(cat {output_file})")
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description="Query NCBI for available genome assemblies by taxon name",
|
|
epilog="Example: python query_ncbi_assemblies.py --taxon 'Coleoptera' --max-results 50"
|
|
)
|
|
|
|
parser.add_argument(
|
|
"--taxon",
|
|
required=True,
|
|
help="Taxon name (e.g., 'Coleoptera', 'Drosophila melanogaster')"
|
|
)
|
|
|
|
parser.add_argument(
|
|
"--max-results",
|
|
type=int,
|
|
default=20,
|
|
help="Maximum number of results to return (default: 20)"
|
|
)
|
|
|
|
parser.add_argument(
|
|
"--refseq-only",
|
|
action="store_true",
|
|
help="Only return RefSeq assemblies (GCF_* accessions)"
|
|
)
|
|
|
|
parser.add_argument(
|
|
"--save",
|
|
metavar="FILE",
|
|
help="Save accessions to a file for later download"
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
# Query NCBI
|
|
assemblies = query_assemblies_by_taxon(
|
|
taxon=args.taxon,
|
|
max_results=args.max_results,
|
|
refseq_only=args.refseq_only
|
|
)
|
|
|
|
# Display results
|
|
format_table(assemblies)
|
|
|
|
# Save if requested
|
|
if args.save and assemblies:
|
|
save_accessions(assemblies, args.save)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|