Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 18:02:37 +08:00
commit c1d9dee646
38 changed files with 11210 additions and 0 deletions

View File

@@ -0,0 +1,174 @@
#!/usr/bin/env python3
"""
Query NCBI for available genome assemblies by taxon name
Usage:
python query_ncbi_assemblies.py --taxon "Coleoptera"
python query_ncbi_assemblies.py --taxon "Drosophila" --max-results 50
python query_ncbi_assemblies.py --taxon "Apis" --refseq-only
Requires: ncbi-datasets-pylib (pip install ncbi-datasets-pylib)
Author: Bruno de Medeiros (Field Museum)
"""
import argparse
import sys
def query_assemblies_by_taxon(taxon, max_results=20, refseq_only=False):
"""
Query NCBI for genome assemblies of a given taxon
Args:
taxon: Taxon name (e.g., "Coleoptera", "Drosophila melanogaster")
max_results: Maximum number of results to return
refseq_only: If True, only return RefSeq assemblies (GCF_*)
Returns:
List of dictionaries with assembly information
"""
try:
from ncbi.datasets import GenomeApi
from ncbi.datasets.openapi import ApiClient, ApiException
except ImportError:
print("Error: ncbi-datasets-pylib not installed", file=sys.stderr)
print("Install with: pip install ncbi-datasets-pylib", file=sys.stderr)
sys.exit(1)
assemblies = []
print(f"Querying NCBI for '{taxon}' genome assemblies...")
print(f"(Limiting to {max_results} results)")
if refseq_only:
print("(RefSeq assemblies only)")
print("")
try:
with ApiClient() as api_client:
api = GenomeApi(api_client)
# Query genome assemblies for the taxon
genome_summary = api.genome_summary_by_taxon(
taxon=taxon,
limit=str(max_results),
filters_refseq_only=refseq_only
)
if not genome_summary.reports:
print(f"No assemblies found for taxon '{taxon}'")
return []
for report in genome_summary.reports:
assembly_info = {
'accession': report.accession,
'organism': report.organism.organism_name,
'assembly_level': report.assembly_info.assembly_level,
'assembly_name': report.assembly_info.assembly_name,
'submission_date': report.assembly_info.release_date if hasattr(report.assembly_info, 'release_date') else 'N/A'
}
assemblies.append(assembly_info)
except ApiException as e:
print(f"Error querying NCBI: {e}", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(f"Unexpected error: {e}", file=sys.stderr)
sys.exit(1)
return assemblies
def format_table(assemblies):
"""
Format assemblies as a readable table
Args:
assemblies: List of assembly dictionaries
"""
if not assemblies:
return
print(f"Found {len(assemblies)} assemblies:\n")
# Print header
print(f"{'#':<4} {'Accession':<20} {'Organism':<40} {'Level':<15} {'Assembly Name':<30}")
print("-" * 110)
# Print data rows
for i, asm in enumerate(assemblies, 1):
organism = asm['organism'][:38] + '..' if len(asm['organism']) > 40 else asm['organism']
assembly_name = asm['assembly_name'][:28] + '..' if len(asm['assembly_name']) > 30 else asm['assembly_name']
print(f"{i:<4} {asm['accession']:<20} {organism:<40} {asm['assembly_level']:<15} {assembly_name:<30}")
print("")
def save_accessions(assemblies, output_file):
"""
Save assembly accessions to a file
Args:
assemblies: List of assembly dictionaries
output_file: Output file path
"""
with open(output_file, 'w') as f:
for asm in assemblies:
f.write(f"{asm['accession']}\n")
print(f"Accessions saved to: {output_file}")
print(f"You can download these assemblies using:")
print(f" python download_ncbi_genomes.py --assemblies $(cat {output_file})")
def main():
parser = argparse.ArgumentParser(
description="Query NCBI for available genome assemblies by taxon name",
epilog="Example: python query_ncbi_assemblies.py --taxon 'Coleoptera' --max-results 50"
)
parser.add_argument(
"--taxon",
required=True,
help="Taxon name (e.g., 'Coleoptera', 'Drosophila melanogaster')"
)
parser.add_argument(
"--max-results",
type=int,
default=20,
help="Maximum number of results to return (default: 20)"
)
parser.add_argument(
"--refseq-only",
action="store_true",
help="Only return RefSeq assemblies (GCF_* accessions)"
)
parser.add_argument(
"--save",
metavar="FILE",
help="Save accessions to a file for later download"
)
args = parser.parse_args()
# Query NCBI
assemblies = query_assemblies_by_taxon(
taxon=args.taxon,
max_results=args.max_results,
refseq_only=args.refseq_only
)
# Display results
format_table(assemblies)
# Save if requested
if args.save and assemblies:
save_accessions(assemblies, args.save)
if __name__ == "__main__":
main()