Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/gene-database/scripts/fetch_gene_data.py
+++ b/skills/gene-database/scripts/fetch_gene_data.py
@@ -0,0 +1,277 @@
+#!/usr/bin/env python3
+"""
+Fetch gene data from NCBI using the Datasets API.
+
+This script provides access to the NCBI Datasets API for retrieving
+comprehensive gene information including metadata and sequences.
+"""
+
+import argparse
+import json
+import sys
+import urllib.parse
+import urllib.request
+from typing import Optional, Dict, Any, List
+
+
+DATASETS_API_BASE = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/gene"
+
+
+def get_taxon_id(taxon_name: str) -> Optional[str]:
+    """
+    Convert taxon name to NCBI taxon ID.
+
+    Args:
+        taxon_name: Common or scientific name (e.g., "human", "Homo sapiens")
+
+    Returns:
+        Taxon ID as string, or None if not found
+    """
+    # Common mappings
+    common_taxa = {
+        'human': '9606',
+        'homo sapiens': '9606',
+        'mouse': '10090',
+        'mus musculus': '10090',
+        'rat': '10116',
+        'rattus norvegicus': '10116',
+        'zebrafish': '7955',
+        'danio rerio': '7955',
+        'fruit fly': '7227',
+        'drosophila melanogaster': '7227',
+        'c. elegans': '6239',
+        'caenorhabditis elegans': '6239',
+        'yeast': '4932',
+        'saccharomyces cerevisiae': '4932',
+        'arabidopsis': '3702',
+        'arabidopsis thaliana': '3702',
+        'e. coli': '562',
+        'escherichia coli': '562',
+    }
+
+    taxon_lower = taxon_name.lower().strip()
+    return common_taxa.get(taxon_lower)
+
+
+def fetch_gene_by_id(gene_id: str, api_key: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Fetch gene data by Gene ID.
+
+    Args:
+        gene_id: NCBI Gene ID
+        api_key: Optional NCBI API key
+
+    Returns:
+        Gene data as dictionary
+    """
+    url = f"{DATASETS_API_BASE}/id/{gene_id}"
+
+    headers = {}
+    if api_key:
+        headers['api-key'] = api_key
+
+    try:
+        req = urllib.request.Request(url, headers=headers)
+        with urllib.request.urlopen(req) as response:
+            return json.loads(response.read().decode())
+    except urllib.error.HTTPError as e:
+        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
+        if e.code == 404:
+            print(f"Gene ID {gene_id} not found", file=sys.stderr)
+        return {}
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return {}
+
+
+def fetch_gene_by_symbol(symbol: str, taxon: str, api_key: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Fetch gene data by gene symbol and taxon.
+
+    Args:
+        symbol: Gene symbol (e.g., "BRCA1")
+        taxon: Organism name or taxon ID
+        api_key: Optional NCBI API key
+
+    Returns:
+        Gene data as dictionary
+    """
+    # Convert taxon name to ID if needed
+    taxon_id = get_taxon_id(taxon)
+    if not taxon_id:
+        # Try to use as-is (might already be a taxon ID)
+        taxon_id = taxon
+
+    url = f"{DATASETS_API_BASE}/symbol/{symbol}/taxon/{taxon_id}"
+
+    headers = {}
+    if api_key:
+        headers['api-key'] = api_key
+
+    try:
+        req = urllib.request.Request(url, headers=headers)
+        with urllib.request.urlopen(req) as response:
+            return json.loads(response.read().decode())
+    except urllib.error.HTTPError as e:
+        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
+        if e.code == 404:
+            print(f"Gene symbol '{symbol}' not found for taxon {taxon}", file=sys.stderr)
+        return {}
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return {}
+
+
+def fetch_multiple_genes(gene_ids: List[str], api_key: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Fetch data for multiple genes by ID.
+
+    Args:
+        gene_ids: List of Gene IDs
+        api_key: Optional NCBI API key
+
+    Returns:
+        Combined gene data as dictionary
+    """
+    # For multiple genes, use POST request
+    url = f"{DATASETS_API_BASE}/id"
+
+    data = json.dumps({"gene_ids": gene_ids}).encode('utf-8')
+    headers = {'Content-Type': 'application/json'}
+
+    if api_key:
+        headers['api-key'] = api_key
+
+    try:
+        req = urllib.request.Request(url, data=data, headers=headers, method='POST')
+        with urllib.request.urlopen(req) as response:
+            return json.loads(response.read().decode())
+    except urllib.error.HTTPError as e:
+        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
+        return {}
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return {}
+
+
+def display_gene_info(data: Dict[str, Any], verbose: bool = False) -> None:
+    """
+    Display gene information in human-readable format.
+
+    Args:
+        data: Gene data dictionary from API
+        verbose: Show detailed information
+    """
+    if 'genes' not in data:
+        print("No gene data found in response")
+        return
+
+    for gene in data['genes']:
+        gene_info = gene.get('gene', {})
+
+        print(f"Gene ID: {gene_info.get('gene_id', 'N/A')}")
+        print(f"Symbol: {gene_info.get('symbol', 'N/A')}")
+        print(f"Description: {gene_info.get('description', 'N/A')}")
+
+        if 'tax_name' in gene_info:
+            print(f"Organism: {gene_info['tax_name']}")
+
+        if 'chromosomes' in gene_info:
+            chromosomes = ', '.join(gene_info['chromosomes'])
+            print(f"Chromosome(s): {chromosomes}")
+
+        # Nomenclature
+        if 'nomenclature_authority' in gene_info:
+            auth = gene_info['nomenclature_authority']
+            print(f"Nomenclature: {auth.get('authority', 'N/A')}")
+
+        # Synonyms
+        if 'synonyms' in gene_info and gene_info['synonyms']:
+            print(f"Synonyms: {', '.join(gene_info['synonyms'])}")
+
+        if verbose:
+            # Gene type
+            if 'type' in gene_info:
+                print(f"Type: {gene_info['type']}")
+
+            # Genomic locations
+            if 'genomic_ranges' in gene_info:
+                print("\nGenomic Locations:")
+                for range_info in gene_info['genomic_ranges']:
+                    accession = range_info.get('accession_version', 'N/A')
+                    start = range_info.get('range', [{}])[0].get('begin', 'N/A')
+                    end = range_info.get('range', [{}])[0].get('end', 'N/A')
+                    strand = range_info.get('orientation', 'N/A')
+                    print(f"  {accession}: {start}-{end} ({strand})")
+
+            # Transcripts
+            if 'transcripts' in gene_info:
+                print(f"\nTranscripts: {len(gene_info['transcripts'])}")
+                for transcript in gene_info['transcripts'][:5]:  # Show first 5
+                    print(f"  {transcript.get('accession_version', 'N/A')}")
+
+        print()
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Fetch gene data from NCBI Datasets API',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Fetch by Gene ID
+  %(prog)s --gene-id 672
+
+  # Fetch by gene symbol and organism
+  %(prog)s --symbol BRCA1 --taxon human
+
+  # Fetch multiple genes
+  %(prog)s --gene-id 672,7157,5594
+
+  # Get JSON output
+  %(prog)s --symbol TP53 --taxon "Homo sapiens" --output json
+
+  # Verbose output with details
+  %(prog)s --gene-id 672 --verbose
+        """
+    )
+
+    parser.add_argument('--gene-id', '-g', help='Gene ID(s), comma-separated')
+    parser.add_argument('--symbol', '-s', help='Gene symbol')
+    parser.add_argument('--taxon', '-t', help='Organism name or taxon ID (required with --symbol)')
+    parser.add_argument('--output', '-o', choices=['pretty', 'json'], default='pretty',
+                       help='Output format (default: pretty)')
+    parser.add_argument('--verbose', '-v', action='store_true',
+                       help='Show detailed information')
+    parser.add_argument('--api-key', '-k', help='NCBI API key')
+
+    args = parser.parse_args()
+
+    if not args.gene_id and not args.symbol:
+        parser.error("Either --gene-id or --symbol must be provided")
+
+    if args.symbol and not args.taxon:
+        parser.error("--taxon is required when using --symbol")
+
+    # Fetch data
+    if args.gene_id:
+        gene_ids = [id.strip() for id in args.gene_id.split(',')]
+        if len(gene_ids) == 1:
+            data = fetch_gene_by_id(gene_ids[0], api_key=args.api_key)
+        else:
+            data = fetch_multiple_genes(gene_ids, api_key=args.api_key)
+    else:
+        data = fetch_gene_by_symbol(args.symbol, args.taxon, api_key=args.api_key)
+
+    if not data:
+        sys.exit(1)
+
+    # Output
+    if args.output == 'json':
+        print(json.dumps(data, indent=2))
+    else:
+        display_gene_info(data, verbose=args.verbose)
+
+
+if __name__ == '__main__':
+    main()