Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/gene-database/scripts/batch_gene_lookup.py
+++ b/skills/gene-database/scripts/batch_gene_lookup.py
@@ -0,0 +1,298 @@
+#!/usr/bin/env python3
+"""
+Batch gene lookup using NCBI APIs.
+
+This script efficiently processes multiple gene queries with proper
+rate limiting and error handling.
+"""
+
+import argparse
+import json
+import sys
+import time
+import urllib.parse
+import urllib.request
+from typing import Optional, List, Dict, Any
+
+
+def read_gene_list(filepath: str) -> List[str]:
+    """
+    Read gene identifiers from a file (one per line).
+
+    Args:
+        filepath: Path to file containing gene symbols or IDs
+
+    Returns:
+        List of gene identifiers
+    """
+    try:
+        with open(filepath, 'r') as f:
+            genes = [line.strip() for line in f if line.strip()]
+        return genes
+    except FileNotFoundError:
+        print(f"Error: File '{filepath}' not found", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(f"Error reading file: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+def batch_esearch(queries: List[str], organism: Optional[str] = None,
+                  api_key: Optional[str] = None) -> Dict[str, str]:
+    """
+    Search for multiple gene symbols and return their IDs.
+
+    Args:
+        queries: List of gene symbols
+        organism: Optional organism filter
+        api_key: Optional NCBI API key
+
+    Returns:
+        Dictionary mapping gene symbol to Gene ID (or 'NOT_FOUND')
+    """
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+    results = {}
+
+    # Rate limiting
+    delay = 0.1 if api_key else 0.34  # 10 req/sec with key, 3 req/sec without
+
+    for query in queries:
+        # Build search term
+        search_term = f"{query}[gene]"
+        if organism:
+            search_term += f" AND {organism}[organism]"
+
+        params = {
+            'db': 'gene',
+            'term': search_term,
+            'retmax': 1,
+            'retmode': 'json'
+        }
+
+        if api_key:
+            params['api_key'] = api_key
+
+        url = f"{base_url}esearch.fcgi?{urllib.parse.urlencode(params)}"
+
+        try:
+            with urllib.request.urlopen(url) as response:
+                data = json.loads(response.read().decode())
+
+            if 'esearchresult' in data and 'idlist' in data['esearchresult']:
+                id_list = data['esearchresult']['idlist']
+                results[query] = id_list[0] if id_list else 'NOT_FOUND'
+            else:
+                results[query] = 'ERROR'
+
+        except Exception as e:
+            print(f"Error searching for {query}: {e}", file=sys.stderr)
+            results[query] = 'ERROR'
+
+        time.sleep(delay)
+
+    return results
+
+
+def batch_esummary(gene_ids: List[str], api_key: Optional[str] = None,
+                   chunk_size: int = 200) -> Dict[str, Dict[str, Any]]:
+    """
+    Get summaries for multiple genes in batches.
+
+    Args:
+        gene_ids: List of Gene IDs
+        api_key: Optional NCBI API key
+        chunk_size: Number of IDs per request (max 500)
+
+    Returns:
+        Dictionary mapping Gene ID to summary data
+    """
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+    all_results = {}
+
+    # Rate limiting
+    delay = 0.1 if api_key else 0.34
+
+    # Process in chunks
+    for i in range(0, len(gene_ids), chunk_size):
+        chunk = gene_ids[i:i + chunk_size]
+
+        params = {
+            'db': 'gene',
+            'id': ','.join(chunk),
+            'retmode': 'json'
+        }
+
+        if api_key:
+            params['api_key'] = api_key
+
+        url = f"{base_url}esummary.fcgi?{urllib.parse.urlencode(params)}"
+
+        try:
+            with urllib.request.urlopen(url) as response:
+                data = json.loads(response.read().decode())
+
+            if 'result' in data:
+                for gene_id in chunk:
+                    if gene_id in data['result']:
+                        all_results[gene_id] = data['result'][gene_id]
+
+        except Exception as e:
+            print(f"Error fetching summaries for chunk: {e}", file=sys.stderr)
+
+        time.sleep(delay)
+
+    return all_results
+
+
+def batch_lookup_by_ids(gene_ids: List[str], api_key: Optional[str] = None) -> List[Dict[str, Any]]:
+    """
+    Lookup genes by IDs and return structured data.
+
+    Args:
+        gene_ids: List of Gene IDs
+        api_key: Optional NCBI API key
+
+    Returns:
+        List of gene information dictionaries
+    """
+    summaries = batch_esummary(gene_ids, api_key=api_key)
+
+    results = []
+    for gene_id in gene_ids:
+        if gene_id in summaries:
+            gene = summaries[gene_id]
+            results.append({
+                'gene_id': gene_id,
+                'symbol': gene.get('name', 'N/A'),
+                'description': gene.get('description', 'N/A'),
+                'organism': gene.get('organism', {}).get('scientificname', 'N/A'),
+                'chromosome': gene.get('chromosome', 'N/A'),
+                'map_location': gene.get('maplocation', 'N/A'),
+                'type': gene.get('geneticsource', 'N/A')
+            })
+        else:
+            results.append({
+                'gene_id': gene_id,
+                'error': 'Not found or error fetching'
+            })
+
+    return results
+
+
+def batch_lookup_by_symbols(gene_symbols: List[str], organism: str,
+                            api_key: Optional[str] = None) -> List[Dict[str, Any]]:
+    """
+    Lookup genes by symbols and return structured data.
+
+    Args:
+        gene_symbols: List of gene symbols
+        organism: Organism name
+        api_key: Optional NCBI API key
+
+    Returns:
+        List of gene information dictionaries
+    """
+    # First, search for IDs
+    print(f"Searching for {len(gene_symbols)} gene symbols...", file=sys.stderr)
+    symbol_to_id = batch_esearch(gene_symbols, organism=organism, api_key=api_key)
+
+    # Filter to valid IDs
+    valid_ids = [id for id in symbol_to_id.values() if id not in ['NOT_FOUND', 'ERROR']]
+
+    if not valid_ids:
+        print("No genes found", file=sys.stderr)
+        return []
+
+    print(f"Found {len(valid_ids)} genes, fetching details...", file=sys.stderr)
+
+    # Fetch summaries
+    summaries = batch_esummary(valid_ids, api_key=api_key)
+
+    # Build results
+    results = []
+    for symbol, gene_id in symbol_to_id.items():
+        if gene_id == 'NOT_FOUND':
+            results.append({
+                'query_symbol': symbol,
+                'status': 'not_found'
+            })
+        elif gene_id == 'ERROR':
+            results.append({
+                'query_symbol': symbol,
+                'status': 'error'
+            })
+        elif gene_id in summaries:
+            gene = summaries[gene_id]
+            results.append({
+                'query_symbol': symbol,
+                'gene_id': gene_id,
+                'symbol': gene.get('name', 'N/A'),
+                'description': gene.get('description', 'N/A'),
+                'organism': gene.get('organism', {}).get('scientificname', 'N/A'),
+                'chromosome': gene.get('chromosome', 'N/A'),
+                'map_location': gene.get('maplocation', 'N/A'),
+                'type': gene.get('geneticsource', 'N/A')
+            })
+
+    return results
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Batch gene lookup using NCBI APIs',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Lookup by gene IDs
+  %(prog)s --ids 672,7157,5594
+
+  # Lookup by symbols from a file
+  %(prog)s --file genes.txt --organism human
+
+  # Lookup with API key and save to file
+  %(prog)s --ids 672,7157,5594 --api-key YOUR_KEY --output results.json
+        """
+    )
+
+    parser.add_argument('--ids', '-i', help='Comma-separated Gene IDs')
+    parser.add_argument('--file', '-f', help='File containing gene symbols (one per line)')
+    parser.add_argument('--organism', '-o', help='Organism name (required with --file)')
+    parser.add_argument('--output', '-O', help='Output file path (JSON format)')
+    parser.add_argument('--api-key', '-k', help='NCBI API key')
+    parser.add_argument('--pretty', '-p', action='store_true',
+                       help='Pretty-print JSON output')
+
+    args = parser.parse_args()
+
+    if not args.ids and not args.file:
+        parser.error("Either --ids or --file must be provided")
+
+    if args.file and not args.organism:
+        parser.error("--organism is required when using --file")
+
+    # Process genes
+    if args.ids:
+        gene_ids = [id.strip() for id in args.ids.split(',')]
+        results = batch_lookup_by_ids(gene_ids, api_key=args.api_key)
+    else:
+        gene_symbols = read_gene_list(args.file)
+        results = batch_lookup_by_symbols(gene_symbols, args.organism, api_key=args.api_key)
+
+    # Output results
+    indent = 2 if args.pretty else None
+    json_output = json.dumps(results, indent=indent)
+
+    if args.output:
+        try:
+            with open(args.output, 'w') as f:
+                f.write(json_output)
+            print(f"Results written to {args.output}", file=sys.stderr)
+        except Exception as e:
+            print(f"Error writing output file: {e}", file=sys.stderr)
+            sys.exit(1)
+    else:
+        print(json_output)
+
+
+if __name__ == '__main__':
+    main()