Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/gene-database/scripts/query_gene.py
+++ b/skills/gene-database/scripts/query_gene.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+"""
+Query NCBI Gene database using E-utilities.
+
+This script provides access to ESearch, ESummary, and EFetch functions
+for searching and retrieving gene information.
+"""
+
+import argparse
+import json
+import sys
+import time
+import urllib.parse
+import urllib.request
+from typing import Optional, Dict, List, Any
+from xml.etree import ElementTree as ET
+
+
+BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"  # prefix joined with "<tool>.fcgi" to build request URLs
+DB = "gene"  # sent as the 'db' parameter on every request
+
+
+def esearch(query: str, retmax: int = 20, api_key: Optional[str] = None,
+            timeout: float = 30.0) -> List[str]:
+    """
+    Search NCBI Gene database and return list of Gene IDs.
+
+    Failures are reported on stderr and yield an empty list; nothing is raised.
+
+    Args:
+        query: Search query (e.g., "BRCA1[gene] AND human[organism]")
+        retmax: Maximum number of results to return
+        api_key: Optional NCBI API key for higher rate limits
+        timeout: Seconds to wait on the network before giving up
+
+    Returns:
+        List of Gene IDs as strings (empty on any error)
+    """
+    params = {'db': DB, 'term': query, 'retmax': retmax, 'retmode': 'json'}
+    if api_key:
+        params['api_key'] = api_key
+
+    url = f"{BASE_URL}esearch.fcgi?{urllib.parse.urlencode(params)}"
+
+    try:
+        # Without a timeout urlopen can block forever on a stalled connection.
+        with urllib.request.urlopen(url, timeout=timeout) as response:
+            data = json.loads(response.read().decode())
+    except urllib.error.HTTPError as e:
+        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
+        return []
+    except Exception as e:
+        # Deliberately broad: URL, socket, decode and JSON errors all end here.
+        print(f"Error: {e}", file=sys.stderr)
+        return []
+
+    result = data.get('esearchresult') if isinstance(data, dict) else None
+    if isinstance(result, dict) and 'idlist' in result:
+        return result['idlist']
+
+    print("Error: Unexpected response format", file=sys.stderr)
+    return []
+
+
+def esummary(gene_ids: List[str], api_key: Optional[str] = None,
+             timeout: float = 30.0) -> Dict[str, Any]:
+    """
+    Get document summaries for Gene IDs.
+
+    Args:
+        gene_ids: List of Gene IDs (blank entries are ignored)
+        api_key: Optional NCBI API key
+        timeout: Seconds to wait on the network before giving up
+
+    Returns:
+        Dictionary of gene summaries, or {} on error or when no IDs are given
+    """
+    ids = [gid.strip() for gid in gene_ids if gid and gid.strip()]
+    if not ids:
+        # Nothing to look up, so skip the request instead of sending an empty id.
+        print("Error: No gene IDs provided", file=sys.stderr)
+        return {}
+
+    params = {'db': DB, 'id': ','.join(ids), 'retmode': 'json'}
+    if api_key:
+        params['api_key'] = api_key
+    url = f"{BASE_URL}esummary.fcgi?{urllib.parse.urlencode(params)}"
+    try:
+        with urllib.request.urlopen(url, timeout=timeout) as response:
+            return json.loads(response.read().decode())
+    except urllib.error.HTTPError as e:
+        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
+        return {}
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return {}
+
+
+def efetch(gene_ids: List[str], retmode: str = 'xml', api_key: Optional[str] = None,
+           timeout: float = 30.0) -> str:
+    """
+    Fetch full gene records.
+
+    Args:
+        gene_ids: List of Gene IDs (blank entries are ignored)
+        retmode: Return format ('xml', 'text', 'asn.1')
+        api_key: Optional NCBI API key
+        timeout: Seconds to wait on the network before giving up
+
+    Returns:
+        Gene records as string in requested format, or "" on error or when no IDs are given
+    """
+    ids = [gid.strip() for gid in gene_ids if gid and gid.strip()]
+    if not ids:  # nothing to fetch, so skip the request instead of sending an empty id
+        print("Error: No gene IDs provided", file=sys.stderr)
+        return ""
+
+    params = {'db': DB, 'id': ','.join(ids), 'retmode': retmode}
+    if api_key:
+        params['api_key'] = api_key
+    url = f"{BASE_URL}efetch.fcgi?{urllib.parse.urlencode(params)}"
+    try:
+        with urllib.request.urlopen(url, timeout=timeout) as response:
+            return response.read().decode()
+    except urllib.error.HTTPError as e:
+        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
+        return ""
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return ""
+
+
+def search_and_summarize(query: str, organism: Optional[str] = None,
+                        max_results: int = 20, api_key: Optional[str] = None) -> None:
+    """
+    Search for genes and display summaries.
+
+    Args:
+        query: Gene search query
+        organism: Optional organism filter (skipped if query has an [organism] tag)
+        max_results: Maximum number of results
+        api_key: Optional NCBI API key
+    """
+    if organism and '[organism]' not in query.lower():
+        query = f"{query} AND {organism}[organism]"
+
+    print(f"Searching for: {query}")
+    print("-" * 80)
+
+    gene_ids = esearch(query, retmax=max_results, api_key=api_key)
+    # Pause after each request, not once at the end, to keep to ~3 requests/second.
+    time.sleep(0.34)
+    if not gene_ids:
+        print("No results found.")
+        return
+
+    print(f"Found {len(gene_ids)} gene(s)")
+    print()
+
+    summaries = esummary(gene_ids, api_key=api_key)
+    time.sleep(0.34)
+    records = summaries.get('result') if isinstance(summaries, dict) else None
+    if not isinstance(records, dict):
+        # Otherwise a failed or result-less lookup would end with no explanation.
+        print("Error: No summaries returned", file=sys.stderr)
+        return
+
+    for gene_id in gene_ids:
+        gene = records.get(gene_id)
+        if gene is None:
+            continue
+        print(f"Gene ID: {gene_id}")
+        print(f"  Symbol: {gene.get('name', 'N/A')}")
+        print(f"  Description: {gene.get('description', 'N/A')}")
+        print(f"  Organism: {gene.get('organism', {}).get('scientificname', 'N/A')}")
+        print(f"  Chromosome: {gene.get('chromosome', 'N/A')}")
+        print(f"  Map Location: {gene.get('maplocation', 'N/A')}")
+        print(f"  Type: {gene.get('geneticsource', 'N/A')}")
+        print()
+
+
+def fetch_by_id(gene_ids: List[str], output_format: str = 'json',
+                api_key: Optional[str] = None) -> None:
+    """
+    Print gene records for the given IDs to stdout.
+
+    'json' prints the esummary response pretty-printed with a 2-space
+    indent; any other format is handed to efetch as its retmode and the
+    returned text is printed as-is.
+
+    Args:
+        gene_ids: List of Gene IDs
+        output_format: Output format ('json', 'xml', 'text')
+        api_key: Optional NCBI API key
+    """
+    wants_json = output_format == 'json'
+    if not wants_json:
+        rendered = efetch(gene_ids, retmode=output_format, api_key=api_key)
+    else:
+        rendered = json.dumps(esummary(gene_ids, api_key=api_key), indent=2)
+    print(rendered)
+    time.sleep(0.34)  # fixed pause after the request, as a rate-limit courtesy
+
+
+def main():
+    """Parse command-line arguments and run either an ID fetch or a search."""
+    parser = argparse.ArgumentParser(
+        description='Query NCBI Gene database using E-utilities',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Search for gene by symbol
+  %(prog)s --search "BRCA1" --organism "human"
+
+  # Fetch gene by ID
+  %(prog)s --id 672 --format json
+
+  # Complex search query
+  %(prog)s --search "insulin[gene] AND diabetes[disease]"
+
+  # Multiple gene IDs
+  %(prog)s --id 672,7157,5594
+        """
+    )
+
+    parser.add_argument('--search', '-s', help='Search query')
+    parser.add_argument('--organism', '-o', help='Organism filter')
+    parser.add_argument('--id', '-i', help='Gene ID(s), comma-separated')
+    parser.add_argument('--format', '-f', default='json', choices=['json', 'xml', 'text'],
+                       help='Output format (default: json)')
+    parser.add_argument('--max-results', '-m', type=int, default=20,
+                       help='Maximum number of search results (default: 20)')
+    parser.add_argument('--api-key', '-k', help='NCBI API key for higher rate limits')
+
+    args = parser.parse_args()
+    if not args.search and not args.id:
+        parser.error("Either --search or --id must be provided")
+
+    if args.id:
+        # Drop blanks left by stray commas (e.g. "672,,7157") before querying.
+        gene_ids = [part.strip() for part in args.id.split(',') if part.strip()]
+        if not gene_ids:
+            parser.error("--id must contain at least one Gene ID")
+        fetch_by_id(gene_ids, output_format=args.format, api_key=args.api_key)
+    else:
+        search_and_summarize(args.search, organism=args.organism,
+                           max_results=args.max_results, api_key=args.api_key)
+
+
+if __name__ == '__main__':  # run the CLI only when executed as a script, not on import
+    main()