Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/gene-database/scripts/batch_gene_lookup.py
+++ b/skills/gene-database/scripts/batch_gene_lookup.py
@@ -0,0 +1,298 @@
+#!/usr/bin/env python3
+"""
+Batch gene lookup using NCBI APIs.
+
+This script efficiently processes multiple gene queries with proper
+rate limiting and error handling.
+"""
+
+import argparse
+import json
+import sys
+import time
+import urllib.parse
+import urllib.request
+from typing import Optional, List, Dict, Any
+
+
+def read_gene_list(filepath: str) -> List[str]:
+    """
+    Load gene identifiers from a text file holding one identifier per line.
+
+    Surrounding whitespace is removed from every line and lines that are
+    empty after stripping are skipped.
+
+    Args:
+        filepath: Path to file containing gene symbols or IDs
+
+    Returns:
+        List of gene identifiers in file order
+
+    Exits the process with status 1 (after printing a message to stderr)
+    when the file is missing or cannot be read.
+    """
+    identifiers: List[str] = []
+    try:
+        with open(filepath, 'r') as handle:
+            for raw_line in handle:
+                cleaned = raw_line.strip()
+                if cleaned:
+                    identifiers.append(cleaned)
+    except FileNotFoundError:
+        print(f"Error: File '{filepath}' not found", file=sys.stderr)
+        sys.exit(1)
+    except Exception as exc:
+        print(f"Error reading file: {exc}", file=sys.stderr)
+        sys.exit(1)
+    return identifiers
+
+
+def batch_esearch(queries: List[str], organism: Optional[str] = None,
+                  api_key: Optional[str] = None,
+                  timeout: float = 30.0) -> Dict[str, str]:
+    """
+    Search for multiple gene symbols and return their IDs.
+
+    One ESearch request is sent per symbol, with a pause after each request
+    to stay inside the NCBI rate limits. A symbol that appears again in
+    `queries` is only re-queried if its earlier attempt ended in 'ERROR'.
+
+    Args:
+        queries: List of gene symbols
+        organism: Optional organism filter
+        api_key: Optional NCBI API key
+        timeout: Seconds to wait for each HTTP request before giving up
+
+    Returns:
+        Dictionary mapping gene symbol to Gene ID, 'NOT_FOUND' when the
+        search returned no hits, or 'ERROR' when the request failed or the
+        response had an unexpected shape
+    """
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+    results: Dict[str, str] = {}
+
+    # Rate limiting
+    delay = 0.1 if api_key else 0.34  # 10 req/sec with key, 3 req/sec without
+
+    for query in queries:
+        # An already-resolved symbol would only overwrite its own entry, so
+        # skip the redundant request; failed lookups still get another try.
+        if results.get(query, 'ERROR') != 'ERROR':
+            continue
+
+        # Build search term
+        search_term = f"{query}[gene]"
+        if organism:
+            search_term += f" AND {organism}[organism]"
+
+        params = {
+            'db': 'gene',
+            'term': search_term,
+            'retmax': 1,
+            'retmode': 'json'
+        }
+
+        if api_key:
+            params['api_key'] = api_key
+
+        url = f"{base_url}esearch.fcgi?{urllib.parse.urlencode(params)}"
+
+        try:
+            # Without a timeout a single stalled connection would block the
+            # whole batch indefinitely.
+            with urllib.request.urlopen(url, timeout=timeout) as response:
+                data = json.loads(response.read().decode())
+
+            if 'esearchresult' in data and 'idlist' in data['esearchresult']:
+                id_list = data['esearchresult']['idlist']
+                results[query] = id_list[0] if id_list else 'NOT_FOUND'
+            else:
+                results[query] = 'ERROR'
+
+        except Exception as e:
+            # Best effort: record the failure and carry on with the rest
+            print(f"Error searching for {query}: {e}", file=sys.stderr)
+            results[query] = 'ERROR'
+
+        time.sleep(delay)
+
+    return results
+
+
+def batch_esummary(gene_ids: List[str], api_key: Optional[str] = None,
+                   chunk_size: int = 200,
+                   timeout: float = 30.0) -> Dict[str, Dict[str, Any]]:
+    """
+    Get summaries for multiple genes in batches.
+
+    The IDs are split into chunks and one ESummary request is sent per
+    chunk, with a pause after each request. A chunk whose request fails is
+    reported on stderr and skipped, so its IDs are simply absent from the
+    returned mapping.
+
+    Args:
+        gene_ids: List of Gene IDs
+        api_key: Optional NCBI API key
+        chunk_size: Number of IDs per request (max 500; larger values are
+            reduced to 500)
+        timeout: Seconds to wait for each HTTP request before giving up
+
+    Returns:
+        Dictionary mapping Gene ID to summary data
+
+    Raises:
+        ValueError: If chunk_size is smaller than 1
+    """
+    # A zero step makes range() fail with an opaque message and a negative
+    # step silently yields no chunks at all, so reject both explicitly.
+    if chunk_size < 1:
+        raise ValueError("chunk_size must be a positive integer")
+    chunk_size = min(chunk_size, 500)
+
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+    all_results: Dict[str, Dict[str, Any]] = {}
+
+    # Rate limiting
+    delay = 0.1 if api_key else 0.34
+
+    # Process in chunks
+    for i in range(0, len(gene_ids), chunk_size):
+        chunk = gene_ids[i:i + chunk_size]
+
+        params = {
+            'db': 'gene',
+            'id': ','.join(chunk),
+            'retmode': 'json'
+        }
+
+        if api_key:
+            params['api_key'] = api_key
+
+        url = f"{base_url}esummary.fcgi?{urllib.parse.urlencode(params)}"
+
+        try:
+            # Without a timeout a stalled connection would hang the batch
+            with urllib.request.urlopen(url, timeout=timeout) as response:
+                data = json.loads(response.read().decode())
+
+            if 'result' in data:
+                for gene_id in chunk:
+                    if gene_id in data['result']:
+                        all_results[gene_id] = data['result'][gene_id]
+
+        except Exception as e:
+            # Best effort: keep going with the remaining chunks
+            print(f"Error fetching summaries for chunk: {e}", file=sys.stderr)
+
+        time.sleep(delay)
+
+    return all_results
+
+
+def batch_lookup_by_ids(gene_ids: List[str], api_key: Optional[str] = None) -> List[Dict[str, Any]]:
+    """
+    Lookup genes by IDs and return structured data.
+
+    Args:
+        gene_ids: List of Gene IDs
+        api_key: Optional NCBI API key
+
+    Returns:
+        List of gene information dictionaries, one per entry of `gene_ids`
+        and in the same order. IDs without a usable summary yield a
+        dictionary holding 'gene_id' and 'error' instead of gene fields.
+    """
+    summaries = batch_esummary(gene_ids, api_key=api_key)
+
+    results = []
+    for gene_id in gene_ids:
+        gene = summaries.get(gene_id)
+
+        # A summary entry may itself carry an 'error' field instead of gene
+        # data; report it as a failure rather than a gene made of 'N/A's.
+        if not isinstance(gene, dict) or 'error' in gene:
+            results.append({
+                'gene_id': gene_id,
+                'error': 'Not found or error fetching'
+            })
+            continue
+
+        results.append({
+            'gene_id': gene_id,
+            'symbol': gene.get('name', 'N/A'),
+            'description': gene.get('description', 'N/A'),
+            'organism': gene.get('organism', {}).get('scientificname', 'N/A'),
+            'chromosome': gene.get('chromosome', 'N/A'),
+            'map_location': gene.get('maplocation', 'N/A'),
+            'type': gene.get('geneticsource', 'N/A')
+        })
+
+    return results
+
+
+def batch_lookup_by_symbols(gene_symbols: List[str], organism: str,
+                            api_key: Optional[str] = None) -> List[Dict[str, Any]]:
+    """
+    Lookup genes by symbols and return structured data.
+
+    Symbols are first resolved to Gene IDs, then the summaries of all
+    resolved IDs are fetched. Progress messages go to stderr.
+
+    Args:
+        gene_symbols: List of gene symbols
+        organism: Organism name
+        api_key: Optional NCBI API key
+
+    Returns:
+        List of gene information dictionaries, one per symbol returned by
+        the search step, each carrying 'query_symbol'. Symbols that could
+        not be resolved or summarized carry a 'status' of 'not_found' or
+        'error' instead of gene fields. The list is empty when no symbol
+        resolved to a Gene ID.
+    """
+    # First, search for IDs
+    print(f"Searching for {len(gene_symbols)} gene symbols...", file=sys.stderr)
+    symbol_to_id = batch_esearch(gene_symbols, organism=organism, api_key=api_key)
+
+    # Keep only real Gene IDs, de-duplicated in first-seen order so that two
+    # symbols resolving to the same gene do not request it twice
+    valid_ids = list(dict.fromkeys(
+        gene_id for gene_id in symbol_to_id.values()
+        if gene_id not in ('NOT_FOUND', 'ERROR')
+    ))
+
+    if not valid_ids:
+        print("No genes found", file=sys.stderr)
+        return []
+
+    print(f"Found {len(valid_ids)} genes, fetching details...", file=sys.stderr)
+
+    # Fetch summaries
+    summaries = batch_esummary(valid_ids, api_key=api_key)
+
+    # Build results
+    results = []
+    for symbol, gene_id in symbol_to_id.items():
+        if gene_id == 'NOT_FOUND':
+            results.append({
+                'query_symbol': symbol,
+                'status': 'not_found'
+            })
+            continue
+
+        if gene_id == 'ERROR':
+            results.append({
+                'query_symbol': symbol,
+                'status': 'error'
+            })
+            continue
+
+        gene = summaries.get(gene_id)
+        if not isinstance(gene, dict) or 'error' in gene:
+            # The ID resolved but its summary is missing or unusable; report
+            # that instead of silently dropping the symbol from the output.
+            results.append({
+                'query_symbol': symbol,
+                'gene_id': gene_id,
+                'status': 'error'
+            })
+            continue
+
+        results.append({
+            'query_symbol': symbol,
+            'gene_id': gene_id,
+            'symbol': gene.get('name', 'N/A'),
+            'description': gene.get('description', 'N/A'),
+            'organism': gene.get('organism', {}).get('scientificname', 'N/A'),
+            'chromosome': gene.get('chromosome', 'N/A'),
+            'map_location': gene.get('maplocation', 'N/A'),
+            'type': gene.get('geneticsource', 'N/A')
+        })
+
+    return results
+
+
+def main():
+    """
+    Command-line entry point for batch gene lookup.
+
+    Parses the arguments, looks genes up either by the IDs given in --ids
+    or by the symbols listed in --file (--ids wins when both are given),
+    and emits the results as JSON on stdout or into the file named by
+    --output. Exits with status 1 when the output file cannot be written.
+    """
+    parser = argparse.ArgumentParser(
+        description='Batch gene lookup using NCBI APIs',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Lookup by gene IDs
+  %(prog)s --ids 672,7157,5594
+
+  # Lookup by symbols from a file
+  %(prog)s --file genes.txt --organism human
+
+  # Lookup with API key and save to file
+  %(prog)s --ids 672,7157,5594 --api-key YOUR_KEY --output results.json
+        """
+    )
+
+    parser.add_argument('--ids', '-i', help='Comma-separated Gene IDs')
+    parser.add_argument('--file', '-f', help='File containing gene symbols (one per line)')
+    parser.add_argument('--organism', '-o', help='Organism name (required with --file)')
+    parser.add_argument('--output', '-O', help='Output file path (JSON format)')
+    parser.add_argument('--api-key', '-k', help='NCBI API key')
+    parser.add_argument('--pretty', '-p', action='store_true',
+                       help='Pretty-print JSON output')
+
+    args = parser.parse_args()
+
+    if not args.ids and not args.file:
+        parser.error("Either --ids or --file must be provided")
+
+    if args.file and not args.organism:
+        parser.error("--organism is required when using --file")
+
+    # Process genes
+    if args.ids:
+        # Drop empty tokens so input such as "672,,7157" or a trailing comma
+        # does not send blank Gene IDs to the API
+        gene_ids = [token.strip() for token in args.ids.split(',') if token.strip()]
+        if not gene_ids:
+            parser.error("--ids must contain at least one Gene ID")
+        results = batch_lookup_by_ids(gene_ids, api_key=args.api_key)
+    else:
+        gene_symbols = read_gene_list(args.file)
+        results = batch_lookup_by_symbols(gene_symbols, args.organism, api_key=args.api_key)
+
+    # Output results
+    indent = 2 if args.pretty else None
+    json_output = json.dumps(results, indent=indent)
+
+    if args.output:
+        try:
+            with open(args.output, 'w') as f:
+                f.write(json_output)
+            print(f"Results written to {args.output}", file=sys.stderr)
+        except Exception as e:
+            print(f"Error writing output file: {e}", file=sys.stderr)
+            sys.exit(1)
+    else:
+        print(json_output)
+
+
+# Run the command-line interface only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/skills/gene-database/scripts/fetch_gene_data.py
+++ b/skills/gene-database/scripts/fetch_gene_data.py
@@ -0,0 +1,277 @@
+#!/usr/bin/env python3
+"""
+Fetch gene data from NCBI using the Datasets API.
+
+This script provides access to the NCBI Datasets API for retrieving
+comprehensive gene information including metadata and sequences.
+"""
+
+import argparse
+import json
+import sys
+import urllib.parse
+import urllib.request
+from typing import Optional, Dict, Any, List
+
+
+# Common URL prefix of the NCBI Datasets gene endpoints; the fetch functions
+# below append their own path segments to it.
+DATASETS_API_BASE = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/gene"
+
+
+def get_taxon_id(taxon_name: str) -> Optional[str]:
+    """
+    Translate an organism name into its NCBI taxon ID.
+
+    Only a fixed table of well-known organisms is consulted; matching
+    ignores letter case and surrounding whitespace.
+
+    Args:
+        taxon_name: Common or scientific name (e.g., "human", "Homo sapiens")
+
+    Returns:
+        Taxon ID as string, or None if the name is not in the table
+    """
+    # Each entry pairs a taxon ID with the names accepted for it
+    known_taxa = (
+        ('9606', ('human', 'homo sapiens')),
+        ('10090', ('mouse', 'mus musculus')),
+        ('10116', ('rat', 'rattus norvegicus')),
+        ('7955', ('zebrafish', 'danio rerio')),
+        ('7227', ('fruit fly', 'drosophila melanogaster')),
+        ('6239', ('c. elegans', 'caenorhabditis elegans')),
+        ('4932', ('yeast', 'saccharomyces cerevisiae')),
+        ('3702', ('arabidopsis', 'arabidopsis thaliana')),
+        ('562', ('e. coli', 'escherichia coli')),
+    )
+
+    lowered = taxon_name.lower()
+    wanted = lowered.strip()
+
+    for taxon_id, accepted_names in known_taxa:
+        if wanted in accepted_names:
+            return taxon_id
+    return None
+
+
+def fetch_gene_by_id(gene_id: str, api_key: Optional[str] = None,
+                     timeout: float = 30.0) -> Dict[str, Any]:
+    """
+    Fetch gene data by Gene ID.
+
+    Args:
+        gene_id: NCBI Gene ID
+        api_key: Optional NCBI API key
+        timeout: Seconds to wait for the HTTP request before giving up
+
+    Returns:
+        Gene data as dictionary; an empty dictionary when the request
+        fails (the failure is reported on stderr)
+    """
+    # Escape the ID so stray spaces or slashes cannot corrupt the URL path;
+    # commas are left alone.
+    safe_id = urllib.parse.quote(str(gene_id), safe=',')
+    url = f"{DATASETS_API_BASE}/id/{safe_id}"
+
+    headers = {}
+    if api_key:
+        headers['api-key'] = api_key
+
+    try:
+        req = urllib.request.Request(url, headers=headers)
+        # Without a timeout a stalled connection would block forever
+        with urllib.request.urlopen(req, timeout=timeout) as response:
+            return json.loads(response.read().decode())
+    except urllib.error.HTTPError as e:
+        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
+        if e.code == 404:
+            print(f"Gene ID {gene_id} not found", file=sys.stderr)
+        return {}
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return {}
+
+
+def fetch_gene_by_symbol(symbol: str, taxon: str, api_key: Optional[str] = None,
+                         timeout: float = 30.0) -> Dict[str, Any]:
+    """
+    Fetch gene data by gene symbol and taxon.
+
+    Args:
+        symbol: Gene symbol (e.g., "BRCA1")
+        taxon: Organism name or taxon ID
+        api_key: Optional NCBI API key
+        timeout: Seconds to wait for the HTTP request before giving up
+
+    Returns:
+        Gene data as dictionary; an empty dictionary when the request
+        fails (the failure is reported on stderr)
+    """
+    # Convert taxon name to ID if needed
+    taxon_id = get_taxon_id(taxon)
+    if not taxon_id:
+        # Try to use as-is (might already be a taxon ID)
+        taxon_id = taxon
+
+    # Escape both path segments: an organism name outside the built-in table
+    # (e.g. one containing a space) would otherwise yield an invalid URL.
+    safe_symbol = urllib.parse.quote(str(symbol), safe='')
+    safe_taxon = urllib.parse.quote(str(taxon_id), safe='')
+    url = f"{DATASETS_API_BASE}/symbol/{safe_symbol}/taxon/{safe_taxon}"
+
+    headers = {}
+    if api_key:
+        headers['api-key'] = api_key
+
+    try:
+        req = urllib.request.Request(url, headers=headers)
+        # Without a timeout a stalled connection would block forever
+        with urllib.request.urlopen(req, timeout=timeout) as response:
+            return json.loads(response.read().decode())
+    except urllib.error.HTTPError as e:
+        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
+        if e.code == 404:
+            print(f"Gene symbol '{symbol}' not found for taxon {taxon}", file=sys.stderr)
+        return {}
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return {}
+
+
+def fetch_multiple_genes(gene_ids: List[str], api_key: Optional[str] = None,
+                         timeout: float = 30.0) -> Dict[str, Any]:
+    """
+    Fetch data for multiple genes by ID.
+
+    The IDs are sent as a JSON body in a single POST request.
+
+    Args:
+        gene_ids: List of Gene IDs
+        api_key: Optional NCBI API key
+        timeout: Seconds to wait for the HTTP request before giving up
+
+    Returns:
+        Combined gene data as dictionary; an empty dictionary when no IDs
+        were given or the request fails (failures are reported on stderr)
+    """
+    # Nothing to ask for: skip the pointless round trip
+    if not gene_ids:
+        print("Error: no gene IDs provided", file=sys.stderr)
+        return {}
+
+    # For multiple genes, use POST request
+    url = f"{DATASETS_API_BASE}/id"
+
+    data = json.dumps({"gene_ids": gene_ids}).encode('utf-8')
+    headers = {'Content-Type': 'application/json'}
+
+    if api_key:
+        headers['api-key'] = api_key
+
+    try:
+        req = urllib.request.Request(url, data=data, headers=headers, method='POST')
+        # Without a timeout a stalled connection would block forever
+        with urllib.request.urlopen(req, timeout=timeout) as response:
+            return json.loads(response.read().decode())
+    except urllib.error.HTTPError as e:
+        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
+        return {}
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return {}
+
+
+def display_gene_info(data: Dict[str, Any], verbose: bool = False) -> None:
+    """
+    Display gene information in human-readable format.
+
+    Prints one section per entry of data['genes'], each followed by a blank
+    line. Prints a notice and returns when the 'genes' key is absent.
+
+    Args:
+        data: Gene data dictionary from API
+        verbose: Also show gene type, genomic locations and up to five
+            transcript accessions
+    """
+    if 'genes' not in data:
+        print("No gene data found in response")
+        return
+
+    for gene in data['genes']:
+        gene_info = gene.get('gene', {})
+
+        print(f"Gene ID: {gene_info.get('gene_id', 'N/A')}")
+        print(f"Symbol: {gene_info.get('symbol', 'N/A')}")
+        print(f"Description: {gene_info.get('description', 'N/A')}")
+
+        if 'tax_name' in gene_info:
+            print(f"Organism: {gene_info['tax_name']}")
+
+        if 'chromosomes' in gene_info:
+            chromosomes = ', '.join(gene_info['chromosomes'])
+            print(f"Chromosome(s): {chromosomes}")
+
+        # Nomenclature
+        if 'nomenclature_authority' in gene_info:
+            auth = gene_info['nomenclature_authority']
+            print(f"Nomenclature: {auth.get('authority', 'N/A')}")
+
+        # Synonyms
+        if 'synonyms' in gene_info and gene_info['synonyms']:
+            print(f"Synonyms: {', '.join(gene_info['synonyms'])}")
+
+        if verbose:
+            # Gene type
+            if 'type' in gene_info:
+                print(f"Type: {gene_info['type']}")
+
+            # Genomic locations
+            if 'genomic_ranges' in gene_info:
+                print("\nGenomic Locations:")
+                for range_info in gene_info['genomic_ranges']:
+                    accession = range_info.get('accession_version', 'N/A')
+                    # Only the first range is shown. `or` also covers a
+                    # 'range' key that is present but empty, which would
+                    # otherwise raise IndexError on [0].
+                    first_range = (range_info.get('range') or [{}])[0]
+                    start = first_range.get('begin', 'N/A')
+                    end = first_range.get('end', 'N/A')
+                    strand = range_info.get('orientation', 'N/A')
+                    print(f"  {accession}: {start}-{end} ({strand})")
+
+            # Transcripts
+            if 'transcripts' in gene_info:
+                print(f"\nTranscripts: {len(gene_info['transcripts'])}")
+                for transcript in gene_info['transcripts'][:5]:  # Show first 5
+                    print(f"  {transcript.get('accession_version', 'N/A')}")
+
+        print()
+
+
+def main():
+    """
+    Command-line entry point for fetching gene data.
+
+    Parses the arguments, fetches by Gene ID (one request for a single ID,
+    a batch request for several) or by symbol plus taxon, and prints the
+    result either as JSON or in human-readable form. Exits with status 1
+    when the fetch returned no data.
+    """
+    parser = argparse.ArgumentParser(
+        description='Fetch gene data from NCBI Datasets API',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Fetch by Gene ID
+  %(prog)s --gene-id 672
+
+  # Fetch by gene symbol and organism
+  %(prog)s --symbol BRCA1 --taxon human
+
+  # Fetch multiple genes
+  %(prog)s --gene-id 672,7157,5594
+
+  # Get JSON output
+  %(prog)s --symbol TP53 --taxon "Homo sapiens" --output json
+
+  # Verbose output with details
+  %(prog)s --gene-id 672 --verbose
+        """
+    )
+
+    parser.add_argument('--gene-id', '-g', help='Gene ID(s), comma-separated')
+    parser.add_argument('--symbol', '-s', help='Gene symbol')
+    parser.add_argument('--taxon', '-t', help='Organism name or taxon ID (required with --symbol)')
+    parser.add_argument('--output', '-o', choices=['pretty', 'json'], default='pretty',
+                       help='Output format (default: pretty)')
+    parser.add_argument('--verbose', '-v', action='store_true',
+                       help='Show detailed information')
+    parser.add_argument('--api-key', '-k', help='NCBI API key')
+
+    args = parser.parse_args()
+
+    if not args.gene_id and not args.symbol:
+        parser.error("Either --gene-id or --symbol must be provided")
+
+    if args.symbol and not args.taxon:
+        parser.error("--taxon is required when using --symbol")
+
+    # Fetch data
+    if args.gene_id:
+        # Drop empty tokens so "672," or "672,,7157" does not turn into a
+        # batch request containing blank IDs
+        gene_ids = [token.strip() for token in args.gene_id.split(',') if token.strip()]
+        if not gene_ids:
+            parser.error("--gene-id must contain at least one Gene ID")
+        if len(gene_ids) == 1:
+            data = fetch_gene_by_id(gene_ids[0], api_key=args.api_key)
+        else:
+            data = fetch_multiple_genes(gene_ids, api_key=args.api_key)
+    else:
+        data = fetch_gene_by_symbol(args.symbol, args.taxon, api_key=args.api_key)
+
+    # The fetch helpers return an empty dictionary on failure
+    if not data:
+        sys.exit(1)
+
+    # Output
+    if args.output == 'json':
+        print(json.dumps(data, indent=2))
+    else:
+        display_gene_info(data, verbose=args.verbose)
+
+
+# Run the command-line interface only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/skills/gene-database/scripts/query_gene.py
+++ b/skills/gene-database/scripts/query_gene.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+"""
+Query NCBI Gene database using E-utilities.
+
+This script provides access to ESearch, ESummary, and EFetch functions
+for searching and retrieving gene information.
+"""
+
+import argparse
+import json
+import sys
+import time
+import urllib.parse
+import urllib.request
+from typing import Optional, Dict, List, Any
+from xml.etree import ElementTree as ET
+
+
+# URL prefix shared by the esearch/esummary/efetch requests built below.
+BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+# Value sent as the 'db' parameter of every request in this script.
+DB = "gene"
+
+
+def esearch(query: str, retmax: int = 20, api_key: Optional[str] = None,
+            timeout: float = 30.0) -> List[str]:
+    """
+    Search NCBI Gene database and return list of Gene IDs.
+
+    Args:
+        query: Search query (e.g., "BRCA1[gene] AND human[organism]")
+        retmax: Maximum number of results to return
+        api_key: Optional NCBI API key for higher rate limits
+        timeout: Seconds to wait for the HTTP request before giving up
+
+    Returns:
+        List of Gene IDs as strings; an empty list when nothing matched,
+        the request failed, or the response had an unexpected shape
+        (failures are reported on stderr)
+    """
+    params = {
+        'db': DB,
+        'term': query,
+        'retmax': retmax,
+        'retmode': 'json'
+    }
+
+    if api_key:
+        params['api_key'] = api_key
+
+    url = f"{BASE_URL}esearch.fcgi?{urllib.parse.urlencode(params)}"
+
+    try:
+        # Without a timeout a stalled connection would block forever
+        with urllib.request.urlopen(url, timeout=timeout) as response:
+            data = json.loads(response.read().decode())
+
+        if 'esearchresult' in data and 'idlist' in data['esearchresult']:
+            return data['esearchresult']['idlist']
+
+        print("Error: Unexpected response format", file=sys.stderr)
+        return []
+
+    except urllib.error.HTTPError as e:
+        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
+        return []
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return []
+
+
+def esummary(gene_ids: List[str], api_key: Optional[str] = None,
+             timeout: float = 30.0) -> Dict[str, Any]:
+    """
+    Get document summaries for Gene IDs.
+
+    Args:
+        gene_ids: List of Gene IDs
+        api_key: Optional NCBI API key
+        timeout: Seconds to wait for the HTTP request before giving up
+
+    Returns:
+        The decoded JSON response as a dictionary; an empty dictionary
+        when no IDs were given or the request fails (failures are reported
+        on stderr)
+    """
+    # Nothing to look up: avoid sending a request with an empty 'id' value
+    if not gene_ids:
+        return {}
+
+    params = {
+        'db': DB,
+        'id': ','.join(gene_ids),
+        'retmode': 'json'
+    }
+
+    if api_key:
+        params['api_key'] = api_key
+
+    url = f"{BASE_URL}esummary.fcgi?{urllib.parse.urlencode(params)}"
+
+    try:
+        # Without a timeout a stalled connection would block forever
+        with urllib.request.urlopen(url, timeout=timeout) as response:
+            data = json.loads(response.read().decode())
+        return data
+    except urllib.error.HTTPError as e:
+        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
+        return {}
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return {}
+
+
+def efetch(gene_ids: List[str], retmode: str = 'xml', api_key: Optional[str] = None,
+           timeout: float = 30.0) -> str:
+    """
+    Fetch full gene records.
+
+    Args:
+        gene_ids: List of Gene IDs
+        retmode: Return format ('xml', 'text', 'asn.1')
+        api_key: Optional NCBI API key
+        timeout: Seconds to wait for the HTTP request before giving up
+
+    Returns:
+        Gene records as string in requested format; an empty string when
+        no IDs were given or the request fails (failures are reported on
+        stderr)
+    """
+    # Nothing to fetch: avoid sending a request with an empty 'id' value
+    if not gene_ids:
+        return ""
+
+    params = {
+        'db': DB,
+        'id': ','.join(gene_ids),
+        'retmode': retmode
+    }
+
+    if api_key:
+        params['api_key'] = api_key
+
+    url = f"{BASE_URL}efetch.fcgi?{urllib.parse.urlencode(params)}"
+
+    try:
+        # Without a timeout a stalled connection would block forever
+        with urllib.request.urlopen(url, timeout=timeout) as response:
+            return response.read().decode()
+    except urllib.error.HTTPError as e:
+        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
+        return ""
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return ""
+
+
+def search_and_summarize(query: str, organism: Optional[str] = None,
+                        max_results: int = 20, api_key: Optional[str] = None) -> None:
+    """
+    Run a gene search and print a short summary of every hit.
+
+    When an organism is given and the query does not already contain an
+    "[organism]" tag, the organism is appended to the query as a filter.
+
+    Args:
+        query: Gene search query
+        organism: Optional organism filter
+        max_results: Maximum number of results
+        api_key: Optional NCBI API key
+    """
+    if organism and '[organism]' not in query.lower():
+        query = f"{query} AND {organism}[organism]"
+
+    print(f"Searching for: {query}")
+    print("-" * 80)
+
+    # Step 1: resolve the query to Gene IDs
+    found_ids = esearch(query, retmax=max_results, api_key=api_key)
+    if not found_ids:
+        print("No results found.")
+        return
+
+    print(f"Found {len(found_ids)} gene(s)")
+    print()
+
+    # Step 2: fetch the summaries and print those that came back
+    response = esummary(found_ids, api_key=api_key)
+    if 'result' in response:
+        records = response['result']
+        for gid in found_ids:
+            if gid not in records:
+                continue
+            record = records[gid]
+            report = [
+                f"Gene ID: {gid}",
+                f"  Symbol: {record.get('name', 'N/A')}",
+                f"  Description: {record.get('description', 'N/A')}",
+                f"  Organism: {record.get('organism', {}).get('scientificname', 'N/A')}",
+                f"  Chromosome: {record.get('chromosome', 'N/A')}",
+                f"  Map Location: {record.get('maplocation', 'N/A')}",
+                f"  Type: {record.get('geneticsource', 'N/A')}",
+            ]
+            print("\n".join(report))
+            print()
+
+    # Respect rate limits
+    time.sleep(0.34)  # ~3 requests per second
+
+
+def fetch_by_id(gene_ids: List[str], output_format: str = 'json',
+                api_key: Optional[str] = None) -> None:
+    """
+    Print gene information for the given IDs.
+
+    The 'json' format prints the summaries as indented JSON; any other
+    format prints the full records fetched in that format.
+
+    Args:
+        gene_ids: List of Gene IDs
+        output_format: Output format ('json', 'xml', 'text')
+        api_key: Optional NCBI API key
+    """
+    if output_format != 'json':
+        # Full records, passed through in the requested format
+        print(efetch(gene_ids, retmode=output_format, api_key=api_key))
+    else:
+        # Summaries, pretty-printed as JSON
+        print(json.dumps(esummary(gene_ids, api_key=api_key), indent=2))
+
+    # Respect rate limits
+    time.sleep(0.34)
+
+
+def main():
+    """
+    Command-line entry point for querying the Gene database.
+
+    Parses the arguments and either fetches records for the IDs given in
+    --id (which wins when --search is also given) or runs a search and
+    prints summaries of the hits.
+    """
+    parser = argparse.ArgumentParser(
+        description='Query NCBI Gene database using E-utilities',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Search for gene by symbol
+  %(prog)s --search "BRCA1" --organism "human"
+
+  # Fetch gene by ID
+  %(prog)s --id 672 --format json
+
+  # Complex search query
+  %(prog)s --search "insulin[gene] AND diabetes[disease]"
+
+  # Multiple gene IDs
+  %(prog)s --id 672,7157,5594
+        """
+    )
+
+    parser.add_argument('--search', '-s', help='Search query')
+    parser.add_argument('--organism', '-o', help='Organism filter')
+    parser.add_argument('--id', '-i', help='Gene ID(s), comma-separated')
+    parser.add_argument('--format', '-f', default='json',
+                       choices=['json', 'xml', 'text'],
+                       help='Output format (default: json)')
+    parser.add_argument('--max-results', '-m', type=int, default=20,
+                       help='Maximum number of search results (default: 20)')
+    parser.add_argument('--api-key', '-k', help='NCBI API key for higher rate limits')
+
+    args = parser.parse_args()
+
+    if not args.search and not args.id:
+        parser.error("Either --search or --id must be provided")
+
+    if args.id:
+        # Fetch by ID; drop empty tokens so "672," or "672,,7157" does not
+        # send blank IDs to the API
+        gene_ids = [token.strip() for token in args.id.split(',') if token.strip()]
+        if not gene_ids:
+            parser.error("--id must contain at least one Gene ID")
+        fetch_by_id(gene_ids, output_format=args.format, api_key=args.api_key)
+    else:
+        # Search and summarize
+        search_and_summarize(args.search, organism=args.organism,
+                           max_results=args.max_results, api_key=args.api_key)
+
+
+# Run the command-line interface only when this file is executed as a script.
+if __name__ == '__main__':
+    main()