Initial commit
skills/citation-management/scripts/search_pubmed.py (398 additions, executable file)
@@ -0,0 +1,398 @@
#!/usr/bin/env python3
"""
PubMed Search Tool

Search PubMed using the NCBI E-utilities API and export results.
"""

import argparse
import json
import os
import re
import sys
import time
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Dict, List, Optional

import requests

class PubMedSearcher:
    """Search PubMed using the NCBI E-utilities API."""

    def __init__(self, api_key: Optional[str] = None, email: Optional[str] = None):
        """
        Initialize the searcher.

        Args:
            api_key: NCBI API key (optional but recommended)
            email: Email for Entrez (optional but recommended)
        """
        self.api_key = api_key or os.getenv('NCBI_API_KEY', '')
        self.email = email or os.getenv('NCBI_EMAIL', '')
        self.base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
        self.session = requests.Session()

        # Rate limiting: 10 requests/sec with an API key, 3/sec without
        self.delay = 0.11 if self.api_key else 0.34
    def search(self, query: str, max_results: int = 100,
               date_start: Optional[str] = None, date_end: Optional[str] = None,
               publication_types: Optional[List[str]] = None) -> List[str]:
        """
        Search PubMed and return PMIDs.

        Args:
            query: Search query
            max_results: Maximum number of results
            date_start: Start date (YYYY/MM/DD or YYYY)
            date_end: End date (YYYY/MM/DD or YYYY)
            publication_types: List of publication types to filter

        Returns:
            List of PMIDs
        """
        # Build query with filters
        full_query = query

        # Add date range
        if date_start or date_end:
            start = date_start or '1900'
            end = date_end or datetime.now().strftime('%Y')
            full_query += f' AND {start}:{end}[Publication Date]'

        # Add publication types
        if publication_types:
            pub_type_query = ' OR '.join([f'"{pt}"[Publication Type]' for pt in publication_types])
            full_query += f' AND ({pub_type_query})'

        print(f'Searching PubMed: {full_query}', file=sys.stderr)

        # ESearch to get PMIDs
        esearch_url = self.base_url + 'esearch.fcgi'
        params = {
            'db': 'pubmed',
            'term': full_query,
            'retmax': max_results,
            'retmode': 'json'
        }

        if self.email:
            params['email'] = self.email
        if self.api_key:
            params['api_key'] = self.api_key

        try:
            response = self.session.get(esearch_url, params=params, timeout=30)
            response.raise_for_status()

            data = response.json()
            pmids = data['esearchresult']['idlist']
            count = int(data['esearchresult']['count'])

            print(f'Found {count} results, retrieving {len(pmids)}', file=sys.stderr)

            return pmids

        except Exception as e:
            print(f'Error searching PubMed: {e}', file=sys.stderr)
            return []
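    # For reference, a minimal sketch of the esearch JSON payload consumed above
    # (field names per the NCBI E-utilities docs; the values are illustrative):
    #
    #   {"esearchresult": {"count": "2", "retmax": "2", "retstart": "0",
    #                      "idlist": ["38000001", "38000002"]}}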
    def fetch_metadata(self, pmids: List[str]) -> List[Dict]:
        """
        Fetch metadata for PMIDs.

        Args:
            pmids: List of PubMed IDs

        Returns:
            List of metadata dictionaries
        """
        if not pmids:
            return []

        metadata_list = []

        # Fetch in batches of 200
        batch_size = 200
        for i in range(0, len(pmids), batch_size):
            batch = pmids[i:i+batch_size]
            print(f'Fetching metadata for PMIDs {i+1}-{min(i+batch_size, len(pmids))}...', file=sys.stderr)

            efetch_url = self.base_url + 'efetch.fcgi'
            params = {
                'db': 'pubmed',
                'id': ','.join(batch),
                'retmode': 'xml',
                'rettype': 'abstract'
            }

            if self.email:
                params['email'] = self.email
            if self.api_key:
                params['api_key'] = self.api_key

            try:
                response = self.session.get(efetch_url, params=params, timeout=60)
                response.raise_for_status()

                # Parse XML
                root = ET.fromstring(response.content)
                articles = root.findall('.//PubmedArticle')

                for article in articles:
                    metadata = self._extract_metadata_from_xml(article)
                    if metadata:
                        metadata_list.append(metadata)

                # Rate limiting
                time.sleep(self.delay)

            except Exception as e:
                print(f'Error fetching metadata for batch: {e}', file=sys.stderr)
                continue

        return metadata_list
    def _extract_metadata_from_xml(self, article: ET.Element) -> Optional[Dict]:
        """Extract metadata from a PubmedArticle XML element."""
        try:
            medline_citation = article.find('.//MedlineCitation')
            article_elem = medline_citation.find('.//Article')
            journal = article_elem.find('.//Journal')

            # Get PMID
            pmid = medline_citation.findtext('.//PMID', '')

            # Get DOI
            doi = None
            article_ids = article.findall('.//ArticleId')
            for article_id in article_ids:
                if article_id.get('IdType') == 'doi':
                    doi = article_id.text
                    break

            # Get authors
            authors = []
            author_list = article_elem.find('.//AuthorList')
            if author_list is not None:
                for author in author_list.findall('.//Author'):
                    last_name = author.findtext('.//LastName', '')
                    fore_name = author.findtext('.//ForeName', '')
                    if last_name:
                        if fore_name:
                            authors.append(f'{last_name}, {fore_name}')
                        else:
                            authors.append(last_name)

            # Get year; fall back to MedlineDate (e.g. "1998 Dec-1999 Jan")
            year = article_elem.findtext('.//Journal/JournalIssue/PubDate/Year', '')
            if not year:
                medline_date = article_elem.findtext('.//Journal/JournalIssue/PubDate/MedlineDate', '')
                if medline_date:
                    year_match = re.search(r'\d{4}', medline_date)
                    if year_match:
                        year = year_match.group()

            metadata = {
                'pmid': pmid,
                'doi': doi,
                'title': article_elem.findtext('.//ArticleTitle', ''),
                'authors': ' and '.join(authors),
                'journal': journal.findtext('.//Title', ''),
                'year': year,
                'volume': journal.findtext('.//JournalIssue/Volume', ''),
                'issue': journal.findtext('.//JournalIssue/Issue', ''),
                'pages': article_elem.findtext('.//Pagination/MedlinePgn', ''),
                'abstract': article_elem.findtext('.//Abstract/AbstractText', '')
            }

            return metadata

        except Exception as e:
            print(f'Error extracting metadata: {e}', file=sys.stderr)
            return None
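    # A successfully parsed article yields a dict with the keys built above,
    # e.g. (illustrative values):
    #   {'pmid': '12345678', 'doi': '10.1000/example', 'title': '...',
    #    'authors': 'Smith, Jane and Doe, John', 'journal': 'Example Journal',
    #    'year': '2021', 'volume': '12', 'issue': '3', 'pages': '2115-24',
    #    'abstract': '...'}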
    def metadata_to_bibtex(self, metadata: Dict) -> str:
        """Convert metadata to BibTeX format."""
        # Generate citation key from first author's last name, year, and PMID
        if metadata.get('authors'):
            first_author = metadata['authors'].split(' and ')[0]
            if ',' in first_author:
                last_name = first_author.split(',')[0].strip()
            else:
                last_name = first_author.split()[0]
        else:
            last_name = 'Unknown'

        year = metadata.get('year', 'XXXX')
        citation_key = f'{last_name}{year}pmid{metadata.get("pmid", "")}'

        # Build BibTeX entry
        lines = [f'@article{{{citation_key},']

        if metadata.get('authors'):
            lines.append(f' author = {{{metadata["authors"]}}},')

        if metadata.get('title'):
            lines.append(f' title = {{{metadata["title"]}}},')

        if metadata.get('journal'):
            lines.append(f' journal = {{{metadata["journal"]}}},')

        if metadata.get('year'):
            lines.append(f' year = {{{metadata["year"]}}},')

        if metadata.get('volume'):
            lines.append(f' volume = {{{metadata["volume"]}}},')

        if metadata.get('issue'):
            lines.append(f' number = {{{metadata["issue"]}}},')

        if metadata.get('pages'):
            # BibTeX page ranges conventionally use a double hyphen
            pages = metadata['pages'].replace('-', '--')
            lines.append(f' pages = {{{pages}}},')

        if metadata.get('doi'):
            lines.append(f' doi = {{{metadata["doi"]}}},')

        if metadata.get('pmid'):
            lines.append(f' note = {{PMID: {metadata["pmid"]}}},')

        # Remove trailing comma from the last field
        if lines[-1].endswith(','):
            lines[-1] = lines[-1][:-1]

        lines.append('}')

        return '\n'.join(lines)

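# Programmatic usage, a minimal sketch (assumes network access; the query and
# result count are illustrative):
#
#   searcher = PubMedSearcher()
#   pmids = searcher.search('CRISPR[Title]', max_results=5, date_start='2020')
#   for record in searcher.fetch_metadata(pmids):
#       print(searcher.metadata_to_bibtex(record))
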
def main():
    """Command-line interface."""
    parser = argparse.ArgumentParser(
        description='Search PubMed using E-utilities API',
        epilog='Example: python search_pubmed.py "CRISPR gene editing" --limit 100'
    )

    parser.add_argument(
        'query',
        nargs='?',
        help='Search query (PubMed syntax)'
    )

    parser.add_argument(
        '--query',
        dest='query_arg',
        help='Search query (alternative to positional argument)'
    )

    parser.add_argument(
        '--query-file',
        help='File containing search query'
    )

    parser.add_argument(
        '--limit',
        type=int,
        default=100,
        help='Maximum number of results (default: 100)'
    )

    parser.add_argument(
        '--date-start',
        help='Start date (YYYY/MM/DD or YYYY)'
    )

    parser.add_argument(
        '--date-end',
        help='End date (YYYY/MM/DD or YYYY)'
    )

    parser.add_argument(
        '--publication-types',
        help='Comma-separated publication types (e.g., "Review,Clinical Trial")'
    )

    parser.add_argument(
        '-o', '--output',
        help='Output file (default: stdout)'
    )

    parser.add_argument(
        '--format',
        choices=['json', 'bibtex'],
        default='json',
        help='Output format (default: json)'
    )

    parser.add_argument(
        '--api-key',
        help='NCBI API key (or set NCBI_API_KEY env var)'
    )

    parser.add_argument(
        '--email',
        help='Email for Entrez (or set NCBI_EMAIL env var)'
    )

    args = parser.parse_args()

    # Get query from positional argument, --query, or --query-file
    query = args.query or args.query_arg

    if args.query_file:
        try:
            with open(args.query_file, 'r', encoding='utf-8') as f:
                query = f.read().strip()
        except Exception as e:
            print(f'Error reading query file: {e}', file=sys.stderr)
            sys.exit(1)

    if not query:
        parser.print_help()
        sys.exit(1)

    # Parse publication types
    pub_types = None
    if args.publication_types:
        pub_types = [pt.strip() for pt in args.publication_types.split(',')]

    # Search PubMed
    searcher = PubMedSearcher(api_key=args.api_key, email=args.email)
    pmids = searcher.search(
        query,
        max_results=args.limit,
        date_start=args.date_start,
        date_end=args.date_end,
        publication_types=pub_types
    )

    if not pmids:
        print('No results found', file=sys.stderr)
        sys.exit(1)

    # Fetch metadata
    metadata_list = searcher.fetch_metadata(pmids)

    # Format output
    if args.format == 'json':
        output = json.dumps({
            'query': query,
            'count': len(metadata_list),
            'results': metadata_list
        }, indent=2)
    else:  # bibtex
        bibtex_entries = [searcher.metadata_to_bibtex(m) for m in metadata_list]
        output = '\n\n'.join(bibtex_entries) + '\n'

    # Write output
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f'Wrote {len(metadata_list)} results to {args.output}', file=sys.stderr)
    else:
        print(output)


if __name__ == '__main__':
    main()
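Example invocations (flags as defined in main() above; the query strings are illustrative):

    python search_pubmed.py "CRISPR gene editing" --limit 50
    python search_pubmed.py "machine learning" --date-start 2020 --publication-types "Review" --format bibtex -o refs.bib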