Initial commit
skills/citation-management/scripts/search_pubmed.py (398 additions, executable file)
@@ -0,0 +1,398 @@
#!/usr/bin/env python3
"""
PubMed Search Tool

Search PubMed using the NCBI E-utilities API and export results.
"""

import argparse
import json
import os
import re
import sys
import time
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Dict, List, Optional

import requests

class PubMedSearcher:
    """Search PubMed using the NCBI E-utilities API."""

    def __init__(self, api_key: Optional[str] = None, email: Optional[str] = None):
        """
        Initialize the searcher.

        Args:
            api_key: NCBI API key (optional but recommended)
            email: Email for Entrez (optional but recommended)
        """
        self.api_key = api_key or os.getenv('NCBI_API_KEY', '')
        self.email = email or os.getenv('NCBI_EMAIL', '')
        self.base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
        self.session = requests.Session()

        # Rate limiting: 10 requests/sec with an API key, 3/sec without
        self.delay = 0.11 if self.api_key else 0.34
    def search(self, query: str, max_results: int = 100,
               date_start: Optional[str] = None, date_end: Optional[str] = None,
               publication_types: Optional[List[str]] = None) -> List[str]:
        """
        Search PubMed and return PMIDs.

        Args:
            query: Search query
            max_results: Maximum number of results
            date_start: Start date (YYYY/MM/DD or YYYY)
            date_end: End date (YYYY/MM/DD or YYYY)
            publication_types: List of publication types to filter

        Returns:
            List of PMIDs
        """
        # Build query with filters
        full_query = query

        # Add date range
        if date_start or date_end:
            start = date_start or '1900'
            end = date_end or datetime.now().strftime('%Y')
            full_query += f' AND {start}:{end}[Publication Date]'

        # Add publication types
        if publication_types:
            pub_type_query = ' OR '.join([f'"{pt}"[Publication Type]' for pt in publication_types])
            full_query += f' AND ({pub_type_query})'

        print(f'Searching PubMed: {full_query}', file=sys.stderr)

        # ESearch to get PMIDs
        esearch_url = self.base_url + 'esearch.fcgi'
        params = {
            'db': 'pubmed',
            'term': full_query,
            'retmax': max_results,
            'retmode': 'json'
        }

        if self.email:
            params['email'] = self.email
        if self.api_key:
            params['api_key'] = self.api_key

        try:
            response = self.session.get(esearch_url, params=params, timeout=30)
            response.raise_for_status()

            data = response.json()
            pmids = data['esearchresult']['idlist']
            count = int(data['esearchresult']['count'])

            print(f'Found {count} results, retrieving {len(pmids)}', file=sys.stderr)

            return pmids

        except Exception as e:
            print(f'Error searching PubMed: {e}', file=sys.stderr)
            return []
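    # For reference, a minimal sketch of the esearch JSON payload consumed above
    # (field names per the NCBI E-utilities docs; the values are illustrative):
    #
    #   {"esearchresult": {"count": "2", "retmax": "2", "retstart": "0",
    #                      "idlist": ["38000001", "38000002"]}}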
    def fetch_metadata(self, pmids: List[str]) -> List[Dict]:
        """
        Fetch metadata for PMIDs.

        Args:
            pmids: List of PubMed IDs

        Returns:
            List of metadata dictionaries
        """
        if not pmids:
            return []

        metadata_list = []

        # Fetch in batches of 200
        batch_size = 200
        for i in range(0, len(pmids), batch_size):
            batch = pmids[i:i+batch_size]
            print(f'Fetching metadata for PMIDs {i+1}-{min(i+batch_size, len(pmids))}...', file=sys.stderr)

            efetch_url = self.base_url + 'efetch.fcgi'
            params = {
                'db': 'pubmed',
                'id': ','.join(batch),
                'retmode': 'xml',
                'rettype': 'abstract'
            }

            if self.email:
                params['email'] = self.email
            if self.api_key:
                params['api_key'] = self.api_key

            try:
                response = self.session.get(efetch_url, params=params, timeout=60)
                response.raise_for_status()

                # Parse XML
                root = ET.fromstring(response.content)
                articles = root.findall('.//PubmedArticle')

                for article in articles:
                    metadata = self._extract_metadata_from_xml(article)
                    if metadata:
                        metadata_list.append(metadata)

                # Rate limiting
                time.sleep(self.delay)

            except Exception as e:
                print(f'Error fetching metadata for batch: {e}', file=sys.stderr)
                continue

        return metadata_list
    def _extract_metadata_from_xml(self, article: ET.Element) -> Optional[Dict]:
        """Extract metadata from a PubmedArticle XML element."""
        try:
            medline_citation = article.find('.//MedlineCitation')
            article_elem = medline_citation.find('.//Article')
            journal = article_elem.find('.//Journal')

            # Get PMID
            pmid = medline_citation.findtext('.//PMID', '')

            # Get DOI
            doi = None
            article_ids = article.findall('.//ArticleId')
            for article_id in article_ids:
                if article_id.get('IdType') == 'doi':
                    doi = article_id.text
                    break

            # Get authors
            authors = []
            author_list = article_elem.find('.//AuthorList')
            if author_list is not None:
                for author in author_list.findall('.//Author'):
                    last_name = author.findtext('.//LastName', '')
                    fore_name = author.findtext('.//ForeName', '')
                    if last_name:
                        if fore_name:
                            authors.append(f'{last_name}, {fore_name}')
                        else:
                            authors.append(last_name)

            # Get year; fall back to MedlineDate (e.g. "1998 Dec-1999 Jan")
            year = article_elem.findtext('.//Journal/JournalIssue/PubDate/Year', '')
            if not year:
                medline_date = article_elem.findtext('.//Journal/JournalIssue/PubDate/MedlineDate', '')
                if medline_date:
                    year_match = re.search(r'\d{4}', medline_date)
                    if year_match:
                        year = year_match.group()

            metadata = {
                'pmid': pmid,
                'doi': doi,
                'title': article_elem.findtext('.//ArticleTitle', ''),
                'authors': ' and '.join(authors),
                'journal': journal.findtext('.//Title', ''),
                'year': year,
                'volume': journal.findtext('.//JournalIssue/Volume', ''),
                'issue': journal.findtext('.//JournalIssue/Issue', ''),
                'pages': article_elem.findtext('.//Pagination/MedlinePgn', ''),
                'abstract': article_elem.findtext('.//Abstract/AbstractText', '')
            }

            return metadata

        except Exception as e:
            print(f'Error extracting metadata: {e}', file=sys.stderr)
            return None
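    # A successfully parsed article yields a dict with the keys built above,
    # e.g. (illustrative values):
    #   {'pmid': '12345678', 'doi': '10.1000/example', 'title': '...',
    #    'authors': 'Smith, Jane and Doe, John', 'journal': 'Example Journal',
    #    'year': '2021', 'volume': '12', 'issue': '3', 'pages': '2115-24',
    #    'abstract': '...'}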
    def metadata_to_bibtex(self, metadata: Dict) -> str:
        """Convert metadata to BibTeX format."""
        # Generate citation key from first author's last name, year, and PMID
        if metadata.get('authors'):
            first_author = metadata['authors'].split(' and ')[0]
            if ',' in first_author:
                last_name = first_author.split(',')[0].strip()
            else:
                last_name = first_author.split()[0]
        else:
            last_name = 'Unknown'

        year = metadata.get('year', 'XXXX')
        citation_key = f'{last_name}{year}pmid{metadata.get("pmid", "")}'

        # Build BibTeX entry
        lines = [f'@article{{{citation_key},']

        if metadata.get('authors'):
            lines.append(f' author = {{{metadata["authors"]}}},')

        if metadata.get('title'):
            lines.append(f' title = {{{metadata["title"]}}},')

        if metadata.get('journal'):
            lines.append(f' journal = {{{metadata["journal"]}}},')

        if metadata.get('year'):
            lines.append(f' year = {{{metadata["year"]}}},')

        if metadata.get('volume'):
            lines.append(f' volume = {{{metadata["volume"]}}},')

        if metadata.get('issue'):
            lines.append(f' number = {{{metadata["issue"]}}},')

        if metadata.get('pages'):
            # BibTeX page ranges conventionally use a double hyphen
            pages = metadata['pages'].replace('-', '--')
            lines.append(f' pages = {{{pages}}},')

        if metadata.get('doi'):
            lines.append(f' doi = {{{metadata["doi"]}}},')

        if metadata.get('pmid'):
            lines.append(f' note = {{PMID: {metadata["pmid"]}}},')

        # Remove trailing comma from the last field
        if lines[-1].endswith(','):
            lines[-1] = lines[-1][:-1]

        lines.append('}')

        return '\n'.join(lines)

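# Programmatic usage, a minimal sketch (assumes network access; the query and
# result count are illustrative):
#
#   searcher = PubMedSearcher()
#   pmids = searcher.search('CRISPR[Title]', max_results=5, date_start='2020')
#   for record in searcher.fetch_metadata(pmids):
#       print(searcher.metadata_to_bibtex(record))
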
def main():
    """Command-line interface."""
    parser = argparse.ArgumentParser(
        description='Search PubMed using E-utilities API',
        epilog='Example: python search_pubmed.py "CRISPR gene editing" --limit 100'
    )

    parser.add_argument(
        'query',
        nargs='?',
        help='Search query (PubMed syntax)'
    )

    parser.add_argument(
        '--query',
        dest='query_arg',
        help='Search query (alternative to positional argument)'
    )

    parser.add_argument(
        '--query-file',
        help='File containing search query'
    )

    parser.add_argument(
        '--limit',
        type=int,
        default=100,
        help='Maximum number of results (default: 100)'
    )

    parser.add_argument(
        '--date-start',
        help='Start date (YYYY/MM/DD or YYYY)'
    )

    parser.add_argument(
        '--date-end',
        help='End date (YYYY/MM/DD or YYYY)'
    )

    parser.add_argument(
        '--publication-types',
        help='Comma-separated publication types (e.g., "Review,Clinical Trial")'
    )

    parser.add_argument(
        '-o', '--output',
        help='Output file (default: stdout)'
    )

    parser.add_argument(
        '--format',
        choices=['json', 'bibtex'],
        default='json',
        help='Output format (default: json)'
    )

    parser.add_argument(
        '--api-key',
        help='NCBI API key (or set NCBI_API_KEY env var)'
    )

    parser.add_argument(
        '--email',
        help='Email for Entrez (or set NCBI_EMAIL env var)'
    )

    args = parser.parse_args()

    # Get query from positional argument, --query, or --query-file
    query = args.query or args.query_arg

    if args.query_file:
        try:
            with open(args.query_file, 'r', encoding='utf-8') as f:
                query = f.read().strip()
        except Exception as e:
            print(f'Error reading query file: {e}', file=sys.stderr)
            sys.exit(1)

    if not query:
        parser.print_help()
        sys.exit(1)

    # Parse publication types
    pub_types = None
    if args.publication_types:
        pub_types = [pt.strip() for pt in args.publication_types.split(',')]

    # Search PubMed
    searcher = PubMedSearcher(api_key=args.api_key, email=args.email)
    pmids = searcher.search(
        query,
        max_results=args.limit,
        date_start=args.date_start,
        date_end=args.date_end,
        publication_types=pub_types
    )

    if not pmids:
        print('No results found', file=sys.stderr)
        sys.exit(1)

    # Fetch metadata
    metadata_list = searcher.fetch_metadata(pmids)

    # Format output
    if args.format == 'json':
        output = json.dumps({
            'query': query,
            'count': len(metadata_list),
            'results': metadata_list
        }, indent=2)
    else:  # bibtex
        bibtex_entries = [searcher.metadata_to_bibtex(m) for m in metadata_list]
        output = '\n\n'.join(bibtex_entries) + '\n'

    # Write output
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f'Wrote {len(metadata_list)} results to {args.output}', file=sys.stderr)
    else:
        print(output)


if __name__ == '__main__':
    main()
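Example invocations (flags as defined in main() above; the query strings are illustrative):

    python search_pubmed.py "CRISPR gene editing" --limit 50
    python search_pubmed.py "machine learning" --date-start 2020 --publication-types "Review" --format bibtex -o refs.bib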