# --- Scrape artifact (repository file-listing metadata), commented out so
# --- the module remains valid Python:
# Files
# gh-k-dense-ai-claude-scient…/skills/biorxiv-database/scripts/biorxiv_search.py
# 2025-11-30 08:30:10 +08:00
# 446 lines, 14 KiB, Python, Executable File
#!/usr/bin/env python3
"""
bioRxiv Search Tool
A comprehensive Python tool for searching and retrieving preprints from bioRxiv.
Supports keyword search, author search, date filtering, category filtering, and more.
Note: This tool is focused exclusively on bioRxiv (life sciences preprints).
"""
import requests
import json
import argparse
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Any
import time
import sys
from urllib.parse import quote
class BioRxivSearcher:
    """Efficient search interface for bioRxiv preprints.

    Wraps the public bioRxiv REST API (https://api.biorxiv.org). The API
    itself only supports date-range, interval, and DOI lookups, so author
    and keyword searches are implemented as client-side filters over a
    date-range query.
    """

    BASE_URL = "https://api.biorxiv.org"

    # Valid bioRxiv subject categories (URL-slug form, as used in endpoints).
    CATEGORIES = [
        "animal-behavior-and-cognition", "biochemistry", "bioengineering",
        "bioinformatics", "biophysics", "cancer-biology", "cell-biology",
        "clinical-trials", "developmental-biology", "ecology", "epidemiology",
        "evolutionary-biology", "genetics", "genomics", "immunology",
        "microbiology", "molecular-biology", "neuroscience", "paleontology",
        "pathology", "pharmacology-and-toxicology", "physiology",
        "plant-biology", "scientific-communication-and-education",
        "synthetic-biology", "systems-biology", "zoology"
    ]

    def __init__(self, verbose: bool = False):
        """Initialize the searcher.

        Args:
            verbose: When True, progress messages are printed to stderr.
        """
        self.verbose = verbose
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'BioRxiv-Search-Tool/1.0'
        })

    @staticmethod
    def _clean_doi(doi: str) -> str:
        """Return the bare DOI, stripping any leading https://doi.org/ prefix."""
        if 'doi.org' in doi:
            doi = doi.split('doi.org/')[-1]
        return doi

    def _log(self, message: str) -> None:
        """Print a verbose logging message to stderr (no-op unless verbose)."""
        if self.verbose:
            print(f"[INFO] {message}", file=sys.stderr)

    def _make_request(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
        """Make an API request with error handling and rate limiting.

        Args:
            endpoint: Path below BASE_URL, without a leading slash.
            params: Optional query-string parameters.

        Returns:
            Parsed JSON response dict.  On any request or JSON-decoding
            failure an error-shaped dict with an empty "collection" is
            returned so callers can proceed without special-casing.
        """
        url = f"{self.BASE_URL}/{endpoint}"
        self._log(f"Requesting: {url}")
        try:
            response = self.session.get(url, params=params, timeout=30)
            response.raise_for_status()
            # Rate limiting - be respectful to the API
            time.sleep(0.5)
            return response.json()
        # ValueError covers a successful HTTP response whose body is not JSON.
        except (requests.exceptions.RequestException, ValueError) as e:
            self._log(f"Error making request: {e}")
            return {"messages": [{"status": "error", "message": str(e)}], "collection": []}

    def search_by_date_range(
        self,
        start_date: str,
        end_date: str,
        category: Optional[str] = None
    ) -> List[Dict]:
        """
        Search for preprints within a date range.

        Args:
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            category: Optional category filter (e.g., 'neuroscience')

        Returns:
            List of preprint dictionaries

        NOTE(review): only the first page of API results (typically up to
        ~100 records) is returned; cursor pagination is not implemented —
        confirm against the bioRxiv API docs if completeness matters.
        """
        self._log(f"Searching bioRxiv from {start_date} to {end_date}")
        if category:
            endpoint = f"details/biorxiv/{start_date}/{end_date}/{category}"
        else:
            endpoint = f"details/biorxiv/{start_date}/{end_date}"
        data = self._make_request(endpoint)
        if "collection" in data:
            self._log(f"Found {len(data['collection'])} preprints")
            return data["collection"]
        return []

    def search_by_interval(
        self,
        interval: str = "1",
        cursor: int = 0,
        format: str = "json"  # name kept for caller compatibility despite shadowing builtin
    ) -> Dict:
        """
        Retrieve preprints from a specific time interval.

        Args:
            interval: Number of days back to search
            cursor: Pagination cursor (0 for first page, then use returned cursor)
            format: Response format ('json' or 'xml')

        Returns:
            Dictionary with collection and pagination info
        """
        endpoint = f"pubs/biorxiv/{interval}/{cursor}/{format}"
        return self._make_request(endpoint)

    def get_paper_details(self, doi: str) -> Dict:
        """
        Get detailed information about a specific paper by DOI.

        Args:
            doi: The DOI of the paper (e.g., '10.1101/2021.01.01.123456');
                a full https://doi.org/ URL is also accepted.

        Returns:
            Dictionary with paper details, or {} if the DOI was not found.
        """
        doi = self._clean_doi(doi)
        self._log(f"Fetching details for DOI: {doi}")
        endpoint = f"details/biorxiv/{doi}"
        data = self._make_request(endpoint)
        if "collection" in data and len(data["collection"]) > 0:
            return data["collection"][0]
        return {}

    def search_by_author(
        self,
        author_name: str,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None
    ) -> List[Dict]:
        """
        Search for papers by author name (case-insensitive substring match).

        Args:
            author_name: Author name to search for
            start_date: Optional start date (YYYY-MM-DD); defaults to 3 years ago
            end_date: Optional end date (YYYY-MM-DD); defaults to today

        Returns:
            List of matching preprints
        """
        # BUG FIX: default each bound independently.  The old code only
        # checked start_date and silently overwrote a caller-supplied
        # end_date, and passed end_date=None into the URL when only
        # start_date was given.
        if not end_date:
            end_date = datetime.now().strftime("%Y-%m-%d")
        if not start_date:
            # Default window: the last 3 years.
            start_date = (datetime.now() - timedelta(days=1095)).strftime("%Y-%m-%d")
        self._log(f"Searching for author: {author_name}")
        # Get all papers in date range
        papers = self.search_by_date_range(start_date, end_date)
        # Filter by author name (case-insensitive)
        author_lower = author_name.lower()
        matching_papers = []
        for paper in papers:
            authors = paper.get("authors", "")
            if author_lower in authors.lower():
                matching_papers.append(paper)
        self._log(f"Found {len(matching_papers)} papers by {author_name}")
        return matching_papers

    def search_by_keywords(
        self,
        keywords: List[str],
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        category: Optional[str] = None,
        search_fields: Optional[List[str]] = None
    ) -> List[Dict]:
        """
        Search for papers containing specific keywords.

        Args:
            keywords: List of keywords to search for
            start_date: Optional start date (YYYY-MM-DD); defaults to 1 year ago
            end_date: Optional end date (YYYY-MM-DD); defaults to today
            category: Optional category filter
            search_fields: Fields to search in (title, abstract, authors);
                defaults to ["title", "abstract"]

        Returns:
            List of matching preprints
        """
        # Avoid a mutable default argument; None stands in for the default list.
        if search_fields is None:
            search_fields = ["title", "abstract"]
        # BUG FIX: default each bound independently (see search_by_author).
        if not end_date:
            end_date = datetime.now().strftime("%Y-%m-%d")
        if not start_date:
            # Default window: the last year.
            start_date = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
        self._log(f"Searching for keywords: {keywords}")
        # Get all papers in date range
        papers = self.search_by_date_range(start_date, end_date, category)
        # Filter by keywords (case-insensitive; any keyword matching any field)
        matching_papers = []
        keywords_lower = [k.lower() for k in keywords]
        for paper in papers:
            # Build search text from specified fields
            search_text = ""
            for field in search_fields:
                if field in paper:
                    search_text += " " + str(paper[field]).lower()
            # Check if any keyword matches
            if any(keyword in search_text for keyword in keywords_lower):
                matching_papers.append(paper)
        self._log(f"Found {len(matching_papers)} papers matching keywords")
        return matching_papers

    def download_pdf(self, doi: str, output_path: str, version: str = "1") -> bool:
        """
        Download the PDF of a paper.

        Args:
            doi: The DOI of the paper (bare DOI or https://doi.org/ URL)
            output_path: Path where PDF should be saved
            version: Paper version to fetch (default "1", the original behavior)

        Returns:
            True if download successful, False otherwise
        """
        doi = self._clean_doi(doi)
        # Construct PDF URL
        pdf_url = f"https://www.biorxiv.org/content/{doi}v{version}.full.pdf"
        self._log(f"Downloading PDF from: {pdf_url}")
        try:
            response = self.session.get(pdf_url, timeout=60)
            response.raise_for_status()
            with open(output_path, 'wb') as f:
                f.write(response.content)
            self._log(f"PDF saved to: {output_path}")
            return True
        # OSError covers open()/write() failures; keep best-effort bool result.
        except (requests.exceptions.RequestException, OSError) as e:
            self._log(f"Error downloading PDF: {e}")
            return False

    def format_result(self, paper: Dict, include_abstract: bool = True) -> Dict:
        """
        Format a paper result with standardized fields.

        Args:
            paper: Raw paper dictionary from API
            include_abstract: Whether to include the abstract

        Returns:
            Formatted paper dictionary; missing fields default to "".
        """
        result = {
            "doi": paper.get("doi", ""),
            "title": paper.get("title", ""),
            "authors": paper.get("authors", ""),
            "author_corresponding": paper.get("author_corresponding", ""),
            "author_corresponding_institution": paper.get("author_corresponding_institution", ""),
            "date": paper.get("date", ""),
            "version": paper.get("version", ""),
            "type": paper.get("type", ""),
            "license": paper.get("license", ""),
            "category": paper.get("category", ""),
            "jatsxml": paper.get("jatsxml", ""),
            "published": paper.get("published", "")
        }
        if include_abstract:
            result["abstract"] = paper.get("abstract", "")
        # Add PDF and HTML URLs
        if result["doi"]:
            result["pdf_url"] = f"https://www.biorxiv.org/content/{result['doi']}v{result['version']}.full.pdf"
            result["html_url"] = f"https://www.biorxiv.org/content/{result['doi']}v{result['version']}"
        return result
def main():
    """Command-line interface for bioRxiv search.

    Parses arguments, dispatches to the appropriate BioRxivSearcher method,
    and writes the results as JSON to stdout or --output.

    Returns:
        Process exit code: 0 on success, 1 on usage error or failed download.
    """
    parser = argparse.ArgumentParser(
        description="Search bioRxiv preprints efficiently",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Enable verbose logging")
    # Search type arguments
    search_group = parser.add_argument_group("Search options")
    search_group.add_argument("--keywords", "-k", nargs="+",
                              help="Keywords to search for")
    search_group.add_argument("--author", "-a",
                              help="Author name to search for")
    search_group.add_argument("--doi",
                              help="Get details for specific DOI")
    # Date range arguments
    date_group = parser.add_argument_group("Date range options")
    date_group.add_argument("--start-date",
                            help="Start date (YYYY-MM-DD)")
    date_group.add_argument("--end-date",
                            help="End date (YYYY-MM-DD)")
    date_group.add_argument("--days-back", type=int,
                            help="Search N days back from today")
    # Filter arguments
    filter_group = parser.add_argument_group("Filter options")
    filter_group.add_argument("--category", "-c",
                              choices=BioRxivSearcher.CATEGORIES,
                              help="Filter by category")
    filter_group.add_argument("--search-fields", nargs="+",
                              default=["title", "abstract"],
                              choices=["title", "abstract", "authors"],
                              help="Fields to search in for keywords")
    # Output arguments
    output_group = parser.add_argument_group("Output options")
    output_group.add_argument("--output", "-o",
                              help="Output file (default: stdout)")
    # BUG FIX: --include-abstract was store_true with default=True, making it
    # impossible to turn abstracts off.  Keep the flag for compatibility and
    # add --no-abstract to actually disable them.
    output_group.add_argument("--include-abstract", dest="include_abstract",
                              action="store_true",
                              default=True, help="Include abstracts in output (default)")
    output_group.add_argument("--no-abstract", dest="include_abstract",
                              action="store_false",
                              help="Omit abstracts from output")
    output_group.add_argument("--download-pdf",
                              help="Download PDF to specified path (requires --doi)")
    output_group.add_argument("--limit", type=int,
                              help="Limit number of results")
    args = parser.parse_args()
    # Initialize searcher
    searcher = BioRxivSearcher(verbose=args.verbose)
    # Handle date range; --days-back takes precedence over --start-date.
    end_date = args.end_date or datetime.now().strftime("%Y-%m-%d")
    if args.days_back:
        start_date = (datetime.now() - timedelta(days=args.days_back)).strftime("%Y-%m-%d")
    else:
        start_date = args.start_date
    # Execute search based on arguments
    results = []
    if args.download_pdf:
        if not args.doi:
            print("Error: --doi required with --download-pdf", file=sys.stderr)
            return 1
        success = searcher.download_pdf(args.doi, args.download_pdf)
        return 0 if success else 1
    elif args.doi:
        # Get specific paper by DOI
        paper = searcher.get_paper_details(args.doi)
        if paper:
            results = [paper]
    elif args.author:
        # Search by author
        results = searcher.search_by_author(
            args.author, start_date, end_date
        )
    elif args.keywords:
        # Search by keywords
        if not start_date:
            print("Error: --start-date or --days-back required for keyword search",
                  file=sys.stderr)
            return 1
        results = searcher.search_by_keywords(
            args.keywords, start_date, end_date,
            args.category, args.search_fields
        )
    else:
        # Date range search
        if not start_date:
            # BUG FIX: the old message claimed search criteria were missing,
            # but this branch IS the date-range search; the missing piece is
            # the start of the range.
            print("Error: --start-date or --days-back required for date range search",
                  file=sys.stderr)
            return 1
        results = searcher.search_by_date_range(
            start_date, end_date, args.category
        )
    # Apply limit.  Explicit None check so "--limit 0" yields zero results
    # instead of being silently ignored.
    if args.limit is not None:
        results = results[:args.limit]
    # Format results
    formatted_results = [
        searcher.format_result(paper, args.include_abstract)
        for paper in results
    ]
    # Output results: echo the query alongside the result set.
    output_data = {
        "query": {
            "keywords": args.keywords,
            "author": args.author,
            "doi": args.doi,
            "start_date": start_date,
            "end_date": end_date,
            "category": args.category
        },
        "result_count": len(formatted_results),
        "results": formatted_results
    }
    output_json = json.dumps(output_data, indent=2)
    if args.output:
        with open(args.output, 'w') as f:
            f.write(output_json)
        print(f"Results written to {args.output}", file=sys.stderr)
    else:
        print(output_json)
    return 0
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())