#!/usr/bin/env python3
"""
bioRxiv Search Tool

A comprehensive Python tool for searching and retrieving preprints from bioRxiv.
Supports keyword search, author search, date filtering, category filtering, and more.

Note: This tool is focused exclusively on bioRxiv (life sciences preprints).
"""

import argparse
import json
import sys
import time
from datetime import datetime, timedelta
from typing import Dict, List, Optional

import requests


class BioRxivSearcher:
    """Efficient search interface for bioRxiv preprints."""

    BASE_URL = "https://api.biorxiv.org"

    # Valid bioRxiv categories
    CATEGORIES = [
        "animal-behavior-and-cognition", "biochemistry", "bioengineering",
        "bioinformatics", "biophysics", "cancer-biology", "cell-biology",
        "clinical-trials", "developmental-biology", "ecology", "epidemiology",
        "evolutionary-biology", "genetics", "genomics", "immunology",
        "microbiology", "molecular-biology", "neuroscience", "paleontology",
        "pathology", "pharmacology-and-toxicology", "physiology",
        "plant-biology", "scientific-communication-and-education",
        "synthetic-biology", "systems-biology", "zoology"
    ]

    def __init__(self, verbose: bool = False):
        """Initialize the searcher."""
        self.verbose = verbose
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'BioRxiv-Search-Tool/1.0'
        })

    def _log(self, message: str):
        """Print verbose logging messages."""
        if self.verbose:
            print(f"[INFO] {message}", file=sys.stderr)

    def _make_request(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
        """Make an API request with error handling and rate limiting."""
        url = f"{self.BASE_URL}/{endpoint}"
        self._log(f"Requesting: {url}")

        try:
            response = self.session.get(url, params=params, timeout=30)
            response.raise_for_status()

            # Rate limiting - be respectful to the API
            time.sleep(0.5)

            return response.json()
        except requests.exceptions.RequestException as e:
            self._log(f"Error making request: {e}")
            return {"messages": [{"status": "error", "message": str(e)}], "collection": []}

    def search_by_date_range(
        self,
        start_date: str,
        end_date: str,
        category: Optional[str] = None
    ) -> List[Dict]:
        """
        Search for preprints within a date range.

        Args:
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            category: Optional category filter (e.g., 'neuroscience')

        Returns:
            List of preprint dictionaries
        """
        self._log(f"Searching bioRxiv from {start_date} to {end_date}")

        if category:
            endpoint = f"details/biorxiv/{start_date}/{end_date}/{category}"
        else:
            endpoint = f"details/biorxiv/{start_date}/{end_date}"
        data = self._make_request(endpoint)

        if "collection" in data:
            self._log(f"Found {len(data['collection'])} preprints")
            return data["collection"]

        return []

    def search_by_interval(
        self,
        interval: str = "1",
        cursor: int = 0,
        fmt: str = "json"
    ) -> Dict:
        """
        Retrieve preprints from a recent time interval via the 'pubs'
        endpoint, which also reports journal publication details.

        Args:
            interval: Number of days back to search
            cursor: Pagination cursor (0 for first page, then use returned cursor)
            fmt: Response format ('json' or 'xml')

        Returns:
            Dictionary with collection and pagination info
        """
        endpoint = f"pubs/biorxiv/{interval}/{cursor}/{fmt}"
        return self._make_request(endpoint)

    def get_paper_details(self, doi: str) -> Dict:
        """
        Get detailed information about a specific paper by DOI.

        Args:
            doi: The DOI of the paper (e.g., '10.1101/2021.01.01.123456')

        Returns:
            Dictionary with paper details, or an empty dict if not found
        """
        # Clean the DOI if a full URL was provided
        if 'doi.org' in doi:
            doi = doi.split('doi.org/')[-1]

        self._log(f"Fetching details for DOI: {doi}")
        endpoint = f"details/biorxiv/{doi}"

        data = self._make_request(endpoint)

        if "collection" in data and len(data["collection"]) > 0:
            return data["collection"][0]

        return {}

    def search_by_author(
        self,
        author_name: str,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None
    ) -> List[Dict]:
        """
        Search for papers by author name.

        Args:
            author_name: Author name to search for
            start_date: Optional start date (YYYY-MM-DD); defaults to 3 years ago
            end_date: Optional end date (YYYY-MM-DD); defaults to today

        Returns:
            List of matching preprints
        """
        # Fill in missing dates without clobbering an explicit end date
        if not end_date:
            end_date = datetime.now().strftime("%Y-%m-%d")
        if not start_date:
            start_date = (datetime.now() - timedelta(days=1095)).strftime("%Y-%m-%d")

        self._log(f"Searching for author: {author_name}")

        # Get all papers in the date range
        papers = self.search_by_date_range(start_date, end_date)

        # Filter by author name (case-insensitive substring match)
        author_lower = author_name.lower()
        matching_papers = []

        for paper in papers:
            authors = paper.get("authors", "")
            if author_lower in authors.lower():
                matching_papers.append(paper)

        self._log(f"Found {len(matching_papers)} papers by {author_name}")
        return matching_papers

    def search_by_keywords(
        self,
        keywords: List[str],
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        category: Optional[str] = None,
        search_fields: Optional[List[str]] = None
    ) -> List[Dict]:
        """
        Search for papers containing specific keywords.

        Args:
            keywords: List of keywords to search for
            start_date: Optional start date (YYYY-MM-DD); defaults to 1 year ago
            end_date: Optional end date (YYYY-MM-DD); defaults to today
            category: Optional category filter
            search_fields: Fields to search in (title, abstract, authors);
                defaults to title and abstract

        Returns:
            List of matching preprints
        """
        # Use a None default rather than a mutable default argument
        if search_fields is None:
            search_fields = ["title", "abstract"]

        # Fill in missing dates without clobbering an explicit end date
        if not end_date:
            end_date = datetime.now().strftime("%Y-%m-%d")
        if not start_date:
            start_date = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")

        self._log(f"Searching for keywords: {keywords}")

        # Get all papers in the date range
        papers = self.search_by_date_range(start_date, end_date, category)

        # Filter by keywords
        matching_papers = []
        keywords_lower = [k.lower() for k in keywords]

        for paper in papers:
            # Build the search text from the specified fields
            search_text = ""
            for field in search_fields:
                if field in paper:
                    search_text += " " + str(paper[field]).lower()

            # Keep the paper if any keyword matches
            if any(keyword in search_text for keyword in keywords_lower):
                matching_papers.append(paper)

        self._log(f"Found {len(matching_papers)} papers matching keywords")
        return matching_papers

    def download_pdf(self, doi: str, output_path: str, version: str = "1") -> bool:
        """
        Download the PDF of a paper.

        Args:
            doi: The DOI of the paper
            output_path: Path where the PDF should be saved
            version: Preprint version to fetch (defaults to "1", the first
                version; pass a higher number for later revisions)

        Returns:
            True if the download succeeded, False otherwise
        """
        # Clean the DOI if a full URL was provided
        if 'doi.org' in doi:
            doi = doi.split('doi.org/')[-1]

        # Construct the PDF URL
        pdf_url = f"https://www.biorxiv.org/content/{doi}v{version}.full.pdf"

        self._log(f"Downloading PDF from: {pdf_url}")

        try:
            response = self.session.get(pdf_url, timeout=60)
            response.raise_for_status()

            with open(output_path, 'wb') as f:
                f.write(response.content)

            self._log(f"PDF saved to: {output_path}")
            return True
        except Exception as e:
            self._log(f"Error downloading PDF: {e}")
            return False

    def format_result(self, paper: Dict, include_abstract: bool = True) -> Dict:
        """
        Format a paper result with standardized fields.

        Args:
            paper: Raw paper dictionary from the API
            include_abstract: Whether to include the abstract

        Returns:
            Formatted paper dictionary
        """
        result = {
            "doi": paper.get("doi", ""),
            "title": paper.get("title", ""),
            "authors": paper.get("authors", ""),
            "author_corresponding": paper.get("author_corresponding", ""),
            "author_corresponding_institution": paper.get("author_corresponding_institution", ""),
            "date": paper.get("date", ""),
            "version": paper.get("version", ""),
            "type": paper.get("type", ""),
            "license": paper.get("license", ""),
            "category": paper.get("category", ""),
            "jatsxml": paper.get("jatsxml", ""),
            "published": paper.get("published", "")
        }

        if include_abstract:
            result["abstract"] = paper.get("abstract", "")

        # Add PDF and HTML URLs
        if result["doi"]:
            result["pdf_url"] = f"https://www.biorxiv.org/content/{result['doi']}v{result['version']}.full.pdf"
            result["html_url"] = f"https://www.biorxiv.org/content/{result['doi']}v{result['version']}"

        return result
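

# Programmatic use -- a minimal sketch; the dates, keyword, and category below
# are illustrative values, not defaults of the tool:
#
#     searcher = BioRxivSearcher(verbose=True)
#     hits = searcher.search_by_keywords(
#         ["CRISPR"],
#         start_date="2023-01-01",
#         end_date="2023-06-30",
#         category="genomics",
#     )
#     for paper in hits[:5]:
#         print(searcher.format_result(paper)["title"])
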
def main():
    """Command-line interface for bioRxiv search."""
    parser = argparse.ArgumentParser(
        description="Search bioRxiv preprints efficiently",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Enable verbose logging")

    # Search type arguments
    search_group = parser.add_argument_group("Search options")
    search_group.add_argument("--keywords", "-k", nargs="+",
                              help="Keywords to search for")
    search_group.add_argument("--author", "-a",
                              help="Author name to search for")
    search_group.add_argument("--doi",
                              help="Get details for a specific DOI")

    # Date range arguments
    date_group = parser.add_argument_group("Date range options")
    date_group.add_argument("--start-date",
                            help="Start date (YYYY-MM-DD)")
    date_group.add_argument("--end-date",
                            help="End date (YYYY-MM-DD)")
    date_group.add_argument("--days-back", type=int,
                            help="Search N days back from today")

    # Filter arguments
    filter_group = parser.add_argument_group("Filter options")
    filter_group.add_argument("--category", "-c",
                              choices=BioRxivSearcher.CATEGORIES,
                              help="Filter by category")
    filter_group.add_argument("--search-fields", nargs="+",
                              default=["title", "abstract"],
                              choices=["title", "abstract", "authors"],
                              help="Fields to search in for keywords")

    # Output arguments
    output_group = parser.add_argument_group("Output options")
    output_group.add_argument("--output", "-o",
                              help="Output file (default: stdout)")
    # BooleanOptionalAction (Python 3.9+) also generates --no-include-abstract,
    # so the default of True can actually be switched off
    output_group.add_argument("--include-abstract",
                              action=argparse.BooleanOptionalAction,
                              default=True,
                              help="Include abstracts in output")
    output_group.add_argument("--download-pdf",
                              help="Download PDF to the specified path (requires --doi)")
    output_group.add_argument("--limit", type=int,
                              help="Limit number of results")

    args = parser.parse_args()

    # Initialize searcher
    searcher = BioRxivSearcher(verbose=args.verbose)

    # Handle the date range
    end_date = args.end_date or datetime.now().strftime("%Y-%m-%d")
    if args.days_back:
        start_date = (datetime.now() - timedelta(days=args.days_back)).strftime("%Y-%m-%d")
    else:
        start_date = args.start_date

    # Execute the search based on the arguments given
    results = []

    if args.download_pdf:
        if not args.doi:
            print("Error: --doi required with --download-pdf", file=sys.stderr)
            return 1

        success = searcher.download_pdf(args.doi, args.download_pdf)
        return 0 if success else 1

    elif args.doi:
        # Get a specific paper by DOI
        paper = searcher.get_paper_details(args.doi)
        if paper:
            results = [paper]

    elif args.author:
        # Search by author
        results = searcher.search_by_author(args.author, start_date, end_date)

    elif args.keywords:
        # Search by keywords
        if not start_date:
            print("Error: --start-date or --days-back required for keyword search",
                  file=sys.stderr)
            return 1

        results = searcher.search_by_keywords(
            args.keywords, start_date, end_date,
            args.category, args.search_fields
        )

    else:
        # Plain date-range search
        if not start_date:
            print("Error: specify search criteria (--keywords, --author, --doi) "
                  "or a date range (--start-date / --days-back)",
                  file=sys.stderr)
            return 1

        results = searcher.search_by_date_range(
            start_date, end_date, args.category
        )

    # Apply the result limit
    if args.limit:
        results = results[:args.limit]

    # Format results
    formatted_results = [
        searcher.format_result(paper, args.include_abstract)
        for paper in results
    ]

    # Output results
    output_data = {
        "query": {
            "keywords": args.keywords,
            "author": args.author,
            "doi": args.doi,
            "start_date": start_date,
            "end_date": end_date,
            "category": args.category
        },
        "result_count": len(formatted_results),
        "results": formatted_results
    }

    output_json = json.dumps(output_data, indent=2)

    if args.output:
        with open(args.output, 'w') as f:
            f.write(output_json)
        print(f"Results written to {args.output}", file=sys.stderr)
    else:
        print(output_json)

    return 0


if __name__ == "__main__":
    sys.exit(main())