Initial commit
445
skills/biorxiv-database/scripts/biorxiv_search.py
Executable file
@@ -0,0 +1,445 @@
#!/usr/bin/env python3
"""
bioRxiv Search Tool

A command-line tool for searching and retrieving preprints from bioRxiv.
Supports keyword search, author search, date filtering, and category filtering.

Note: This tool is focused exclusively on bioRxiv (life sciences preprints).
"""

import requests
import json
import argparse
from datetime import datetime, timedelta
from typing import List, Dict, Optional
import time
import sys


class BioRxivSearcher:
    """Efficient search interface for bioRxiv preprints."""

    BASE_URL = "https://api.biorxiv.org"

    # Valid bioRxiv categories
    CATEGORIES = [
        "animal-behavior-and-cognition", "biochemistry", "bioengineering",
        "bioinformatics", "biophysics", "cancer-biology", "cell-biology",
        "clinical-trials", "developmental-biology", "ecology", "epidemiology",
        "evolutionary-biology", "genetics", "genomics", "immunology",
        "microbiology", "molecular-biology", "neuroscience", "paleontology",
        "pathology", "pharmacology-and-toxicology", "physiology",
        "plant-biology", "scientific-communication-and-education",
        "synthetic-biology", "systems-biology", "zoology"
    ]

    def __init__(self, verbose: bool = False):
        """Initialize the searcher."""
        self.verbose = verbose
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'BioRxiv-Search-Tool/1.0'
        })

    def _log(self, message: str):
        """Print verbose logging messages."""
        if self.verbose:
            print(f"[INFO] {message}", file=sys.stderr)

    def _make_request(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
        """Make an API request with error handling and rate limiting."""
        url = f"{self.BASE_URL}/{endpoint}"
        self._log(f"Requesting: {url}")

        try:
            response = self.session.get(url, params=params, timeout=30)
            response.raise_for_status()

            # Rate limiting - be respectful to the API
            time.sleep(0.5)

            return response.json()
        except requests.exceptions.RequestException as e:
            self._log(f"Error making request: {e}")
            return {"messages": [{"status": "error", "message": str(e)}], "collection": []}

    def search_by_date_range(
        self,
        start_date: str,
        end_date: str,
        category: Optional[str] = None
    ) -> List[Dict]:
        """
        Search for preprints within a date range.

        Args:
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            category: Optional category filter (e.g., 'neuroscience')

        Returns:
            List of preprint dictionaries
        """
        self._log(f"Searching bioRxiv from {start_date} to {end_date}")

        if category:
            endpoint = f"details/biorxiv/{start_date}/{end_date}/{category}"
        else:
            endpoint = f"details/biorxiv/{start_date}/{end_date}"

        data = self._make_request(endpoint)

        if "collection" in data:
            self._log(f"Found {len(data['collection'])} preprints")
            return data["collection"]

        return []
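
    # Example (illustrative; requires network access):
    #     searcher = BioRxivSearcher(verbose=True)
    #     papers = searcher.search_by_date_range("2024-01-01", "2024-01-31",
    #                                            category="neuroscience")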

    def search_by_interval(
        self,
        interval: str = "1",
        cursor: int = 0,
        format: str = "json"
    ) -> Dict:
        """
        Retrieve preprints from a specific time interval.

        Args:
            interval: Number of days back to search
            cursor: Pagination cursor (0 for first page, then use returned cursor)
            format: Response format ('json' or 'xml')

        Returns:
            Dictionary with collection and pagination info
        """
        endpoint = f"pubs/biorxiv/{interval}/{cursor}/{format}"
        return self._make_request(endpoint)
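
    # Example (illustrative; cursor-based pagination):
    #     page = searcher.search_by_interval(interval="7", cursor=0)
    #     papers = page.get("collection", [])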

    def get_paper_details(self, doi: str) -> Dict:
        """
        Get detailed information about a specific paper by DOI.

        Args:
            doi: The DOI of the paper (e.g., '10.1101/2021.01.01.123456')

        Returns:
            Dictionary with paper details (empty dict if not found)
        """
        # Clean DOI if a full URL was provided
        if 'doi.org' in doi:
            doi = doi.split('doi.org/')[-1]

        self._log(f"Fetching details for DOI: {doi}")
        endpoint = f"details/biorxiv/{doi}"

        data = self._make_request(endpoint)

        if "collection" in data and len(data["collection"]) > 0:
            return data["collection"][0]

        return {}
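
    # Example (illustrative):
    #     details = searcher.get_paper_details("10.1101/2021.01.01.123456")
    #     print(details.get("title", "not found"))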

    def search_by_author(
        self,
        author_name: str,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None
    ) -> List[Dict]:
        """
        Search for papers by author name.

        Args:
            author_name: Author name to search for
            start_date: Optional start date (YYYY-MM-DD); defaults to 3 years ago
            end_date: Optional end date (YYYY-MM-DD); defaults to today

        Returns:
            List of matching preprints
        """
        # Default to the last 3 years, without clobbering an explicit end date
        if not start_date:
            start_date = (datetime.now() - timedelta(days=1095)).strftime("%Y-%m-%d")
        if not end_date:
            end_date = datetime.now().strftime("%Y-%m-%d")

        self._log(f"Searching for author: {author_name}")

        # Get all papers in the date range
        papers = self.search_by_date_range(start_date, end_date)

        # Filter by author name (case-insensitive substring match)
        author_lower = author_name.lower()
        matching_papers = []

        for paper in papers:
            authors = paper.get("authors", "")
            if author_lower in authors.lower():
                matching_papers.append(paper)

        self._log(f"Found {len(matching_papers)} papers by {author_name}")
        return matching_papers
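
    # Example (illustrative; matches on a substring of the authors field):
    #     papers = searcher.search_by_author("Smith", start_date="2024-01-01")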

    def search_by_keywords(
        self,
        keywords: List[str],
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        category: Optional[str] = None,
        search_fields: Optional[List[str]] = None
    ) -> List[Dict]:
        """
        Search for papers containing specific keywords.

        Args:
            keywords: List of keywords to search for
            start_date: Optional start date (YYYY-MM-DD); defaults to 1 year ago
            end_date: Optional end date (YYYY-MM-DD); defaults to today
            category: Optional category filter
            search_fields: Fields to search in (title, abstract, authors);
                defaults to title and abstract

        Returns:
            List of matching preprints
        """
        # Avoid a mutable default argument
        if search_fields is None:
            search_fields = ["title", "abstract"]

        # Default to the last year, without clobbering an explicit end date
        if not start_date:
            start_date = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
        if not end_date:
            end_date = datetime.now().strftime("%Y-%m-%d")

        self._log(f"Searching for keywords: {keywords}")

        # Get all papers in the date range
        papers = self.search_by_date_range(start_date, end_date, category)

        # Filter by keywords (case-insensitive; any keyword matches)
        matching_papers = []
        keywords_lower = [k.lower() for k in keywords]

        for paper in papers:
            # Build search text from the specified fields
            search_text = ""
            for field in search_fields:
                if field in paper:
                    search_text += " " + str(paper[field]).lower()

            # Check if any keyword matches
            if any(keyword in search_text for keyword in keywords_lower):
                matching_papers.append(paper)

        self._log(f"Found {len(matching_papers)} papers matching keywords")
        return matching_papers
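
    # Example (illustrative):
    #     hits = searcher.search_by_keywords(["CRISPR", "base editing"],
    #                                        category="genomics",
    #                                        search_fields=["title"])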

    def download_pdf(self, doi: str, output_path: str) -> bool:
        """
        Download the PDF of a paper.

        Args:
            doi: The DOI of the paper
            output_path: Path where the PDF should be saved

        Returns:
            True if download successful, False otherwise
        """
        # Clean DOI if a full URL was provided
        if 'doi.org' in doi:
            doi = doi.split('doi.org/')[-1]

        # Construct the PDF URL. Note: this assumes version 1 of the
        # preprint; revised preprints live at v2, v3, etc.
        pdf_url = f"https://www.biorxiv.org/content/{doi}v1.full.pdf"

        self._log(f"Downloading PDF from: {pdf_url}")

        try:
            response = self.session.get(pdf_url, timeout=60)
            response.raise_for_status()

            with open(output_path, 'wb') as f:
                f.write(response.content)

            self._log(f"PDF saved to: {output_path}")
            return True
        except Exception as e:
            self._log(f"Error downloading PDF: {e}")
            return False
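
    # Example (illustrative; writes into the current directory):
    #     ok = searcher.download_pdf("10.1101/2021.01.01.123456", "paper.pdf")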

    def format_result(self, paper: Dict, include_abstract: bool = True) -> Dict:
        """
        Format a paper result with standardized fields.

        Args:
            paper: Raw paper dictionary from the API
            include_abstract: Whether to include the abstract

        Returns:
            Formatted paper dictionary
        """
        result = {
            "doi": paper.get("doi", ""),
            "title": paper.get("title", ""),
            "authors": paper.get("authors", ""),
            "author_corresponding": paper.get("author_corresponding", ""),
            "author_corresponding_institution": paper.get("author_corresponding_institution", ""),
            "date": paper.get("date", ""),
            "version": paper.get("version", ""),
            "type": paper.get("type", ""),
            "license": paper.get("license", ""),
            "category": paper.get("category", ""),
            "jatsxml": paper.get("jatsxml", ""),
            "published": paper.get("published", "")
        }

        if include_abstract:
            result["abstract"] = paper.get("abstract", "")

        # Add PDF and HTML URLs
        if result["doi"]:
            result["pdf_url"] = f"https://www.biorxiv.org/content/{result['doi']}v{result['version']}.full.pdf"
            result["html_url"] = f"https://www.biorxiv.org/content/{result['doi']}v{result['version']}"

        return result
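
    # Example (illustrative):
    #     formatted = searcher.format_result(paper, include_abstract=False)
    #     print(formatted["pdf_url"])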


def main():
    """Command-line interface for bioRxiv search."""
    parser = argparse.ArgumentParser(
        description="Search bioRxiv preprints efficiently",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Enable verbose logging")

    # Search type arguments
    search_group = parser.add_argument_group("Search options")
    search_group.add_argument("--keywords", "-k", nargs="+",
                              help="Keywords to search for")
    search_group.add_argument("--author", "-a",
                              help="Author name to search for")
    search_group.add_argument("--doi",
                              help="Get details for a specific DOI")

    # Date range arguments
    date_group = parser.add_argument_group("Date range options")
    date_group.add_argument("--start-date",
                            help="Start date (YYYY-MM-DD)")
    date_group.add_argument("--end-date",
                            help="End date (YYYY-MM-DD)")
    date_group.add_argument("--days-back", type=int,
                            help="Search N days back from today")

    # Filter arguments
    filter_group = parser.add_argument_group("Filter options")
    filter_group.add_argument("--category", "-c",
                              choices=BioRxivSearcher.CATEGORIES,
                              help="Filter by category")
    filter_group.add_argument("--search-fields", nargs="+",
                              default=["title", "abstract"],
                              choices=["title", "abstract", "authors"],
                              help="Fields to search in for keywords")

    # Output arguments
    output_group = parser.add_argument_group("Output options")
    output_group.add_argument("--output", "-o",
                              help="Output file (default: stdout)")
    output_group.add_argument("--include-abstract", action=argparse.BooleanOptionalAction,
                              default=True,
                              help="Include abstracts in output (on by default)")
    output_group.add_argument("--download-pdf",
                              help="Download PDF to the specified path (requires --doi)")
    output_group.add_argument("--limit", type=int,
                              help="Limit number of results")

    args = parser.parse_args()

    # Initialize searcher
    searcher = BioRxivSearcher(verbose=args.verbose)

    # Handle date range
    end_date = args.end_date or datetime.now().strftime("%Y-%m-%d")
    if args.days_back:
        start_date = (datetime.now() - timedelta(days=args.days_back)).strftime("%Y-%m-%d")
    else:
        start_date = args.start_date

    # Execute search based on arguments
    results = []

    if args.download_pdf:
        if not args.doi:
            print("Error: --doi required with --download-pdf", file=sys.stderr)
            return 1

        success = searcher.download_pdf(args.doi, args.download_pdf)
        return 0 if success else 1

    elif args.doi:
        # Get a specific paper by DOI
        paper = searcher.get_paper_details(args.doi)
        if paper:
            results = [paper]

    elif args.author:
        # Search by author
        results = searcher.search_by_author(
            args.author, start_date, end_date
        )

    elif args.keywords:
        # Search by keywords
        if not start_date:
            print("Error: --start-date or --days-back required for keyword search",
                  file=sys.stderr)
            return 1

        results = searcher.search_by_keywords(
            args.keywords, start_date, end_date,
            args.category, args.search_fields
        )

    else:
        # Plain date-range search
        if not start_date:
            print("Error: must specify search criteria (--keywords, --author, --doi) "
                  "or a date range (--start-date/--days-back)",
                  file=sys.stderr)
            return 1

        results = searcher.search_by_date_range(
            start_date, end_date, args.category
        )

    # Apply limit
    if args.limit:
        results = results[:args.limit]

    # Format results
    formatted_results = [
        searcher.format_result(paper, args.include_abstract)
        for paper in results
    ]

    # Output results
    output_data = {
        "query": {
            "keywords": args.keywords,
            "author": args.author,
            "doi": args.doi,
            "start_date": start_date,
            "end_date": end_date,
            "category": args.category
        },
        "result_count": len(formatted_results),
        "results": formatted_results
    }
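
    # The emitted JSON has this shape (abridged):
    #   {"query": {...}, "result_count": 2,
    #    "results": [{"doi": "...", "title": "...", ...}]}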

    output_json = json.dumps(output_data, indent=2)

    if args.output:
        with open(args.output, 'w') as f:
            f.write(output_json)
        print(f"Results written to {args.output}", file=sys.stderr)
    else:
        print(output_json)

    return 0


if __name__ == "__main__":
    sys.exit(main())