Initial commit
445
skills/biorxiv-database/scripts/biorxiv_search.py
Executable file
@@ -0,0 +1,445 @@
#!/usr/bin/env python3
"""
bioRxiv Search Tool

A command-line tool for searching and retrieving preprints from bioRxiv.
Supports keyword search, author search, date filtering, and category filtering.

Note: This tool is focused exclusively on bioRxiv (life sciences preprints).
"""

import requests
import json
import argparse
from datetime import datetime, timedelta
from typing import List, Dict, Optional
import time
import sys


class BioRxivSearcher:
    """Efficient search interface for bioRxiv preprints."""

    BASE_URL = "https://api.biorxiv.org"

    # Valid bioRxiv categories
    CATEGORIES = [
        "animal-behavior-and-cognition", "biochemistry", "bioengineering",
        "bioinformatics", "biophysics", "cancer-biology", "cell-biology",
        "clinical-trials", "developmental-biology", "ecology", "epidemiology",
        "evolutionary-biology", "genetics", "genomics", "immunology",
        "microbiology", "molecular-biology", "neuroscience", "paleontology",
        "pathology", "pharmacology-and-toxicology", "physiology",
        "plant-biology", "scientific-communication-and-education",
        "synthetic-biology", "systems-biology", "zoology"
    ]

    def __init__(self, verbose: bool = False):
        """Initialize the searcher."""
        self.verbose = verbose
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'BioRxiv-Search-Tool/1.0'
        })

    def _log(self, message: str):
        """Print verbose logging messages."""
        if self.verbose:
            print(f"[INFO] {message}", file=sys.stderr)

    def _make_request(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
        """Make an API request with error handling and rate limiting."""
        url = f"{self.BASE_URL}/{endpoint}"
        self._log(f"Requesting: {url}")

        try:
            response = self.session.get(url, params=params, timeout=30)
            response.raise_for_status()

            # Rate limiting - be respectful to the API
            time.sleep(0.5)

            return response.json()
        except requests.exceptions.RequestException as e:
            self._log(f"Error making request: {e}")
            return {"messages": [{"status": "error", "message": str(e)}], "collection": []}

    def search_by_date_range(
        self,
        start_date: str,
        end_date: str,
        category: Optional[str] = None
    ) -> List[Dict]:
        """
        Search for preprints within a date range.

        Args:
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            category: Optional category filter (e.g., 'neuroscience')

        Returns:
            List of preprint dictionaries
        """
        self._log(f"Searching bioRxiv from {start_date} to {end_date}")

        if category:
            endpoint = f"details/biorxiv/{start_date}/{end_date}/{category}"
        else:
            endpoint = f"details/biorxiv/{start_date}/{end_date}"

        data = self._make_request(endpoint)

        if "collection" in data:
            self._log(f"Found {len(data['collection'])} preprints")
            return data["collection"]

        return []
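
    # Example (illustrative; requires network access):
    #     searcher = BioRxivSearcher(verbose=True)
    #     papers = searcher.search_by_date_range("2024-01-01", "2024-01-31",
    #                                            category="neuroscience")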

    def search_by_interval(
        self,
        interval: str = "1",
        cursor: int = 0,
        format: str = "json"
    ) -> Dict:
        """
        Retrieve preprints from a specific time interval.

        Args:
            interval: Number of days back to search
            cursor: Pagination cursor (0 for first page, then use returned cursor)
            format: Response format ('json' or 'xml')

        Returns:
            Dictionary with collection and pagination info
        """
        endpoint = f"pubs/biorxiv/{interval}/{cursor}/{format}"
        return self._make_request(endpoint)
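
    # Example (illustrative; cursor-based pagination):
    #     page = searcher.search_by_interval(interval="7", cursor=0)
    #     papers = page.get("collection", [])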

    def get_paper_details(self, doi: str) -> Dict:
        """
        Get detailed information about a specific paper by DOI.

        Args:
            doi: The DOI of the paper (e.g., '10.1101/2021.01.01.123456')

        Returns:
            Dictionary with paper details (empty dict if not found)
        """
        # Clean DOI if a full URL was provided
        if 'doi.org' in doi:
            doi = doi.split('doi.org/')[-1]

        self._log(f"Fetching details for DOI: {doi}")
        endpoint = f"details/biorxiv/{doi}"

        data = self._make_request(endpoint)

        if "collection" in data and len(data["collection"]) > 0:
            return data["collection"][0]

        return {}
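
    # Example (illustrative):
    #     details = searcher.get_paper_details("10.1101/2021.01.01.123456")
    #     print(details.get("title", "not found"))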

    def search_by_author(
        self,
        author_name: str,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None
    ) -> List[Dict]:
        """
        Search for papers by author name.

        Args:
            author_name: Author name to search for
            start_date: Optional start date (YYYY-MM-DD); defaults to 3 years ago
            end_date: Optional end date (YYYY-MM-DD); defaults to today

        Returns:
            List of matching preprints
        """
        # Default to the last 3 years, without clobbering an explicit end date
        if not start_date:
            start_date = (datetime.now() - timedelta(days=1095)).strftime("%Y-%m-%d")
        if not end_date:
            end_date = datetime.now().strftime("%Y-%m-%d")

        self._log(f"Searching for author: {author_name}")

        # Get all papers in the date range
        papers = self.search_by_date_range(start_date, end_date)

        # Filter by author name (case-insensitive substring match)
        author_lower = author_name.lower()
        matching_papers = []

        for paper in papers:
            authors = paper.get("authors", "")
            if author_lower in authors.lower():
                matching_papers.append(paper)

        self._log(f"Found {len(matching_papers)} papers by {author_name}")
        return matching_papers
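
    # Example (illustrative; matches on a substring of the authors field):
    #     papers = searcher.search_by_author("Smith", start_date="2024-01-01")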

    def search_by_keywords(
        self,
        keywords: List[str],
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        category: Optional[str] = None,
        search_fields: Optional[List[str]] = None
    ) -> List[Dict]:
        """
        Search for papers containing specific keywords.

        Args:
            keywords: List of keywords to search for
            start_date: Optional start date (YYYY-MM-DD); defaults to 1 year ago
            end_date: Optional end date (YYYY-MM-DD); defaults to today
            category: Optional category filter
            search_fields: Fields to search in (title, abstract, authors);
                defaults to title and abstract

        Returns:
            List of matching preprints
        """
        # Avoid a mutable default argument
        if search_fields is None:
            search_fields = ["title", "abstract"]

        # Default to the last year, without clobbering an explicit end date
        if not start_date:
            start_date = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
        if not end_date:
            end_date = datetime.now().strftime("%Y-%m-%d")

        self._log(f"Searching for keywords: {keywords}")

        # Get all papers in the date range
        papers = self.search_by_date_range(start_date, end_date, category)

        # Filter by keywords (case-insensitive; any keyword matches)
        matching_papers = []
        keywords_lower = [k.lower() for k in keywords]

        for paper in papers:
            # Build search text from the specified fields
            search_text = ""
            for field in search_fields:
                if field in paper:
                    search_text += " " + str(paper[field]).lower()

            # Check if any keyword matches
            if any(keyword in search_text for keyword in keywords_lower):
                matching_papers.append(paper)

        self._log(f"Found {len(matching_papers)} papers matching keywords")
        return matching_papers
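
    # Example (illustrative):
    #     hits = searcher.search_by_keywords(["CRISPR", "base editing"],
    #                                        category="genomics",
    #                                        search_fields=["title"])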

    def download_pdf(self, doi: str, output_path: str) -> bool:
        """
        Download the PDF of a paper.

        Args:
            doi: The DOI of the paper
            output_path: Path where the PDF should be saved

        Returns:
            True if download successful, False otherwise
        """
        # Clean DOI if a full URL was provided
        if 'doi.org' in doi:
            doi = doi.split('doi.org/')[-1]

        # Construct the PDF URL. Note: this assumes version 1 of the
        # preprint; revised preprints live at v2, v3, etc.
        pdf_url = f"https://www.biorxiv.org/content/{doi}v1.full.pdf"

        self._log(f"Downloading PDF from: {pdf_url}")

        try:
            response = self.session.get(pdf_url, timeout=60)
            response.raise_for_status()

            with open(output_path, 'wb') as f:
                f.write(response.content)

            self._log(f"PDF saved to: {output_path}")
            return True
        except Exception as e:
            self._log(f"Error downloading PDF: {e}")
            return False
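
    # Example (illustrative; writes into the current directory):
    #     ok = searcher.download_pdf("10.1101/2021.01.01.123456", "paper.pdf")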

    def format_result(self, paper: Dict, include_abstract: bool = True) -> Dict:
        """
        Format a paper result with standardized fields.

        Args:
            paper: Raw paper dictionary from the API
            include_abstract: Whether to include the abstract

        Returns:
            Formatted paper dictionary
        """
        result = {
            "doi": paper.get("doi", ""),
            "title": paper.get("title", ""),
            "authors": paper.get("authors", ""),
            "author_corresponding": paper.get("author_corresponding", ""),
            "author_corresponding_institution": paper.get("author_corresponding_institution", ""),
            "date": paper.get("date", ""),
            "version": paper.get("version", ""),
            "type": paper.get("type", ""),
            "license": paper.get("license", ""),
            "category": paper.get("category", ""),
            "jatsxml": paper.get("jatsxml", ""),
            "published": paper.get("published", "")
        }

        if include_abstract:
            result["abstract"] = paper.get("abstract", "")

        # Add PDF and HTML URLs
        if result["doi"]:
            result["pdf_url"] = f"https://www.biorxiv.org/content/{result['doi']}v{result['version']}.full.pdf"
            result["html_url"] = f"https://www.biorxiv.org/content/{result['doi']}v{result['version']}"

        return result
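
    # Example (illustrative):
    #     formatted = searcher.format_result(paper, include_abstract=False)
    #     print(formatted["pdf_url"])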


def main():
    """Command-line interface for bioRxiv search."""
    parser = argparse.ArgumentParser(
        description="Search bioRxiv preprints efficiently",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Enable verbose logging")

    # Search type arguments
    search_group = parser.add_argument_group("Search options")
    search_group.add_argument("--keywords", "-k", nargs="+",
                              help="Keywords to search for")
    search_group.add_argument("--author", "-a",
                              help="Author name to search for")
    search_group.add_argument("--doi",
                              help="Get details for a specific DOI")

    # Date range arguments
    date_group = parser.add_argument_group("Date range options")
    date_group.add_argument("--start-date",
                            help="Start date (YYYY-MM-DD)")
    date_group.add_argument("--end-date",
                            help="End date (YYYY-MM-DD)")
    date_group.add_argument("--days-back", type=int,
                            help="Search N days back from today")

    # Filter arguments
    filter_group = parser.add_argument_group("Filter options")
    filter_group.add_argument("--category", "-c",
                              choices=BioRxivSearcher.CATEGORIES,
                              help="Filter by category")
    filter_group.add_argument("--search-fields", nargs="+",
                              default=["title", "abstract"],
                              choices=["title", "abstract", "authors"],
                              help="Fields to search in for keywords")

    # Output arguments
    output_group = parser.add_argument_group("Output options")
    output_group.add_argument("--output", "-o",
                              help="Output file (default: stdout)")
    output_group.add_argument("--include-abstract", action=argparse.BooleanOptionalAction,
                              default=True,
                              help="Include abstracts in output (on by default)")
    output_group.add_argument("--download-pdf",
                              help="Download PDF to the specified path (requires --doi)")
    output_group.add_argument("--limit", type=int,
                              help="Limit number of results")

    args = parser.parse_args()

    # Initialize searcher
    searcher = BioRxivSearcher(verbose=args.verbose)

    # Handle date range
    end_date = args.end_date or datetime.now().strftime("%Y-%m-%d")
    if args.days_back:
        start_date = (datetime.now() - timedelta(days=args.days_back)).strftime("%Y-%m-%d")
    else:
        start_date = args.start_date

    # Execute search based on arguments
    results = []

    if args.download_pdf:
        if not args.doi:
            print("Error: --doi required with --download-pdf", file=sys.stderr)
            return 1

        success = searcher.download_pdf(args.doi, args.download_pdf)
        return 0 if success else 1

    elif args.doi:
        # Get a specific paper by DOI
        paper = searcher.get_paper_details(args.doi)
        if paper:
            results = [paper]

    elif args.author:
        # Search by author
        results = searcher.search_by_author(
            args.author, start_date, end_date
        )

    elif args.keywords:
        # Search by keywords
        if not start_date:
            print("Error: --start-date or --days-back required for keyword search",
                  file=sys.stderr)
            return 1

        results = searcher.search_by_keywords(
            args.keywords, start_date, end_date,
            args.category, args.search_fields
        )

    else:
        # Plain date-range search
        if not start_date:
            print("Error: must specify search criteria (--keywords, --author, --doi) "
                  "or a date range (--start-date/--days-back)",
                  file=sys.stderr)
            return 1

        results = searcher.search_by_date_range(
            start_date, end_date, args.category
        )

    # Apply limit
    if args.limit:
        results = results[:args.limit]

    # Format results
    formatted_results = [
        searcher.format_result(paper, args.include_abstract)
        for paper in results
    ]

    # Output results
    output_data = {
        "query": {
            "keywords": args.keywords,
            "author": args.author,
            "doi": args.doi,
            "start_date": start_date,
            "end_date": end_date,
            "category": args.category
        },
        "result_count": len(formatted_results),
        "results": formatted_results
    }
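
    # The emitted JSON has this shape (abridged):
    #   {"query": {...}, "result_count": 2,
    #    "results": [{"doi": "...", "title": "...", ...}]}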

    output_json = json.dumps(output_data, indent=2)

    if args.output:
        with open(args.output, 'w') as f:
            f.write(output_json)
        print(f"Results written to {args.output}", file=sys.stderr)
    else:
        print(output_json)

    return 0


if __name__ == "__main__":
    sys.exit(main())