Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions

View File

@@ -0,0 +1,337 @@
#!/usr/bin/env python3
"""
OpenAlex API Client with rate limiting and error handling.
Provides a robust client for interacting with the OpenAlex API with:
- Automatic rate limiting (polite pool: 10 req/sec)
- Exponential backoff retry logic
- Pagination support
- Batch operations support
"""
import time
import requests
from typing import Dict, List, Optional, Any
from urllib.parse import urljoin
class OpenAlexClient:
    """Client for the OpenAlex API with rate limiting and error handling.

    Features:
    - Client-side rate limiting (polite pool: 10 req/sec by default)
    - Exponential-backoff retries for rate-limit/server errors and
      transient network failures
    - Pagination, batched ID lookups, random sampling and group-by helpers
    """

    BASE_URL = "https://api.openalex.org"

    def __init__(self, email: Optional[str] = None, requests_per_second: int = 10):
        """
        Initialize OpenAlex client.

        Args:
            email: Email for polite pool (10x rate limit boost)
            requests_per_second: Max requests per second (default: 10 for polite pool)
        """
        self.email = email
        self.requests_per_second = requests_per_second
        # Minimum spacing between consecutive requests, in seconds.
        self.min_delay = 1.0 / requests_per_second
        self.last_request_time = 0.0

    def _rate_limit(self):
        """Sleep just long enough to keep under the configured request rate."""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.min_delay:
            time.sleep(self.min_delay - elapsed)
        self.last_request_time = time.time()

    def _make_request(
        self,
        endpoint: str,
        params: Optional[Dict] = None,
        max_retries: int = 5
    ) -> Dict[str, Any]:
        """
        Make an API request with exponential-backoff retry logic.

        Args:
            endpoint: API endpoint (e.g., '/works', '/authors')
            params: Query parameters
            max_retries: Maximum number of retry attempts

        Returns:
            JSON response as dictionary

        Raises:
            requests.HTTPError: For non-retryable client errors (e.g. 400/404).
            Exception: If all retry attempts are exhausted.
        """
        if params is None:
            params = {}
        # Add email to params for polite pool (higher rate limits).
        if self.email:
            params['mailto'] = self.email
        url = urljoin(self.BASE_URL, endpoint)
        for attempt in range(max_retries):
            try:
                self._rate_limit()
                response = requests.get(url, params=params, timeout=30)
                if response.status_code == 200:
                    return response.json()
                elif response.status_code in (403, 429):
                    # Rate limited. OpenAlex signals this with 429; 403 is
                    # kept for backward compatibility. Back off exponentially.
                    wait_time = 2 ** attempt
                    print(f"Rate limited. Waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
                elif response.status_code >= 500:
                    # Server error - retry with backoff.
                    wait_time = 2 ** attempt
                    print(f"Server error. Waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
                else:
                    # Other client error - don't retry.
                    response.raise_for_status()
            except (requests.exceptions.Timeout,
                    requests.exceptions.ConnectionError):
                # Transient network failure: retry with backoff, re-raise on
                # the final attempt so the caller sees the underlying error.
                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt
                    print(f"Request timeout. Waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
                else:
                    raise
        raise Exception(f"Failed after {max_retries} retries")

    def search_works(
        self,
        search: Optional[str] = None,
        filter_params: Optional[Dict] = None,
        per_page: int = 200,
        page: int = 1,
        sort: Optional[str] = None,
        select: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """
        Search works with optional full-text query, filters and sorting.

        Args:
            search: Full-text search query
            filter_params: Dictionary of filter parameters
            per_page: Results per page (capped at the API max of 200)
            page: Page number
            sort: Sort parameter (e.g., 'cited_by_count:desc')
            select: List of fields to return

        Returns:
            API response with 'meta' and 'results'
        """
        params = {
            'per-page': min(per_page, 200),
            'page': page
        }
        if search:
            params['search'] = search
        if filter_params:
            # OpenAlex expects a comma-separated 'key:value' filter string.
            params['filter'] = ','.join(f"{k}:{v}" for k, v in filter_params.items())
        if sort:
            params['sort'] = sort
        if select:
            params['select'] = ','.join(select)
        return self._make_request('/works', params)

    def get_entity(self, entity_type: str, entity_id: str) -> Dict[str, Any]:
        """
        Get a single entity by ID.

        Args:
            entity_type: Type of entity ('works', 'authors', 'institutions', etc.)
            entity_id: OpenAlex ID or external ID (DOI, ORCID, etc.)

        Returns:
            Entity object
        """
        return self._make_request(f"/{entity_type}/{entity_id}")

    def batch_lookup(
        self,
        entity_type: str,
        ids: List[str],
        id_field: str = 'openalex_id'
    ) -> List[Dict[str, Any]]:
        """
        Look up multiple entities by ID efficiently using OR filters.

        Args:
            entity_type: Type of entity ('works', 'authors', etc.)
            ids: List of IDs (processed in batches of 50)
            id_field: ID field name ('openalex_id', 'doi', 'orcid', etc.)

        Returns:
            List of entity objects
        """
        all_results = []
        # Process in batches of 50 (the API's per-filter OR limit).
        for i in range(0, len(ids), 50):
            batch = ids[i:i + 50]
            params = {
                # '|' expresses OR between the batched IDs.
                'filter': f"{id_field}:{'|'.join(batch)}",
                'per-page': 50
            }
            response = self._make_request(f"/{entity_type}", params)
            all_results.extend(response.get('results', []))
        return all_results

    def paginate_all(
        self,
        endpoint: str,
        params: Optional[Dict] = None,
        max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Paginate through all results for an endpoint.

        Args:
            endpoint: API endpoint
            params: Query parameters
            max_results: Maximum number of results to retrieve (None for all)

        Returns:
            List of all results
        """
        if params is None:
            params = {}
        params['per-page'] = 200  # Use maximum page size
        params['page'] = 1
        all_results = []
        while True:
            response = self._make_request(endpoint, params)
            results = response.get('results', [])
            if not results:
                # Defensive: an empty page means nothing more to fetch, even
                # if meta['count'] disagrees - avoids an infinite loop on
                # inconsistent server metadata.
                break
            all_results.extend(results)
            # Stop once we've collected max_results.
            if max_results and len(all_results) >= max_results:
                return all_results[:max_results]
            # Stop when we've seen everything the server reports.
            meta = response.get('meta', {})
            if len(all_results) >= meta.get('count', 0):
                break
            params['page'] += 1
        return all_results

    def sample_works(
        self,
        sample_size: int,
        seed: Optional[int] = None,
        filter_params: Optional[Dict] = None
    ) -> List[Dict[str, Any]]:
        """
        Get a random sample of works.

        Args:
            sample_size: Number of samples to retrieve
            seed: Random seed for reproducibility
            filter_params: Optional filters to apply

        Returns:
            List of sampled works
        """
        params = {
            'sample': min(sample_size, 10000),  # API limit per request
            'per-page': 200
        }
        if seed is not None:
            params['seed'] = seed
        if filter_params:
            params['filter'] = ','.join(f"{k}:{v}" for k, v in filter_params.items())
        if sample_size <= 10000:
            response = self._make_request('/works', params)
            return response.get('results', [])
        # For larger samples, issue multiple requests with varying seeds and
        # deduplicate by work ID.
        all_samples: List[Dict[str, Any]] = []
        seen_ids = set()
        for i in range((sample_size // 10000) + 1):
            # Compare against None so a caller-supplied seed of 0 is honored.
            current_seed = seed + i if seed is not None else i
            params['seed'] = current_seed
            params['sample'] = min(10000, sample_size - len(all_samples))
            response = self._make_request('/works', params)
            for result in response.get('results', []):
                work_id = result.get('id')
                if work_id not in seen_ids:
                    seen_ids.add(work_id)
                    all_samples.append(result)
            if len(all_samples) >= sample_size:
                break
        return all_samples[:sample_size]

    def group_by(
        self,
        entity_type: str,
        group_field: str,
        filter_params: Optional[Dict] = None
    ) -> List[Dict[str, Any]]:
        """
        Aggregate results by a field.

        Args:
            entity_type: Type of entity ('works', 'authors', etc.)
            group_field: Field to group by
            filter_params: Optional filters

        Returns:
            List of grouped results with counts
        """
        params = {'group_by': group_field}
        if filter_params:
            params['filter'] = ','.join(f"{k}:{v}" for k, v in filter_params.items())
        response = self._make_request(f"/{entity_type}", params)
        return response.get('group_by', [])
if __name__ == "__main__":
    # Demo usage: requires network access to the live OpenAlex API.
    client = OpenAlexClient(email="your-email@example.com")

    # Search for works about machine learning published in 2023.
    results = client.search_works(
        search="machine learning",
        filter_params={"publication_year": "2023"},
        per_page=10,
    )
    print(f"Found {results['meta']['count']} works")
    # Emit one title per line, identical to printing inside a loop.
    print("\n".join(f"- {work['title']}" for work in results['results']))

View File

@@ -0,0 +1,306 @@
#!/usr/bin/env python3
"""
Helper functions for common OpenAlex query patterns.
Provides high-level functions for typical research queries.
"""
from typing import List, Dict, Optional, Any
from openalex_client import OpenAlexClient
def find_author_works(
    author_name: str,
    client: "OpenAlexClient",
    limit: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Retrieve the works of an author using the two-step lookup pattern.

    First resolves the author's OpenAlex ID via a name search, then fetches
    the works filtered by that ID.

    Args:
        author_name: Author name to search for
        client: OpenAlexClient instance
        limit: Maximum number of works to return

    Returns:
        List of works by the author (empty if no author matched)
    """
    # Step 1: resolve the author ID from the name.
    search_result = client._make_request(
        '/authors',
        params={'search': author_name, 'per-page': 1}
    )
    matches = search_result.get('results')
    if not matches:
        print(f"No author found for: {author_name}")
        return []
    author = matches[0]
    author_id = author['id'].split('/')[-1]  # Bare ID is the last URL segment.
    print(f"Found author: {author['display_name']} (ID: {author_id})")
    # Step 2: fetch works filtered by that author ID.
    query = {
        'filter': f'authorships.author.id:{author_id}',
        'per-page': 200,
    }
    if limit and limit <= 200:
        # One request covers the whole limit.
        query['per-page'] = limit
        return client._make_request('/works', query).get('results', [])
    # Otherwise walk through all pages (up to limit, or everything if None).
    return client.paginate_all('/works', query, max_results=limit)
def find_institution_works(
    institution_name: str,
    client: "OpenAlexClient",
    limit: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Retrieve the works of an institution using the two-step lookup pattern.

    First resolves the institution's OpenAlex ID via a name search, then
    fetches the works filtered by that ID.

    Args:
        institution_name: Institution name to search for
        client: OpenAlexClient instance
        limit: Maximum number of works to return

    Returns:
        List of works from the institution (empty if no institution matched)
    """
    # Step 1: resolve the institution ID from the name.
    search_result = client._make_request(
        '/institutions',
        params={'search': institution_name, 'per-page': 1}
    )
    matches = search_result.get('results')
    if not matches:
        print(f"No institution found for: {institution_name}")
        return []
    institution = matches[0]
    inst_id = institution['id'].split('/')[-1]  # Bare ID is the last URL segment.
    print(f"Found institution: {institution['display_name']} (ID: {inst_id})")
    # Step 2: fetch works filtered by that institution ID.
    query = {
        'filter': f'authorships.institutions.id:{inst_id}',
        'per-page': 200,
    }
    if limit and limit <= 200:
        # One request covers the whole limit.
        query['per-page'] = limit
        return client._make_request('/works', query).get('results', [])
    # Otherwise walk through all pages (up to limit, or everything if None).
    return client.paginate_all('/works', query, max_results=limit)
def find_highly_cited_recent_papers(
    topic: Optional[str] = None,
    years: str = ">2020",
    client: "Optional[OpenAlexClient]" = None,
    limit: int = 100
) -> List[Dict[str, Any]]:
    """
    Find highly cited recent papers, optionally filtered by topic.

    Args:
        topic: Optional search term for topic filtering
        years: Year filter (e.g., ">2020", "2020-2023")
        client: OpenAlexClient instance (a default one is created if omitted)
        limit: Maximum number of papers to return

    Returns:
        List of papers sorted by citation count, descending
    """
    if client is None:
        client = OpenAlexClient()
    query = {
        'filter': f'publication_year:{years}',
        'sort': 'cited_by_count:desc',
        'per-page': min(limit, 200),
    }
    if topic:
        query['search'] = topic
    # One request suffices for small limits; otherwise paginate.
    if limit <= 200:
        return client._make_request('/works', query).get('results', [])
    return client.paginate_all('/works', query, max_results=limit)
def get_open_access_papers(
    search_term: str,
    client: "OpenAlexClient",
    oa_status: str = "any",  # "any", "gold", "green", "hybrid", "bronze"
    limit: int = 100
) -> List[Dict[str, Any]]:
    """
    Find open access papers on a topic.

    Args:
        search_term: Search query
        client: OpenAlexClient instance
        oa_status: Type of OA ("any" for is_oa:true, or specific status)
        limit: Maximum number of papers to return

    Returns:
        List of open access papers
    """
    # "any" maps to the boolean OA flag; otherwise filter on the exact status.
    oa_filter = "is_oa:true" if oa_status == "any" else f"open_access.oa_status:{oa_status}"
    query = {
        'search': search_term,
        'filter': oa_filter,
        'per-page': min(limit, 200),
    }
    # One request suffices for small limits; otherwise paginate.
    if limit <= 200:
        return client._make_request('/works', query).get('results', [])
    return client.paginate_all('/works', query, max_results=limit)
def get_publication_trends(
    search_term: Optional[str] = None,
    filter_params: Optional[Dict] = None,
    client: "Optional[OpenAlexClient]" = None
) -> List[Dict[str, Any]]:
    """
    Get publication counts grouped by year.

    Args:
        search_term: Optional search query
        filter_params: Optional additional filters
        client: OpenAlexClient instance (a default one is created if omitted)

    Returns:
        List of {year, count} group dictionaries
    """
    if client is None:
        client = OpenAlexClient()
    query = {'group_by': 'publication_year'}
    if search_term:
        query['search'] = search_term
    if filter_params:
        # Filters are encoded as a comma-separated 'key:value' string.
        query['filter'] = ','.join(f"{k}:{v}" for k, v in filter_params.items())
    response = client._make_request('/works', query)
    return response.get('group_by', [])
def analyze_research_output(
    entity_type: str,  # 'author' or 'institution'
    entity_name: str,
    client: "OpenAlexClient",
    years: str = ">2020"
) -> Dict[str, Any]:
    """
    Analyze research output for an author or institution.

    Resolves the entity by name, then gathers total work counts, yearly
    trends, top topics and the open-access share over the given year range.

    Args:
        entity_type: 'author' or 'institution'
        entity_name: Name to search for
        client: OpenAlexClient instance
        years: Year filter

    Returns:
        Dictionary with analysis results, or {'error': ...} if no match
    """
    # Pick the lookup endpoint and works-filter key for the entity kind.
    if entity_type == 'author':
        endpoint, filter_prefix = '/authors', 'authorships.author.id'
    else:
        endpoint, filter_prefix = '/institutions', 'authorships.institutions.id'
    # Step 1: resolve the entity by name search.
    lookup = client._make_request(
        endpoint,
        params={'search': entity_name, 'per-page': 1}
    )
    hits = lookup.get('results')
    if not hits:
        return {'error': f'No {entity_type} found for: {entity_name}'}
    entity = hits[0]
    entity_id = entity['id'].split('/')[-1]  # Bare ID is the last URL segment.
    # Step 2: gather statistics, all scoped to the entity and year range.
    base_filter = {filter_prefix: entity_id, 'publication_year': years}
    # Total works: only the count is needed, so fetch a single result.
    total_works = client.search_works(
        filter_params=base_filter,
        per_page=1
    )['meta']['count']
    # Publication counts grouped by year.
    trends = client.group_by(
        'works',
        'publication_year',
        filter_params=dict(base_filter)
    )
    # Most common topics.
    topics = client.group_by(
        'works',
        'topics.id',
        filter_params=base_filter
    )
    # Open-access share of the works.
    oa_count = client.search_works(
        filter_params={**base_filter, 'is_oa': 'true'},
        per_page=1
    )['meta']['count']
    oa_percentage = (oa_count / total_works * 100) if total_works > 0 else 0
    return {
        'entity_name': entity['display_name'],
        'entity_id': entity_id,
        'total_works': total_works,
        'open_access_works': oa_count,
        'open_access_percentage': round(oa_percentage, 1),
        'publications_by_year': trends[:10],  # First 10 year buckets
        'top_topics': topics[:10]  # Top 10 topics
    }
if __name__ == "__main__":
    # Demo script: exercises the helpers against the live OpenAlex API,
    # so it needs network access when run directly.
    import json

    client = OpenAlexClient(email="your-email@example.com")

    # Two-step author lookup followed by a works fetch.
    print("\n=== Finding works by author ===")
    works = find_author_works("Einstein", client, limit=5)
    print(f"Found {len(works)} works")

    # Full research-output analysis for an institution.
    print("\n=== Analyzing institution research output ===")
    analysis = analyze_research_output('institution', 'MIT', client)
    print(json.dumps(analysis, indent=2))