Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions

View File

@@ -0,0 +1,337 @@
#!/usr/bin/env python3
"""
OpenAlex API Client with rate limiting and error handling.
Provides a robust client for interacting with the OpenAlex API with:
- Automatic rate limiting (polite pool: 10 req/sec)
- Exponential backoff retry logic
- Pagination support
- Batch operations support
"""
import time
import requests
from typing import Dict, List, Optional, Any
from urllib.parse import urljoin
class OpenAlexClient:
    """Client for the OpenAlex API with rate limiting and error handling.

    Features:
    - Client-side rate limiting (polite pool: 10 req/sec by default)
    - Exponential-backoff retries for rate-limit/server errors and
      transient network failures
    - Pagination, batched ID lookups, random sampling and group-by helpers
    """

    BASE_URL = "https://api.openalex.org"

    def __init__(self, email: Optional[str] = None, requests_per_second: int = 10):
        """
        Initialize OpenAlex client.

        Args:
            email: Email for polite pool (10x rate limit boost)
            requests_per_second: Max requests per second (default: 10 for polite pool)
        """
        self.email = email
        self.requests_per_second = requests_per_second
        # Minimum spacing between consecutive requests, in seconds.
        self.min_delay = 1.0 / requests_per_second
        self.last_request_time = 0.0

    def _rate_limit(self):
        """Sleep just long enough to keep under the configured request rate."""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.min_delay:
            time.sleep(self.min_delay - elapsed)
        self.last_request_time = time.time()

    def _make_request(
        self,
        endpoint: str,
        params: Optional[Dict] = None,
        max_retries: int = 5
    ) -> Dict[str, Any]:
        """
        Make an API request with exponential-backoff retry logic.

        Args:
            endpoint: API endpoint (e.g., '/works', '/authors')
            params: Query parameters
            max_retries: Maximum number of retry attempts

        Returns:
            JSON response as dictionary

        Raises:
            requests.HTTPError: For non-retryable client errors (e.g. 400/404).
            Exception: If all retry attempts are exhausted.
        """
        if params is None:
            params = {}
        # Add email to params for polite pool (higher rate limits).
        if self.email:
            params['mailto'] = self.email
        url = urljoin(self.BASE_URL, endpoint)
        for attempt in range(max_retries):
            try:
                self._rate_limit()
                response = requests.get(url, params=params, timeout=30)
                if response.status_code == 200:
                    return response.json()
                elif response.status_code in (403, 429):
                    # Rate limited. OpenAlex signals this with 429; 403 is
                    # kept for backward compatibility. Back off exponentially.
                    wait_time = 2 ** attempt
                    print(f"Rate limited. Waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
                elif response.status_code >= 500:
                    # Server error - retry with backoff.
                    wait_time = 2 ** attempt
                    print(f"Server error. Waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
                else:
                    # Other client error - don't retry.
                    response.raise_for_status()
            except (requests.exceptions.Timeout,
                    requests.exceptions.ConnectionError):
                # Transient network failure: retry with backoff, re-raise on
                # the final attempt so the caller sees the underlying error.
                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt
                    print(f"Request timeout. Waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
                else:
                    raise
        raise Exception(f"Failed after {max_retries} retries")

    def search_works(
        self,
        search: Optional[str] = None,
        filter_params: Optional[Dict] = None,
        per_page: int = 200,
        page: int = 1,
        sort: Optional[str] = None,
        select: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """
        Search works with optional full-text query, filters and sorting.

        Args:
            search: Full-text search query
            filter_params: Dictionary of filter parameters
            per_page: Results per page (capped at the API max of 200)
            page: Page number
            sort: Sort parameter (e.g., 'cited_by_count:desc')
            select: List of fields to return

        Returns:
            API response with 'meta' and 'results'
        """
        params = {
            'per-page': min(per_page, 200),
            'page': page
        }
        if search:
            params['search'] = search
        if filter_params:
            # OpenAlex expects a comma-separated 'key:value' filter string.
            params['filter'] = ','.join(f"{k}:{v}" for k, v in filter_params.items())
        if sort:
            params['sort'] = sort
        if select:
            params['select'] = ','.join(select)
        return self._make_request('/works', params)

    def get_entity(self, entity_type: str, entity_id: str) -> Dict[str, Any]:
        """
        Get a single entity by ID.

        Args:
            entity_type: Type of entity ('works', 'authors', 'institutions', etc.)
            entity_id: OpenAlex ID or external ID (DOI, ORCID, etc.)

        Returns:
            Entity object
        """
        return self._make_request(f"/{entity_type}/{entity_id}")

    def batch_lookup(
        self,
        entity_type: str,
        ids: List[str],
        id_field: str = 'openalex_id'
    ) -> List[Dict[str, Any]]:
        """
        Look up multiple entities by ID efficiently using OR filters.

        Args:
            entity_type: Type of entity ('works', 'authors', etc.)
            ids: List of IDs (processed in batches of 50)
            id_field: ID field name ('openalex_id', 'doi', 'orcid', etc.)

        Returns:
            List of entity objects
        """
        all_results = []
        # Process in batches of 50 (the API's per-filter OR limit).
        for i in range(0, len(ids), 50):
            batch = ids[i:i + 50]
            params = {
                # '|' expresses OR between the batched IDs.
                'filter': f"{id_field}:{'|'.join(batch)}",
                'per-page': 50
            }
            response = self._make_request(f"/{entity_type}", params)
            all_results.extend(response.get('results', []))
        return all_results

    def paginate_all(
        self,
        endpoint: str,
        params: Optional[Dict] = None,
        max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Paginate through all results for an endpoint.

        Args:
            endpoint: API endpoint
            params: Query parameters
            max_results: Maximum number of results to retrieve (None for all)

        Returns:
            List of all results
        """
        if params is None:
            params = {}
        params['per-page'] = 200  # Use maximum page size
        params['page'] = 1
        all_results = []
        while True:
            response = self._make_request(endpoint, params)
            results = response.get('results', [])
            if not results:
                # Defensive: an empty page means nothing more to fetch, even
                # if meta['count'] disagrees - avoids an infinite loop on
                # inconsistent server metadata.
                break
            all_results.extend(results)
            # Stop once we've collected max_results.
            if max_results and len(all_results) >= max_results:
                return all_results[:max_results]
            # Stop when we've seen everything the server reports.
            meta = response.get('meta', {})
            if len(all_results) >= meta.get('count', 0):
                break
            params['page'] += 1
        return all_results

    def sample_works(
        self,
        sample_size: int,
        seed: Optional[int] = None,
        filter_params: Optional[Dict] = None
    ) -> List[Dict[str, Any]]:
        """
        Get a random sample of works.

        Args:
            sample_size: Number of samples to retrieve
            seed: Random seed for reproducibility
            filter_params: Optional filters to apply

        Returns:
            List of sampled works
        """
        params = {
            'sample': min(sample_size, 10000),  # API limit per request
            'per-page': 200
        }
        if seed is not None:
            params['seed'] = seed
        if filter_params:
            params['filter'] = ','.join(f"{k}:{v}" for k, v in filter_params.items())
        if sample_size <= 10000:
            response = self._make_request('/works', params)
            return response.get('results', [])
        # For larger samples, issue multiple requests with varying seeds and
        # deduplicate by work ID.
        all_samples: List[Dict[str, Any]] = []
        seen_ids = set()
        for i in range((sample_size // 10000) + 1):
            # Compare against None so a caller-supplied seed of 0 is honored.
            current_seed = seed + i if seed is not None else i
            params['seed'] = current_seed
            params['sample'] = min(10000, sample_size - len(all_samples))
            response = self._make_request('/works', params)
            for result in response.get('results', []):
                work_id = result.get('id')
                if work_id not in seen_ids:
                    seen_ids.add(work_id)
                    all_samples.append(result)
            if len(all_samples) >= sample_size:
                break
        return all_samples[:sample_size]

    def group_by(
        self,
        entity_type: str,
        group_field: str,
        filter_params: Optional[Dict] = None
    ) -> List[Dict[str, Any]]:
        """
        Aggregate results by a field.

        Args:
            entity_type: Type of entity ('works', 'authors', etc.)
            group_field: Field to group by
            filter_params: Optional filters

        Returns:
            List of grouped results with counts
        """
        params = {'group_by': group_field}
        if filter_params:
            params['filter'] = ','.join(f"{k}:{v}" for k, v in filter_params.items())
        response = self._make_request(f"/{entity_type}", params)
        return response.get('group_by', [])
if __name__ == "__main__":
    # Demo usage: requires network access to the live OpenAlex API.
    client = OpenAlexClient(email="your-email@example.com")

    # Search for works about machine learning published in 2023.
    results = client.search_works(
        search="machine learning",
        filter_params={"publication_year": "2023"},
        per_page=10,
    )
    print(f"Found {results['meta']['count']} works")
    # Emit one title per line, identical to printing inside a loop.
    print("\n".join(f"- {work['title']}" for work in results['results']))

View File

@@ -0,0 +1,306 @@
#!/usr/bin/env python3
"""
Helper functions for common OpenAlex query patterns.
Provides high-level functions for typical research queries.
"""
from typing import List, Dict, Optional, Any
from openalex_client import OpenAlexClient
def find_author_works(
    author_name: str,
    client: "OpenAlexClient",
    limit: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Retrieve the works of an author using the two-step lookup pattern.

    First resolves the author's OpenAlex ID via a name search, then fetches
    the works filtered by that ID.

    Args:
        author_name: Author name to search for
        client: OpenAlexClient instance
        limit: Maximum number of works to return

    Returns:
        List of works by the author (empty if no author matched)
    """
    # Step 1: resolve the author ID from the name.
    search_result = client._make_request(
        '/authors',
        params={'search': author_name, 'per-page': 1}
    )
    matches = search_result.get('results')
    if not matches:
        print(f"No author found for: {author_name}")
        return []
    author = matches[0]
    author_id = author['id'].split('/')[-1]  # Bare ID is the last URL segment.
    print(f"Found author: {author['display_name']} (ID: {author_id})")
    # Step 2: fetch works filtered by that author ID.
    query = {
        'filter': f'authorships.author.id:{author_id}',
        'per-page': 200,
    }
    if limit and limit <= 200:
        # One request covers the whole limit.
        query['per-page'] = limit
        return client._make_request('/works', query).get('results', [])
    # Otherwise walk through all pages (up to limit, or everything if None).
    return client.paginate_all('/works', query, max_results=limit)
def find_institution_works(
    institution_name: str,
    client: "OpenAlexClient",
    limit: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Retrieve the works of an institution using the two-step lookup pattern.

    First resolves the institution's OpenAlex ID via a name search, then
    fetches the works filtered by that ID.

    Args:
        institution_name: Institution name to search for
        client: OpenAlexClient instance
        limit: Maximum number of works to return

    Returns:
        List of works from the institution (empty if no institution matched)
    """
    # Step 1: resolve the institution ID from the name.
    search_result = client._make_request(
        '/institutions',
        params={'search': institution_name, 'per-page': 1}
    )
    matches = search_result.get('results')
    if not matches:
        print(f"No institution found for: {institution_name}")
        return []
    institution = matches[0]
    inst_id = institution['id'].split('/')[-1]  # Bare ID is the last URL segment.
    print(f"Found institution: {institution['display_name']} (ID: {inst_id})")
    # Step 2: fetch works filtered by that institution ID.
    query = {
        'filter': f'authorships.institutions.id:{inst_id}',
        'per-page': 200,
    }
    if limit and limit <= 200:
        # One request covers the whole limit.
        query['per-page'] = limit
        return client._make_request('/works', query).get('results', [])
    # Otherwise walk through all pages (up to limit, or everything if None).
    return client.paginate_all('/works', query, max_results=limit)
def find_highly_cited_recent_papers(
    topic: Optional[str] = None,
    years: str = ">2020",
    client: "Optional[OpenAlexClient]" = None,
    limit: int = 100
) -> List[Dict[str, Any]]:
    """
    Find highly cited recent papers, optionally filtered by topic.

    Args:
        topic: Optional search term for topic filtering
        years: Year filter (e.g., ">2020", "2020-2023")
        client: OpenAlexClient instance (a default one is created if omitted)
        limit: Maximum number of papers to return

    Returns:
        List of papers sorted by citation count, descending
    """
    if client is None:
        client = OpenAlexClient()
    query = {
        'filter': f'publication_year:{years}',
        'sort': 'cited_by_count:desc',
        'per-page': min(limit, 200),
    }
    if topic:
        query['search'] = topic
    # One request suffices for small limits; otherwise paginate.
    if limit <= 200:
        return client._make_request('/works', query).get('results', [])
    return client.paginate_all('/works', query, max_results=limit)
def get_open_access_papers(
    search_term: str,
    client: "OpenAlexClient",
    oa_status: str = "any",  # "any", "gold", "green", "hybrid", "bronze"
    limit: int = 100
) -> List[Dict[str, Any]]:
    """
    Find open access papers on a topic.

    Args:
        search_term: Search query
        client: OpenAlexClient instance
        oa_status: Type of OA ("any" for is_oa:true, or specific status)
        limit: Maximum number of papers to return

    Returns:
        List of open access papers
    """
    # "any" maps to the boolean OA flag; otherwise filter on the exact status.
    oa_filter = "is_oa:true" if oa_status == "any" else f"open_access.oa_status:{oa_status}"
    query = {
        'search': search_term,
        'filter': oa_filter,
        'per-page': min(limit, 200),
    }
    # One request suffices for small limits; otherwise paginate.
    if limit <= 200:
        return client._make_request('/works', query).get('results', [])
    return client.paginate_all('/works', query, max_results=limit)
def get_publication_trends(
    search_term: Optional[str] = None,
    filter_params: Optional[Dict] = None,
    client: "Optional[OpenAlexClient]" = None
) -> List[Dict[str, Any]]:
    """
    Get publication counts grouped by year.

    Args:
        search_term: Optional search query
        filter_params: Optional additional filters
        client: OpenAlexClient instance (a default one is created if omitted)

    Returns:
        List of {year, count} group dictionaries
    """
    if client is None:
        client = OpenAlexClient()
    query = {'group_by': 'publication_year'}
    if search_term:
        query['search'] = search_term
    if filter_params:
        # Filters are encoded as a comma-separated 'key:value' string.
        query['filter'] = ','.join(f"{k}:{v}" for k, v in filter_params.items())
    response = client._make_request('/works', query)
    return response.get('group_by', [])
def analyze_research_output(
    entity_type: str,  # 'author' or 'institution'
    entity_name: str,
    client: "OpenAlexClient",
    years: str = ">2020"
) -> Dict[str, Any]:
    """
    Analyze research output for an author or institution.

    Resolves the entity by name, then gathers total work counts, yearly
    trends, top topics and the open-access share over the given year range.

    Args:
        entity_type: 'author' or 'institution'
        entity_name: Name to search for
        client: OpenAlexClient instance
        years: Year filter

    Returns:
        Dictionary with analysis results, or {'error': ...} if no match
    """
    # Pick the lookup endpoint and works-filter key for the entity kind.
    if entity_type == 'author':
        endpoint, filter_prefix = '/authors', 'authorships.author.id'
    else:
        endpoint, filter_prefix = '/institutions', 'authorships.institutions.id'
    # Step 1: resolve the entity by name search.
    lookup = client._make_request(
        endpoint,
        params={'search': entity_name, 'per-page': 1}
    )
    hits = lookup.get('results')
    if not hits:
        return {'error': f'No {entity_type} found for: {entity_name}'}
    entity = hits[0]
    entity_id = entity['id'].split('/')[-1]  # Bare ID is the last URL segment.
    # Step 2: gather statistics, all scoped to the entity and year range.
    base_filter = {filter_prefix: entity_id, 'publication_year': years}
    # Total works: only the count is needed, so fetch a single result.
    total_works = client.search_works(
        filter_params=base_filter,
        per_page=1
    )['meta']['count']
    # Publication counts grouped by year.
    trends = client.group_by(
        'works',
        'publication_year',
        filter_params=dict(base_filter)
    )
    # Most common topics.
    topics = client.group_by(
        'works',
        'topics.id',
        filter_params=base_filter
    )
    # Open-access share of the works.
    oa_count = client.search_works(
        filter_params={**base_filter, 'is_oa': 'true'},
        per_page=1
    )['meta']['count']
    oa_percentage = (oa_count / total_works * 100) if total_works > 0 else 0
    return {
        'entity_name': entity['display_name'],
        'entity_id': entity_id,
        'total_works': total_works,
        'open_access_works': oa_count,
        'open_access_percentage': round(oa_percentage, 1),
        'publications_by_year': trends[:10],  # First 10 year buckets
        'top_topics': topics[:10]  # Top 10 topics
    }
if __name__ == "__main__":
    # Demo script: exercises the helpers against the live OpenAlex API,
    # so it needs network access when run directly.
    import json

    client = OpenAlexClient(email="your-email@example.com")

    # Two-step author lookup followed by a works fetch.
    print("\n=== Finding works by author ===")
    works = find_author_works("Einstein", client, limit=5)
    print(f"Found {len(works)} works")

    # Full research-output analysis for an institution.
    print("\n=== Analyzing institution research output ===")
    analysis = analyze_research_output('institution', 'MIT', client)
    print(json.dumps(analysis, indent=2))