Initial commit

skills/openalex-database/scripts/openalex_client.py (new file, 337 lines)
#!/usr/bin/env python3
"""
OpenAlex API Client with rate limiting and error handling.

Provides a robust client for interacting with the OpenAlex API, with:
- Automatic rate limiting (polite pool: 10 req/sec)
- Exponential backoff retry logic
- Pagination support
- Batch operation support
"""

import time
import requests
from typing import Dict, List, Optional, Any
from urllib.parse import urljoin


class OpenAlexClient:
    """Client for the OpenAlex API with rate limiting and error handling."""

    BASE_URL = "https://api.openalex.org"

    def __init__(self, email: Optional[str] = None, requests_per_second: int = 10):
        """
        Initialize the OpenAlex client.

        Args:
            email: Email for polite pool (10x rate limit boost)
            requests_per_second: Max requests per second (default: 10 for polite pool)
        """
        self.email = email
        self.requests_per_second = requests_per_second
        self.min_delay = 1.0 / requests_per_second
        self.last_request_time = 0.0

    def _rate_limit(self):
        """Sleep as needed so requests don't exceed the rate limit."""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time
        if time_since_last < self.min_delay:
            time.sleep(self.min_delay - time_since_last)
        self.last_request_time = time.time()

    def _make_request(
        self,
        endpoint: str,
        params: Optional[Dict] = None,
        max_retries: int = 5
    ) -> Dict[str, Any]:
        """
        Make an API request with retry logic.

        Args:
            endpoint: API endpoint (e.g., '/works', '/authors')
            params: Query parameters
            max_retries: Maximum number of retry attempts

        Returns:
            JSON response as a dictionary
        """
        if params is None:
            params = {}

        # Add email to params for the polite pool
        if self.email:
            params['mailto'] = self.email

        url = urljoin(self.BASE_URL, endpoint)

        for attempt in range(max_retries):
            try:
                self._rate_limit()
                response = requests.get(url, params=params, timeout=30)

                if response.status_code == 200:
                    return response.json()
                elif response.status_code in (403, 429):
                    # Rate limited (OpenAlex signals this with 429; 403 kept as a
                    # conservative extra)
                    wait_time = 2 ** attempt
                    print(f"Rate limited. Waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
                elif response.status_code >= 500:
                    # Server error
                    wait_time = 2 ** attempt
                    print(f"Server error. Waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
                else:
                    # Other error - don't retry
                    response.raise_for_status()

            except requests.exceptions.Timeout:
                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt
                    print(f"Request timeout. Waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
                else:
                    raise

        raise Exception(f"Failed after {max_retries} retries")

    def search_works(
        self,
        search: Optional[str] = None,
        filter_params: Optional[Dict] = None,
        per_page: int = 200,
        page: int = 1,
        sort: Optional[str] = None,
        select: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """
        Search works with filters.

        Args:
            search: Full-text search query
            filter_params: Dictionary of filter parameters
            per_page: Results per page (max: 200)
            page: Page number
            sort: Sort parameter (e.g., 'cited_by_count:desc')
            select: List of fields to return

        Returns:
            API response with meta and results
        """
        params = {
            'per-page': min(per_page, 200),
            'page': page
        }

        if search:
            params['search'] = search

        if filter_params:
            filter_str = ','.join([f"{k}:{v}" for k, v in filter_params.items()])
            params['filter'] = filter_str

        if sort:
            params['sort'] = sort

        if select:
            params['select'] = ','.join(select)

        return self._make_request('/works', params)
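
    # Hedged usage sketch (query and field names illustrative, not from this repo):
    # client.search_works(
    #     search="climate adaptation",
    #     filter_params={"publication_year": ">2020", "is_oa": "true"},
    #     sort="cited_by_count:desc",
    #     select=["id", "display_name", "cited_by_count"],
    # )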

    def get_entity(self, entity_type: str, entity_id: str) -> Dict[str, Any]:
        """
        Get a single entity by ID.

        Args:
            entity_type: Type of entity ('works', 'authors', 'institutions', etc.)
            entity_id: OpenAlex ID or external ID (DOI, ORCID, etc.)

        Returns:
            Entity object
        """
        endpoint = f"/{entity_type}/{entity_id}"
        return self._make_request(endpoint)
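
    # Hedged examples (IDs illustrative): OpenAlex accepts its own IDs as well
    # as external IDs such as full DOI URLs.
    # client.get_entity('works', 'W2741809807')
    # client.get_entity('works', 'https://doi.org/10.7717/peerj.4375')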

    def batch_lookup(
        self,
        entity_type: str,
        ids: List[str],
        id_field: str = 'openalex_id'
    ) -> List[Dict[str, Any]]:
        """
        Look up multiple entities by ID efficiently.

        Args:
            entity_type: Type of entity ('works', 'authors', etc.)
            ids: List of IDs (requested in batches of 50)
            id_field: ID filter field ('openalex_id', 'doi', 'orcid', etc.)

        Returns:
            List of entity objects
        """
        all_results = []

        # Process in batches of 50, OR-ing the IDs together with '|' in one filter
        for i in range(0, len(ids), 50):
            batch = ids[i:i + 50]
            filter_value = '|'.join(batch)

            params = {
                'filter': f"{id_field}:{filter_value}",
                'per-page': 50
            }

            response = self._make_request(f"/{entity_type}", params)
            all_results.extend(response.get('results', []))

        return all_results
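
    # Hedged sketch (DOIs illustrative): one request resolves a whole batch via
    # an OR filter such as "doi:10.7717/peerj.4375|10.1038/nature12373".
    # works = client.batch_lookup(
    #     'works', ['10.7717/peerj.4375', '10.1038/nature12373'], id_field='doi')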

    def paginate_all(
        self,
        endpoint: str,
        params: Optional[Dict] = None,
        max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Paginate through all results.

        Note: this uses basic paging, which OpenAlex caps at 10,000 results;
        larger result sets require cursor paging.

        Args:
            endpoint: API endpoint
            params: Query parameters
            max_results: Maximum number of results to retrieve (None for all)

        Returns:
            List of all results
        """
        if params is None:
            params = {}

        params['per-page'] = 200  # Use maximum page size
        params['page'] = 1

        all_results = []

        while True:
            response = self._make_request(endpoint, params)
            results = response.get('results', [])
            all_results.extend(results)

            # Check if we've hit max_results
            if max_results and len(all_results) >= max_results:
                return all_results[:max_results]

            # Stop on an empty page; guards against looping forever when the
            # API returns fewer results than meta.count reports
            if not results:
                break

            # Check if there are more pages
            meta = response.get('meta', {})
            total_count = meta.get('count', 0)
            current_count = len(all_results)

            if current_count >= total_count:
                break

            params['page'] += 1

        return all_results
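
    # Hedged sketch (source ID illustrative): page through up to 600 works
    # from a single venue.
    # works = client.paginate_all(
    #     '/works',
    #     {'filter': 'primary_location.source.id:S137773608'},
    #     max_results=600,
    # )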

    def sample_works(
        self,
        sample_size: int,
        seed: Optional[int] = None,
        filter_params: Optional[Dict] = None
    ) -> List[Dict[str, Any]]:
        """
        Get a random sample of works.

        Args:
            sample_size: Number of samples to retrieve
            seed: Random seed for reproducibility
            filter_params: Optional filters to apply

        Returns:
            List of sampled works
        """
        params = {
            'sample': min(sample_size, 10000),  # API limit per request
            'per-page': 200
        }

        if seed is not None:
            params['seed'] = seed

        if filter_params:
            filter_str = ','.join([f"{k}:{v}" for k, v in filter_params.items()])
            params['filter'] = filter_str

        # Samples above the 10,000-per-seed API limit need multiple requests
        # with different seeds, deduplicated by work ID
        if sample_size > 10000:
            all_samples = []
            seen_ids = set()

            for i in range((sample_size // 10000) + 1):
                current_seed = seed + i if seed is not None else i
                params['seed'] = current_seed
                params['sample'] = min(10000, sample_size - len(all_samples))

                # Each page holds at most 200 results, so page through the sample
                results = self.paginate_all('/works', dict(params),
                                            max_results=params['sample'])

                # Deduplicate across seeds
                for result in results:
                    work_id = result.get('id')
                    if work_id not in seen_ids:
                        seen_ids.add(work_id)
                        all_samples.append(result)

                if len(all_samples) >= sample_size:
                    break

            return all_samples[:sample_size]
        else:
            # Page through the sample; a single request returns at most 200 works
            return self.paginate_all('/works', params, max_results=sample_size)

    def group_by(
        self,
        entity_type: str,
        group_field: str,
        filter_params: Optional[Dict] = None
    ) -> List[Dict[str, Any]]:
        """
        Aggregate results by field.

        Args:
            entity_type: Type of entity ('works', 'authors', etc.)
            group_field: Field to group by
            filter_params: Optional filters

        Returns:
            List of grouped results with counts
        """
        params = {
            'group_by': group_field
        }

        if filter_params:
            filter_str = ','.join([f"{k}:{v}" for k, v in filter_params.items()])
            params['filter'] = filter_str

        response = self._make_request(f"/{entity_type}", params)
        return response.get('group_by', [])
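
    # Hedged sketch: count 2023 works by open-access status; each returned
    # group is a dict with 'key', 'key_display_name', and 'count'.
    # groups = client.group_by('works', 'open_access.oa_status',
    #                          filter_params={'publication_year': '2023'})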


if __name__ == "__main__":
    # Example usage
    client = OpenAlexClient(email="your-email@example.com")

    # Search for works about machine learning
    results = client.search_works(
        search="machine learning",
        filter_params={"publication_year": "2023"},
        per_page=10
    )

    print(f"Found {results['meta']['count']} works")
    for work in results['results']:
        print(f"- {work['title']}")

skills/openalex-database/scripts/query_helpers.py (new file, 306 lines)
#!/usr/bin/env python3
"""
Helper functions for common OpenAlex query patterns.

Provides high-level functions for typical research queries.
"""

from typing import List, Dict, Optional, Any
from openalex_client import OpenAlexClient


def find_author_works(
    author_name: str,
    client: OpenAlexClient,
    limit: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Find all works by an author (two-step pattern: resolve the author ID,
    then filter works by it).

    Args:
        author_name: Author name to search for
        client: OpenAlexClient instance
        limit: Maximum number of works to return

    Returns:
        List of works by the author
    """
    # Step 1: Find the author ID
    author_response = client._make_request(
        '/authors',
        params={'search': author_name, 'per-page': 1}
    )

    if not author_response.get('results'):
        print(f"No author found for: {author_name}")
        return []

    author = author_response['results'][0]
    author_id = author['id'].split('/')[-1]  # Extract ID from URL

    print(f"Found author: {author['display_name']} (ID: {author_id})")

    # Step 2: Get works by the author
    works_params = {
        'filter': f'authorships.author.id:{author_id}',
        'per-page': 200
    }

    if limit and limit <= 200:
        works_params['per-page'] = limit
        response = client._make_request('/works', works_params)
        return response.get('results', [])
    else:
        # Need pagination
        return client.paginate_all('/works', works_params, max_results=limit)


def find_institution_works(
    institution_name: str,
    client: OpenAlexClient,
    limit: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Find all works from an institution (two-step pattern).

    Args:
        institution_name: Institution name to search for
        client: OpenAlexClient instance
        limit: Maximum number of works to return

    Returns:
        List of works from the institution
    """
    # Step 1: Find the institution ID
    inst_response = client._make_request(
        '/institutions',
        params={'search': institution_name, 'per-page': 1}
    )

    if not inst_response.get('results'):
        print(f"No institution found for: {institution_name}")
        return []

    institution = inst_response['results'][0]
    inst_id = institution['id'].split('/')[-1]  # Extract ID from URL

    print(f"Found institution: {institution['display_name']} (ID: {inst_id})")

    # Step 2: Get works from the institution
    works_params = {
        'filter': f'authorships.institutions.id:{inst_id}',
        'per-page': 200
    }

    if limit and limit <= 200:
        works_params['per-page'] = limit
        response = client._make_request('/works', works_params)
        return response.get('results', [])
    else:
        return client.paginate_all('/works', works_params, max_results=limit)


def find_highly_cited_recent_papers(
    topic: Optional[str] = None,
    years: str = ">2020",
    client: Optional[OpenAlexClient] = None,
    limit: int = 100
) -> List[Dict[str, Any]]:
    """
    Find highly cited recent papers, optionally filtered by topic.

    Args:
        topic: Optional search term for topic filtering
        years: Year filter (e.g., ">2020", "2020-2023")
        client: OpenAlexClient instance
        limit: Maximum number of papers to return

    Returns:
        List of highly cited papers sorted by citation count
    """
    if client is None:
        client = OpenAlexClient()

    params = {
        'filter': f'publication_year:{years}',
        'sort': 'cited_by_count:desc',
        'per-page': min(limit, 200)
    }

    if topic:
        params['search'] = topic

    if limit <= 200:
        response = client._make_request('/works', params)
        return response.get('results', [])
    else:
        return client.paginate_all('/works', params, max_results=limit)
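
# Hedged sketch (topic illustrative):
# papers = find_highly_cited_recent_papers(topic="graphene", years="2021-2023",
#                                          client=client, limit=25)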


def get_open_access_papers(
    search_term: str,
    client: OpenAlexClient,
    oa_status: str = "any",  # "any", "gold", "green", "hybrid", "bronze"
    limit: int = 100
) -> List[Dict[str, Any]]:
    """
    Find open access papers on a topic.

    Args:
        search_term: Search query
        client: OpenAlexClient instance
        oa_status: Type of OA ("any" for is_oa:true, or a specific status)
        limit: Maximum number of papers to return

    Returns:
        List of open access papers
    """
    if oa_status == "any":
        filter_str = "is_oa:true"
    else:
        filter_str = f"open_access.oa_status:{oa_status}"

    params = {
        'search': search_term,
        'filter': filter_str,
        'per-page': min(limit, 200)
    }

    if limit <= 200:
        response = client._make_request('/works', params)
        return response.get('results', [])
    else:
        return client.paginate_all('/works', params, max_results=limit)
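
# Hedged sketch (search term illustrative): gold OA papers on a topic.
# papers = get_open_access_papers("CRISPR", client, oa_status="gold", limit=50)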


def get_publication_trends(
    search_term: Optional[str] = None,
    filter_params: Optional[Dict] = None,
    client: Optional[OpenAlexClient] = None
) -> List[Dict[str, Any]]:
    """
    Get publication counts by year.

    Args:
        search_term: Optional search query
        filter_params: Optional additional filters
        client: OpenAlexClient instance

    Returns:
        List of group dictionaries ({key, key_display_name, count}), one per year
    """
    if client is None:
        client = OpenAlexClient()

    params = {'group_by': 'publication_year'}

    if search_term:
        params['search'] = search_term

    if filter_params:
        filter_str = ','.join([f"{k}:{v}" for k, v in filter_params.items()])
        params['filter'] = filter_str

    response = client._make_request('/works', params)
    return response.get('group_by', [])
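
# Hedged sketch: per-year counts for a topic, re-sorted chronologically
# (group_by returns groups ordered by count, not by year).
# trends = get_publication_trends("quantum computing", client=client)
# trends.sort(key=lambda g: g['key'])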


def analyze_research_output(
    entity_type: str,  # 'author' or 'institution'
    entity_name: str,
    client: OpenAlexClient,
    years: str = ">2020"
) -> Dict[str, Any]:
    """
    Analyze research output for an author or institution.

    Args:
        entity_type: 'author' or 'institution'
        entity_name: Name to search for
        client: OpenAlexClient instance
        years: Year filter

    Returns:
        Dictionary with analysis results
    """
    # Pick the endpoint and works filter for the entity type
    if entity_type == 'author':
        endpoint = '/authors'
        filter_prefix = 'authorships.author.id'
    else:
        endpoint = '/institutions'
        filter_prefix = 'authorships.institutions.id'

    # Step 1: Find the entity
    entity_response = client._make_request(
        endpoint,
        params={'search': entity_name, 'per-page': 1}
    )

    if not entity_response.get('results'):
        return {'error': f'No {entity_type} found for: {entity_name}'}

    entity = entity_response['results'][0]
    entity_id = entity['id'].split('/')[-1]

    # Step 2: Get statistics
    filter_params = {
        filter_prefix: entity_id,
        'publication_year': years
    }

    # Total works
    works_response = client.search_works(
        filter_params=filter_params,
        per_page=1
    )
    total_works = works_response['meta']['count']

    # Works by year
    trends = client.group_by(
        'works',
        'publication_year',
        filter_params={filter_prefix: entity_id, 'publication_year': years}
    )

    # Top topics
    topics = client.group_by(
        'works',
        'topics.id',
        filter_params=filter_params
    )

    # OA percentage
    oa_works = client.search_works(
        filter_params={**filter_params, 'is_oa': 'true'},
        per_page=1
    )
    oa_count = oa_works['meta']['count']
    oa_percentage = (oa_count / total_works * 100) if total_works > 0 else 0

    return {
        'entity_name': entity['display_name'],
        'entity_id': entity_id,
        'total_works': total_works,
        'open_access_works': oa_count,
        'open_access_percentage': round(oa_percentage, 1),
        'publications_by_year': trends[:10],  # 10 most productive years (group_by sorts by count)
        'top_topics': topics[:10]  # Top 10 topics
    }


if __name__ == "__main__":
    # Example usage
    import json

    client = OpenAlexClient(email="your-email@example.com")

    # Find works by author
    print("\n=== Finding works by author ===")
    works = find_author_works("Einstein", client, limit=5)
    print(f"Found {len(works)} works")

    # Analyze research output
    print("\n=== Analyzing institution research output ===")
    analysis = analyze_research_output('institution', 'MIT', client)
    print(json.dumps(analysis, indent=2))
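
    # Hedged additional example (topic illustrative): publication trend,
    # printed oldest-to-newest for the five most recent year groups.
    print("\n=== Publication trend ===")
    trends = get_publication_trends("machine learning", client=client)
    for group in sorted(trends, key=lambda g: str(g['key']))[-5:]:
        print(f"{group['key']}: {group['count']} works")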