Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions

View File

@@ -0,0 +1,381 @@
# Common OpenAlex Query Examples
This document provides practical examples for common research queries using OpenAlex.
## Finding Papers by Author
**User query**: "Find papers by Albert Einstein"
**Approach**: Two-step pattern
1. Search for author to get ID
2. Filter works by author ID
**Python example**:
```python
from scripts.openalex_client import OpenAlexClient
from scripts.query_helpers import find_author_works
client = OpenAlexClient(email="your-email@example.edu")
works = find_author_works("Albert Einstein", client, limit=100)
for work in works:
print(f"{work['title']} ({work['publication_year']})")
```
## Finding Papers from an Institution
**User query**: "What papers has MIT published in the last year?"
**Approach**: Two-step pattern with date filter
1. Search for institution to get ID
2. Filter works by institution ID and year
**Python example**:
```python
from scripts.query_helpers import find_institution_works
works = find_institution_works("MIT", client, limit=200)
# Filter for recent papers
import datetime
current_year = datetime.datetime.now().year
recent_works = [w for w in works if w['publication_year'] == current_year]
```
## Highly Cited Papers on a Topic
**User query**: "Find the most cited papers on CRISPR from the last 5 years"
**Approach**: Search + filter + sort
**Python example**:
```python
works = client.search_works(
search="CRISPR",
filter_params={
"publication_year": ">2019"
},
sort="cited_by_count:desc",
per_page=100
)
for work in works['results']:
title = work['title']
citations = work['cited_by_count']
year = work['publication_year']
print(f"{title} ({year}): {citations} citations")
```
## Open Access Papers on a Topic
**User query**: "Find open access papers about climate change"
**Approach**: Search + OA filter
**Python example**:
```python
from scripts.query_helpers import get_open_access_papers
papers = get_open_access_papers(
search_term="climate change",
client=client,
oa_status="any", # or "gold", "green", "hybrid", "bronze"
limit=200
)
for paper in papers:
print(f"{paper['title']}")
print(f" OA Status: {paper['open_access']['oa_status']}")
print(f" URL: {paper['open_access']['oa_url']}")
```
## Publication Trends Analysis
**User query**: "Show me publication trends for machine learning over the years"
**Approach**: Use group_by to aggregate by year
**Python example**:
```python
from scripts.query_helpers import get_publication_trends
trends = get_publication_trends(
search_term="machine learning",
client=client
)
# Sort by year
trends_sorted = sorted(trends, key=lambda x: x['key'])
for trend in trends_sorted[-10:]: # Last 10 years
year = trend['key']
count = trend['count']
print(f"{year}: {count} publications")
```
## Analyzing Research Output
**User query**: "Analyze the research output of Stanford University from 2020-2024"
**Approach**: Multiple aggregations for comprehensive analysis
**Python example**:
```python
from scripts.query_helpers import analyze_research_output
analysis = analyze_research_output(
entity_type='institution',
entity_name='Stanford University',
client=client,
years='2020-2024'
)
print(f"Institution: {analysis['entity_name']}")
print(f"Total works: {analysis['total_works']}")
print(f"Open access: {analysis['open_access_percentage']}%")
print("\nTop topics:")
for topic in analysis['top_topics'][:5]:
print(f" - {topic['key_display_name']}: {topic['count']} works")
```
## Finding Papers by DOI (Batch)
**User query**: "Get information for these 10 DOIs: ..."
**Approach**: Batch lookup with pipe separator
**Python example**:
```python
dois = [
"https://doi.org/10.1371/journal.pone.0266781",
"https://doi.org/10.1371/journal.pone.0267149",
"https://doi.org/10.1038/s41586-021-03819-2",
# ... up to 50 DOIs
]
works = client.batch_lookup(
entity_type='works',
ids=dois,
id_field='doi'
)
for work in works:
print(f"{work['title']} - {work['publication_year']}")
```
## Random Sample of Papers
**User query**: "Give me 50 random papers from 2023"
**Approach**: Use sample parameter with seed for reproducibility
**Python example**:
```python
works = client.sample_works(
sample_size=50,
seed=42, # For reproducibility
filter_params={
"publication_year": "2023",
"is_oa": "true"
}
)
print(f"Got {len(works)} random papers from 2023")
```
## Papers from Multiple Institutions
**User query**: "Find papers with authors from both MIT and Stanford"
**Approach**: Use + operator for AND within same attribute
**Python example**:
```python
# First, get institution IDs
mit_response = client._make_request(
'/institutions',
params={'search': 'MIT', 'per-page': 1}
)
mit_id = mit_response['results'][0]['id'].split('/')[-1]
stanford_response = client._make_request(
'/institutions',
params={'search': 'Stanford', 'per-page': 1}
)
stanford_id = stanford_response['results'][0]['id'].split('/')[-1]
# Find works with authors from both institutions
works = client.search_works(
filter_params={
"authorships.institutions.id": f"{mit_id}+{stanford_id}"
},
per_page=100
)
print(f"Found {works['meta']['count']} collaborative papers")
```
## Papers in a Specific Journal
**User query**: "Get all papers from Nature published in 2023"
**Approach**: Two-step - find journal ID, then filter works
**Python example**:
```python
# Step 1: Find journal source ID
source_response = client._make_request(
'/sources',
params={'search': 'Nature', 'per-page': 1}
)
source = source_response['results'][0]
source_id = source['id'].split('/')[-1]
print(f"Found journal: {source['display_name']} (ID: {source_id})")
# Step 2: Get works from that source
works = client.search_works(
filter_params={
"primary_location.source.id": source_id,
"publication_year": "2023"
},
per_page=200
)
print(f"Found {works['meta']['count']} papers from Nature in 2023")
```
## Topic Analysis by Institution
**User query**: "What topics does MIT research most?"
**Approach**: Filter by institution, group by topics
**Python example**:
```python
# Get MIT ID
inst_response = client._make_request(
'/institutions',
params={'search': 'MIT', 'per-page': 1}
)
mit_id = inst_response['results'][0]['id'].split('/')[-1]
# Group by topics
topics = client.group_by(
entity_type='works',
group_field='topics.id',
filter_params={
"authorships.institutions.id": mit_id,
"publication_year": ">2020"
}
)
print("Top research topics at MIT (2020+):")
for i, topic in enumerate(topics[:10], 1):
print(f"{i}. {topic['key_display_name']}: {topic['count']} works")
```
## Citation Analysis
**User query**: "Find papers that cite this specific DOI"
**Approach**: Get work by DOI, then use cited_by_api_url
**Python example**:
```python
# Get the work
doi = "https://doi.org/10.1038/s41586-021-03819-2"
work = client.get_entity('works', doi)
# Get papers that cite it
cited_by_url = work['cited_by_api_url']
# Extract just the query part and use it
import requests
response = requests.get(cited_by_url, params={'mailto': client.email})
citing_works = response.json()
print(f"{work['title']}")
print(f"Total citations: {work['cited_by_count']}")
print(f"\nRecent citing papers:")
for citing_work in citing_works['results'][:5]:
print(f" - {citing_work['title']} ({citing_work['publication_year']})")
```
## Large-Scale Data Extraction
**User query**: "Get all papers on quantum computing from the last 3 years"
**Approach**: Paginate through all results
**Python example**:
```python
all_papers = client.paginate_all(
endpoint='/works',
params={
'search': 'quantum computing',
'filter': 'publication_year:2022-2024'
},
max_results=10000 # Limit to prevent excessive API calls
)
print(f"Retrieved {len(all_papers)} papers")
# Save to CSV
import csv
with open('quantum_papers.csv', 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow(['Title', 'Year', 'Citations', 'DOI', 'OA Status'])
for paper in all_papers:
writer.writerow([
paper['title'],
paper['publication_year'],
paper['cited_by_count'],
paper.get('doi', 'N/A'),
paper['open_access']['oa_status']
])
```
## Complex Multi-Filter Query
**User query**: "Find recent, highly-cited, open access papers on AI from top institutions"
**Approach**: Combine multiple filters
**Python example**:
```python
# Get IDs for top institutions
top_institutions = ['MIT', 'Stanford', 'Oxford']
inst_ids = []
for inst_name in top_institutions:
response = client._make_request(
'/institutions',
params={'search': inst_name, 'per-page': 1}
)
if response['results']:
inst_id = response['results'][0]['id'].split('/')[-1]
inst_ids.append(inst_id)
# Combine with pipe for OR
inst_filter = '|'.join(inst_ids)
# Complex query
works = client.search_works(
search="artificial intelligence",
filter_params={
"publication_year": ">2022",
"cited_by_count": ">50",
"is_oa": "true",
"authorships.institutions.id": inst_filter
},
sort="cited_by_count:desc",
per_page=200
)
print(f"Found {works['meta']['count']} papers matching criteria")
for work in works['results'][:10]:
print(f"{work['title']}")
print(f" Citations: {work['cited_by_count']}, Year: {work['publication_year']}")
```