Initial commit
This commit is contained in:
204
skills/citation-management/scripts/doi_to_bibtex.py
Normal file
204
skills/citation-management/scripts/doi_to_bibtex.py
Normal file
@@ -0,0 +1,204 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DOI to BibTeX Converter
|
||||
Quick utility to convert DOIs to BibTeX format using CrossRef API.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import requests
|
||||
import argparse
|
||||
import time
|
||||
import json
|
||||
from typing import Optional, List
|
||||
|
||||
class DOIConverter:
|
||||
"""Convert DOIs to BibTeX entries using CrossRef API."""
|
||||
|
||||
def __init__(self):
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update({
|
||||
'User-Agent': 'DOIConverter/1.0 (Citation Management Tool; mailto:support@example.com)'
|
||||
})
|
||||
|
||||
def doi_to_bibtex(self, doi: str) -> Optional[str]:
|
||||
"""
|
||||
Convert a single DOI to BibTeX format.
|
||||
|
||||
Args:
|
||||
doi: Digital Object Identifier
|
||||
|
||||
Returns:
|
||||
BibTeX string or None if conversion fails
|
||||
"""
|
||||
# Clean DOI (remove URL prefix if present)
|
||||
doi = doi.strip()
|
||||
if doi.startswith('https://doi.org/'):
|
||||
doi = doi.replace('https://doi.org/', '')
|
||||
elif doi.startswith('http://doi.org/'):
|
||||
doi = doi.replace('http://doi.org/', '')
|
||||
elif doi.startswith('doi:'):
|
||||
doi = doi.replace('doi:', '')
|
||||
|
||||
# Request BibTeX from CrossRef content negotiation
|
||||
url = f'https://doi.org/{doi}'
|
||||
headers = {
|
||||
'Accept': 'application/x-bibtex',
|
||||
'User-Agent': 'DOIConverter/1.0 (Citation Management Tool)'
|
||||
}
|
||||
|
||||
try:
|
||||
response = self.session.get(url, headers=headers, timeout=15)
|
||||
|
||||
if response.status_code == 200:
|
||||
bibtex = response.text.strip()
|
||||
# CrossRef sometimes returns entries with @data type, convert to @misc
|
||||
if bibtex.startswith('@data{'):
|
||||
bibtex = bibtex.replace('@data{', '@misc{', 1)
|
||||
return bibtex
|
||||
elif response.status_code == 404:
|
||||
print(f'Error: DOI not found: {doi}', file=sys.stderr)
|
||||
return None
|
||||
else:
|
||||
print(f'Error: Failed to retrieve BibTeX for {doi} (status {response.status_code})', file=sys.stderr)
|
||||
return None
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
print(f'Error: Request timeout for DOI: {doi}', file=sys.stderr)
|
||||
return None
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f'Error: Request failed for {doi}: {e}', file=sys.stderr)
|
||||
return None
|
||||
|
||||
def convert_multiple(self, dois: List[str], delay: float = 0.5) -> List[str]:
|
||||
"""
|
||||
Convert multiple DOIs to BibTeX.
|
||||
|
||||
Args:
|
||||
dois: List of DOIs
|
||||
delay: Delay between requests (seconds) for rate limiting
|
||||
|
||||
Returns:
|
||||
List of BibTeX entries (excludes failed conversions)
|
||||
"""
|
||||
bibtex_entries = []
|
||||
|
||||
for i, doi in enumerate(dois):
|
||||
print(f'Converting DOI {i+1}/{len(dois)}: {doi}', file=sys.stderr)
|
||||
bibtex = self.doi_to_bibtex(doi)
|
||||
|
||||
if bibtex:
|
||||
bibtex_entries.append(bibtex)
|
||||
|
||||
# Rate limiting
|
||||
if i < len(dois) - 1: # Don't delay after last request
|
||||
time.sleep(delay)
|
||||
|
||||
return bibtex_entries
|
||||
|
||||
|
||||
def main():
|
||||
"""Command-line interface."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Convert DOIs to BibTeX format using CrossRef API',
|
||||
epilog='Example: python doi_to_bibtex.py 10.1038/s41586-021-03819-2'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'dois',
|
||||
nargs='*',
|
||||
help='DOI(s) to convert (can provide multiple)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-i', '--input',
|
||||
help='Input file with DOIs (one per line)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-o', '--output',
|
||||
help='Output file for BibTeX (default: stdout)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--delay',
|
||||
type=float,
|
||||
default=0.5,
|
||||
help='Delay between requests in seconds (default: 0.5)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--format',
|
||||
choices=['bibtex', 'json'],
|
||||
default='bibtex',
|
||||
help='Output format (default: bibtex)'
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Collect DOIs from command line and/or file
|
||||
dois = []
|
||||
|
||||
if args.dois:
|
||||
dois.extend(args.dois)
|
||||
|
||||
if args.input:
|
||||
try:
|
||||
with open(args.input, 'r', encoding='utf-8') as f:
|
||||
file_dois = [line.strip() for line in f if line.strip()]
|
||||
dois.extend(file_dois)
|
||||
except FileNotFoundError:
|
||||
print(f'Error: Input file not found: {args.input}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f'Error reading input file: {e}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
if not dois:
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
# Convert DOIs
|
||||
converter = DOIConverter()
|
||||
|
||||
if len(dois) == 1:
|
||||
bibtex = converter.doi_to_bibtex(dois[0])
|
||||
if bibtex:
|
||||
bibtex_entries = [bibtex]
|
||||
else:
|
||||
sys.exit(1)
|
||||
else:
|
||||
bibtex_entries = converter.convert_multiple(dois, delay=args.delay)
|
||||
|
||||
if not bibtex_entries:
|
||||
print('Error: No successful conversions', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Format output
|
||||
if args.format == 'bibtex':
|
||||
output = '\n\n'.join(bibtex_entries) + '\n'
|
||||
else: # json
|
||||
output = json.dumps({
|
||||
'count': len(bibtex_entries),
|
||||
'entries': bibtex_entries
|
||||
}, indent=2)
|
||||
|
||||
# Write output
|
||||
if args.output:
|
||||
try:
|
||||
with open(args.output, 'w', encoding='utf-8') as f:
|
||||
f.write(output)
|
||||
print(f'Successfully wrote {len(bibtex_entries)} entries to {args.output}', file=sys.stderr)
|
||||
except Exception as e:
|
||||
print(f'Error writing output file: {e}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
else:
|
||||
print(output)
|
||||
|
||||
# Summary
|
||||
if len(dois) > 1:
|
||||
success_rate = len(bibtex_entries) / len(dois) * 100
|
||||
print(f'\nConverted {len(bibtex_entries)}/{len(dois)} DOIs ({success_rate:.1f}%)', file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
569
skills/citation-management/scripts/extract_metadata.py
Executable file
569
skills/citation-management/scripts/extract_metadata.py
Executable file
@@ -0,0 +1,569 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Metadata Extraction Tool
|
||||
Extract citation metadata from DOI, PMID, arXiv ID, or URL using various APIs.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import requests
|
||||
import argparse
|
||||
import time
|
||||
import re
|
||||
import json
|
||||
import xml.etree.ElementTree as ET
|
||||
from typing import Optional, Dict, List, Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
class MetadataExtractor:
|
||||
"""Extract metadata from various sources and generate BibTeX."""
|
||||
|
||||
def __init__(self, email: Optional[str] = None):
|
||||
"""
|
||||
Initialize extractor.
|
||||
|
||||
Args:
|
||||
email: Email for Entrez API (recommended for PubMed)
|
||||
"""
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update({
|
||||
'User-Agent': 'MetadataExtractor/1.0 (Citation Management Tool)'
|
||||
})
|
||||
self.email = email or os.getenv('NCBI_EMAIL', '')
|
||||
|
||||
def identify_type(self, identifier: str) -> Tuple[str, str]:
|
||||
"""
|
||||
Identify the type of identifier.
|
||||
|
||||
Args:
|
||||
identifier: DOI, PMID, arXiv ID, or URL
|
||||
|
||||
Returns:
|
||||
Tuple of (type, cleaned_identifier)
|
||||
"""
|
||||
identifier = identifier.strip()
|
||||
|
||||
# Check if URL
|
||||
if identifier.startswith('http://') or identifier.startswith('https://'):
|
||||
return self._parse_url(identifier)
|
||||
|
||||
# Check for DOI
|
||||
if identifier.startswith('10.'):
|
||||
return ('doi', identifier)
|
||||
|
||||
# Check for arXiv ID
|
||||
if re.match(r'^\d{4}\.\d{4,5}(v\d+)?$', identifier):
|
||||
return ('arxiv', identifier)
|
||||
if identifier.startswith('arXiv:'):
|
||||
return ('arxiv', identifier.replace('arXiv:', ''))
|
||||
|
||||
# Check for PMID (8-digit number typically)
|
||||
if identifier.isdigit() and len(identifier) >= 7:
|
||||
return ('pmid', identifier)
|
||||
|
||||
# Check for PMCID
|
||||
if identifier.upper().startswith('PMC') and identifier[3:].isdigit():
|
||||
return ('pmcid', identifier.upper())
|
||||
|
||||
return ('unknown', identifier)
|
||||
|
||||
def _parse_url(self, url: str) -> Tuple[str, str]:
|
||||
"""Parse URL to extract identifier type and value."""
|
||||
parsed = urlparse(url)
|
||||
|
||||
# DOI URLs
|
||||
if 'doi.org' in parsed.netloc:
|
||||
doi = parsed.path.lstrip('/')
|
||||
return ('doi', doi)
|
||||
|
||||
# PubMed URLs
|
||||
if 'pubmed.ncbi.nlm.nih.gov' in parsed.netloc or 'ncbi.nlm.nih.gov/pubmed' in url:
|
||||
pmid = re.search(r'/(\d+)', parsed.path)
|
||||
if pmid:
|
||||
return ('pmid', pmid.group(1))
|
||||
|
||||
# arXiv URLs
|
||||
if 'arxiv.org' in parsed.netloc:
|
||||
arxiv_id = re.search(r'/abs/(\d{4}\.\d{4,5})', parsed.path)
|
||||
if arxiv_id:
|
||||
return ('arxiv', arxiv_id.group(1))
|
||||
|
||||
# Nature, Science, Cell, etc. - try to extract DOI from URL
|
||||
doi_match = re.search(r'10\.\d{4,}/[^\s/]+', url)
|
||||
if doi_match:
|
||||
return ('doi', doi_match.group())
|
||||
|
||||
return ('url', url)
|
||||
|
||||
def extract_from_doi(self, doi: str) -> Optional[Dict]:
|
||||
"""
|
||||
Extract metadata from DOI using CrossRef API.
|
||||
|
||||
Args:
|
||||
doi: Digital Object Identifier
|
||||
|
||||
Returns:
|
||||
Metadata dictionary or None
|
||||
"""
|
||||
url = f'https://api.crossref.org/works/{doi}'
|
||||
|
||||
try:
|
||||
response = self.session.get(url, timeout=15)
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
message = data.get('message', {})
|
||||
|
||||
metadata = {
|
||||
'type': 'doi',
|
||||
'entry_type': self._crossref_type_to_bibtex(message.get('type')),
|
||||
'doi': doi,
|
||||
'title': message.get('title', [''])[0],
|
||||
'authors': self._format_authors_crossref(message.get('author', [])),
|
||||
'year': self._extract_year_crossref(message),
|
||||
'journal': message.get('container-title', [''])[0] if message.get('container-title') else '',
|
||||
'volume': str(message.get('volume', '')) if message.get('volume') else '',
|
||||
'issue': str(message.get('issue', '')) if message.get('issue') else '',
|
||||
'pages': message.get('page', ''),
|
||||
'publisher': message.get('publisher', ''),
|
||||
'url': f'https://doi.org/{doi}'
|
||||
}
|
||||
|
||||
return metadata
|
||||
else:
|
||||
print(f'Error: CrossRef API returned status {response.status_code} for DOI: {doi}', file=sys.stderr)
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
print(f'Error extracting metadata from DOI {doi}: {e}', file=sys.stderr)
|
||||
return None
|
||||
|
||||
def extract_from_pmid(self, pmid: str) -> Optional[Dict]:
|
||||
"""
|
||||
Extract metadata from PMID using PubMed E-utilities.
|
||||
|
||||
Args:
|
||||
pmid: PubMed ID
|
||||
|
||||
Returns:
|
||||
Metadata dictionary or None
|
||||
"""
|
||||
url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
|
||||
params = {
|
||||
'db': 'pubmed',
|
||||
'id': pmid,
|
||||
'retmode': 'xml',
|
||||
'rettype': 'abstract'
|
||||
}
|
||||
|
||||
if self.email:
|
||||
params['email'] = self.email
|
||||
|
||||
api_key = os.getenv('NCBI_API_KEY')
|
||||
if api_key:
|
||||
params['api_key'] = api_key
|
||||
|
||||
try:
|
||||
response = self.session.get(url, params=params, timeout=15)
|
||||
|
||||
if response.status_code == 200:
|
||||
root = ET.fromstring(response.content)
|
||||
article = root.find('.//PubmedArticle')
|
||||
|
||||
if article is None:
|
||||
print(f'Error: No article found for PMID: {pmid}', file=sys.stderr)
|
||||
return None
|
||||
|
||||
# Extract metadata from XML
|
||||
medline_citation = article.find('.//MedlineCitation')
|
||||
article_elem = medline_citation.find('.//Article')
|
||||
journal = article_elem.find('.//Journal')
|
||||
|
||||
# Get DOI if available
|
||||
doi = None
|
||||
article_ids = article.findall('.//ArticleId')
|
||||
for article_id in article_ids:
|
||||
if article_id.get('IdType') == 'doi':
|
||||
doi = article_id.text
|
||||
break
|
||||
|
||||
metadata = {
|
||||
'type': 'pmid',
|
||||
'entry_type': 'article',
|
||||
'pmid': pmid,
|
||||
'title': article_elem.findtext('.//ArticleTitle', ''),
|
||||
'authors': self._format_authors_pubmed(article_elem.findall('.//Author')),
|
||||
'year': self._extract_year_pubmed(article_elem),
|
||||
'journal': journal.findtext('.//Title', ''),
|
||||
'volume': journal.findtext('.//JournalIssue/Volume', ''),
|
||||
'issue': journal.findtext('.//JournalIssue/Issue', ''),
|
||||
'pages': article_elem.findtext('.//Pagination/MedlinePgn', ''),
|
||||
'doi': doi
|
||||
}
|
||||
|
||||
return metadata
|
||||
else:
|
||||
print(f'Error: PubMed API returned status {response.status_code} for PMID: {pmid}', file=sys.stderr)
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
print(f'Error extracting metadata from PMID {pmid}: {e}', file=sys.stderr)
|
||||
return None
|
||||
|
||||
def extract_from_arxiv(self, arxiv_id: str) -> Optional[Dict]:
|
||||
"""
|
||||
Extract metadata from arXiv ID using arXiv API.
|
||||
|
||||
Args:
|
||||
arxiv_id: arXiv identifier
|
||||
|
||||
Returns:
|
||||
Metadata dictionary or None
|
||||
"""
|
||||
url = 'http://export.arxiv.org/api/query'
|
||||
params = {
|
||||
'id_list': arxiv_id,
|
||||
'max_results': 1
|
||||
}
|
||||
|
||||
try:
|
||||
response = self.session.get(url, params=params, timeout=15)
|
||||
|
||||
if response.status_code == 200:
|
||||
# Parse Atom XML
|
||||
root = ET.fromstring(response.content)
|
||||
ns = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
|
||||
|
||||
entry = root.find('atom:entry', ns)
|
||||
if entry is None:
|
||||
print(f'Error: No entry found for arXiv ID: {arxiv_id}', file=sys.stderr)
|
||||
return None
|
||||
|
||||
# Extract DOI if published
|
||||
doi_elem = entry.find('arxiv:doi', ns)
|
||||
doi = doi_elem.text if doi_elem is not None else None
|
||||
|
||||
# Extract journal reference if published
|
||||
journal_ref_elem = entry.find('arxiv:journal_ref', ns)
|
||||
journal_ref = journal_ref_elem.text if journal_ref_elem is not None else None
|
||||
|
||||
# Get publication date
|
||||
published = entry.findtext('atom:published', '', ns)
|
||||
year = published[:4] if published else ''
|
||||
|
||||
# Get authors
|
||||
authors = []
|
||||
for author in entry.findall('atom:author', ns):
|
||||
name = author.findtext('atom:name', '', ns)
|
||||
if name:
|
||||
authors.append(name)
|
||||
|
||||
metadata = {
|
||||
'type': 'arxiv',
|
||||
'entry_type': 'misc' if not doi else 'article',
|
||||
'arxiv_id': arxiv_id,
|
||||
'title': entry.findtext('atom:title', '', ns).strip().replace('\n', ' '),
|
||||
'authors': ' and '.join(authors),
|
||||
'year': year,
|
||||
'doi': doi,
|
||||
'journal_ref': journal_ref,
|
||||
'abstract': entry.findtext('atom:summary', '', ns).strip().replace('\n', ' '),
|
||||
'url': f'https://arxiv.org/abs/{arxiv_id}'
|
||||
}
|
||||
|
||||
return metadata
|
||||
else:
|
||||
print(f'Error: arXiv API returned status {response.status_code} for ID: {arxiv_id}', file=sys.stderr)
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
print(f'Error extracting metadata from arXiv {arxiv_id}: {e}', file=sys.stderr)
|
||||
return None
|
||||
|
||||
def metadata_to_bibtex(self, metadata: Dict, citation_key: Optional[str] = None) -> str:
|
||||
"""
|
||||
Convert metadata dictionary to BibTeX format.
|
||||
|
||||
Args:
|
||||
metadata: Metadata dictionary
|
||||
citation_key: Optional custom citation key
|
||||
|
||||
Returns:
|
||||
BibTeX string
|
||||
"""
|
||||
if not citation_key:
|
||||
citation_key = self._generate_citation_key(metadata)
|
||||
|
||||
entry_type = metadata.get('entry_type', 'misc')
|
||||
|
||||
# Build BibTeX entry
|
||||
lines = [f'@{entry_type}{{{citation_key},']
|
||||
|
||||
# Add fields
|
||||
if metadata.get('authors'):
|
||||
lines.append(f' author = {{{metadata["authors"]}}},')
|
||||
|
||||
if metadata.get('title'):
|
||||
# Protect capitalization
|
||||
title = self._protect_title(metadata['title'])
|
||||
lines.append(f' title = {{{title}}},')
|
||||
|
||||
if entry_type == 'article' and metadata.get('journal'):
|
||||
lines.append(f' journal = {{{metadata["journal"]}}},')
|
||||
elif entry_type == 'misc' and metadata.get('type') == 'arxiv':
|
||||
lines.append(f' howpublished = {{arXiv}},')
|
||||
|
||||
if metadata.get('year'):
|
||||
lines.append(f' year = {{{metadata["year"]}}},')
|
||||
|
||||
if metadata.get('volume'):
|
||||
lines.append(f' volume = {{{metadata["volume"]}}},')
|
||||
|
||||
if metadata.get('issue'):
|
||||
lines.append(f' number = {{{metadata["issue"]}}},')
|
||||
|
||||
if metadata.get('pages'):
|
||||
pages = metadata['pages'].replace('-', '--') # En-dash
|
||||
lines.append(f' pages = {{{pages}}},')
|
||||
|
||||
if metadata.get('doi'):
|
||||
lines.append(f' doi = {{{metadata["doi"]}}},')
|
||||
elif metadata.get('url'):
|
||||
lines.append(f' url = {{{metadata["url"]}}},')
|
||||
|
||||
if metadata.get('pmid'):
|
||||
lines.append(f' note = {{PMID: {metadata["pmid"]}}},')
|
||||
|
||||
if metadata.get('type') == 'arxiv' and not metadata.get('doi'):
|
||||
lines.append(f' note = {{Preprint}},')
|
||||
|
||||
# Remove trailing comma from last field
|
||||
if lines[-1].endswith(','):
|
||||
lines[-1] = lines[-1][:-1]
|
||||
|
||||
lines.append('}')
|
||||
|
||||
return '\n'.join(lines)
|
||||
|
||||
def _crossref_type_to_bibtex(self, crossref_type: str) -> str:
|
||||
"""Map CrossRef type to BibTeX entry type."""
|
||||
type_map = {
|
||||
'journal-article': 'article',
|
||||
'book': 'book',
|
||||
'book-chapter': 'incollection',
|
||||
'proceedings-article': 'inproceedings',
|
||||
'posted-content': 'misc',
|
||||
'dataset': 'misc',
|
||||
'report': 'techreport'
|
||||
}
|
||||
return type_map.get(crossref_type, 'misc')
|
||||
|
||||
def _format_authors_crossref(self, authors: List[Dict]) -> str:
|
||||
"""Format author list from CrossRef data."""
|
||||
if not authors:
|
||||
return ''
|
||||
|
||||
formatted = []
|
||||
for author in authors:
|
||||
given = author.get('given', '')
|
||||
family = author.get('family', '')
|
||||
if family:
|
||||
if given:
|
||||
formatted.append(f'{family}, {given}')
|
||||
else:
|
||||
formatted.append(family)
|
||||
|
||||
return ' and '.join(formatted)
|
||||
|
||||
def _format_authors_pubmed(self, authors: List) -> str:
|
||||
"""Format author list from PubMed XML."""
|
||||
formatted = []
|
||||
for author in authors:
|
||||
last_name = author.findtext('.//LastName', '')
|
||||
fore_name = author.findtext('.//ForeName', '')
|
||||
if last_name:
|
||||
if fore_name:
|
||||
formatted.append(f'{last_name}, {fore_name}')
|
||||
else:
|
||||
formatted.append(last_name)
|
||||
|
||||
return ' and '.join(formatted)
|
||||
|
||||
def _extract_year_crossref(self, message: Dict) -> str:
|
||||
"""Extract year from CrossRef message."""
|
||||
# Try published-print first, then published-online
|
||||
date_parts = message.get('published-print', {}).get('date-parts', [[]])
|
||||
if not date_parts or not date_parts[0]:
|
||||
date_parts = message.get('published-online', {}).get('date-parts', [[]])
|
||||
|
||||
if date_parts and date_parts[0]:
|
||||
return str(date_parts[0][0])
|
||||
return ''
|
||||
|
||||
def _extract_year_pubmed(self, article: ET.Element) -> str:
|
||||
"""Extract year from PubMed XML."""
|
||||
year = article.findtext('.//Journal/JournalIssue/PubDate/Year', '')
|
||||
if not year:
|
||||
medline_date = article.findtext('.//Journal/JournalIssue/PubDate/MedlineDate', '')
|
||||
if medline_date:
|
||||
year_match = re.search(r'\d{4}', medline_date)
|
||||
if year_match:
|
||||
year = year_match.group()
|
||||
return year
|
||||
|
||||
def _generate_citation_key(self, metadata: Dict) -> str:
|
||||
"""Generate a citation key from metadata."""
|
||||
# Get first author last name
|
||||
authors = metadata.get('authors', '')
|
||||
if authors:
|
||||
first_author = authors.split(' and ')[0]
|
||||
if ',' in first_author:
|
||||
last_name = first_author.split(',')[0].strip()
|
||||
else:
|
||||
last_name = first_author.split()[-1] if first_author else 'Unknown'
|
||||
else:
|
||||
last_name = 'Unknown'
|
||||
|
||||
# Get year
|
||||
year = metadata.get('year', '').strip()
|
||||
if not year:
|
||||
year = 'XXXX'
|
||||
|
||||
# Clean last name (remove special characters)
|
||||
last_name = re.sub(r'[^a-zA-Z]', '', last_name)
|
||||
|
||||
# Get keyword from title
|
||||
title = metadata.get('title', '')
|
||||
words = re.findall(r'\b[a-zA-Z]{4,}\b', title)
|
||||
keyword = words[0].lower() if words else 'paper'
|
||||
|
||||
return f'{last_name}{year}{keyword}'
|
||||
|
||||
def _protect_title(self, title: str) -> str:
|
||||
"""Protect capitalization in title for BibTeX."""
|
||||
# Protect common acronyms and proper nouns
|
||||
protected_words = [
|
||||
'DNA', 'RNA', 'CRISPR', 'COVID', 'HIV', 'AIDS', 'AlphaFold',
|
||||
'Python', 'AI', 'ML', 'GPU', 'CPU', 'USA', 'UK', 'EU'
|
||||
]
|
||||
|
||||
for word in protected_words:
|
||||
title = re.sub(rf'\b{word}\b', f'{{{word}}}', title, flags=re.IGNORECASE)
|
||||
|
||||
return title
|
||||
|
||||
def extract(self, identifier: str) -> Optional[str]:
|
||||
"""
|
||||
Extract metadata and return BibTeX.
|
||||
|
||||
Args:
|
||||
identifier: DOI, PMID, arXiv ID, or URL
|
||||
|
||||
Returns:
|
||||
BibTeX string or None
|
||||
"""
|
||||
id_type, clean_id = self.identify_type(identifier)
|
||||
|
||||
print(f'Identified as {id_type}: {clean_id}', file=sys.stderr)
|
||||
|
||||
metadata = None
|
||||
|
||||
if id_type == 'doi':
|
||||
metadata = self.extract_from_doi(clean_id)
|
||||
elif id_type == 'pmid':
|
||||
metadata = self.extract_from_pmid(clean_id)
|
||||
elif id_type == 'arxiv':
|
||||
metadata = self.extract_from_arxiv(clean_id)
|
||||
else:
|
||||
print(f'Error: Unknown identifier type: {identifier}', file=sys.stderr)
|
||||
return None
|
||||
|
||||
if metadata:
|
||||
return self.metadata_to_bibtex(metadata)
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
"""Command-line interface."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Extract citation metadata from DOI, PMID, arXiv ID, or URL',
|
||||
epilog='Example: python extract_metadata.py --doi 10.1038/s41586-021-03819-2'
|
||||
)
|
||||
|
||||
parser.add_argument('--doi', help='Digital Object Identifier')
|
||||
parser.add_argument('--pmid', help='PubMed ID')
|
||||
parser.add_argument('--arxiv', help='arXiv ID')
|
||||
parser.add_argument('--url', help='URL to article')
|
||||
parser.add_argument('-i', '--input', help='Input file with identifiers (one per line)')
|
||||
parser.add_argument('-o', '--output', help='Output file for BibTeX (default: stdout)')
|
||||
parser.add_argument('--format', choices=['bibtex', 'json'], default='bibtex', help='Output format')
|
||||
parser.add_argument('--email', help='Email for NCBI E-utilities (recommended)')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Collect identifiers
|
||||
identifiers = []
|
||||
if args.doi:
|
||||
identifiers.append(args.doi)
|
||||
if args.pmid:
|
||||
identifiers.append(args.pmid)
|
||||
if args.arxiv:
|
||||
identifiers.append(args.arxiv)
|
||||
if args.url:
|
||||
identifiers.append(args.url)
|
||||
|
||||
if args.input:
|
||||
try:
|
||||
with open(args.input, 'r', encoding='utf-8') as f:
|
||||
file_ids = [line.strip() for line in f if line.strip()]
|
||||
identifiers.extend(file_ids)
|
||||
except Exception as e:
|
||||
print(f'Error reading input file: {e}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
if not identifiers:
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
# Extract metadata
|
||||
extractor = MetadataExtractor(email=args.email)
|
||||
bibtex_entries = []
|
||||
|
||||
for i, identifier in enumerate(identifiers):
|
||||
print(f'\nProcessing {i+1}/{len(identifiers)}...', file=sys.stderr)
|
||||
bibtex = extractor.extract(identifier)
|
||||
if bibtex:
|
||||
bibtex_entries.append(bibtex)
|
||||
|
||||
# Rate limiting
|
||||
if i < len(identifiers) - 1:
|
||||
time.sleep(0.5)
|
||||
|
||||
if not bibtex_entries:
|
||||
print('Error: No successful extractions', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Format output
|
||||
if args.format == 'bibtex':
|
||||
output = '\n\n'.join(bibtex_entries) + '\n'
|
||||
else: # json
|
||||
output = json.dumps({
|
||||
'count': len(bibtex_entries),
|
||||
'entries': bibtex_entries
|
||||
}, indent=2)
|
||||
|
||||
# Write output
|
||||
if args.output:
|
||||
with open(args.output, 'w', encoding='utf-8') as f:
|
||||
f.write(output)
|
||||
print(f'\nSuccessfully wrote {len(bibtex_entries)} entries to {args.output}', file=sys.stderr)
|
||||
else:
|
||||
print(output)
|
||||
|
||||
print(f'\nExtracted {len(bibtex_entries)}/{len(identifiers)} entries', file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
349
skills/citation-management/scripts/format_bibtex.py
Executable file
349
skills/citation-management/scripts/format_bibtex.py
Executable file
@@ -0,0 +1,349 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
BibTeX Formatter and Cleaner
|
||||
Format, clean, sort, and deduplicate BibTeX files.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
import argparse
|
||||
from typing import List, Dict, Tuple
|
||||
from collections import OrderedDict
|
||||
|
||||
class BibTeXFormatter:
|
||||
"""Format and clean BibTeX entries."""
|
||||
|
||||
def __init__(self):
|
||||
# Standard field order for readability
|
||||
self.field_order = [
|
||||
'author', 'editor', 'title', 'booktitle', 'journal',
|
||||
'year', 'month', 'volume', 'number', 'pages',
|
||||
'publisher', 'address', 'edition', 'series',
|
||||
'school', 'institution', 'organization',
|
||||
'howpublished', 'doi', 'url', 'isbn', 'issn',
|
||||
'note', 'abstract', 'keywords'
|
||||
]
|
||||
|
||||
def parse_bibtex_file(self, filepath: str) -> List[Dict]:
|
||||
"""
|
||||
Parse BibTeX file and extract entries.
|
||||
|
||||
Args:
|
||||
filepath: Path to BibTeX file
|
||||
|
||||
Returns:
|
||||
List of entry dictionaries
|
||||
"""
|
||||
try:
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
except Exception as e:
|
||||
print(f'Error reading file: {e}', file=sys.stderr)
|
||||
return []
|
||||
|
||||
entries = []
|
||||
|
||||
# Match BibTeX entries
|
||||
pattern = r'@(\w+)\s*\{\s*([^,\s]+)\s*,(.*?)\n\}'
|
||||
matches = re.finditer(pattern, content, re.DOTALL | re.IGNORECASE)
|
||||
|
||||
for match in matches:
|
||||
entry_type = match.group(1).lower()
|
||||
citation_key = match.group(2).strip()
|
||||
fields_text = match.group(3)
|
||||
|
||||
# Parse fields
|
||||
fields = OrderedDict()
|
||||
field_pattern = r'(\w+)\s*=\s*\{([^}]*)\}|(\w+)\s*=\s*"([^"]*)"'
|
||||
field_matches = re.finditer(field_pattern, fields_text)
|
||||
|
||||
for field_match in field_matches:
|
||||
if field_match.group(1):
|
||||
field_name = field_match.group(1).lower()
|
||||
field_value = field_match.group(2)
|
||||
else:
|
||||
field_name = field_match.group(3).lower()
|
||||
field_value = field_match.group(4)
|
||||
|
||||
fields[field_name] = field_value.strip()
|
||||
|
||||
entries.append({
|
||||
'type': entry_type,
|
||||
'key': citation_key,
|
||||
'fields': fields
|
||||
})
|
||||
|
||||
return entries
|
||||
|
||||
def format_entry(self, entry: Dict) -> str:
|
||||
"""
|
||||
Format a single BibTeX entry.
|
||||
|
||||
Args:
|
||||
entry: Entry dictionary
|
||||
|
||||
Returns:
|
||||
Formatted BibTeX string
|
||||
"""
|
||||
lines = [f'@{entry["type"]}{{{entry["key"]},']
|
||||
|
||||
# Order fields according to standard order
|
||||
ordered_fields = OrderedDict()
|
||||
|
||||
# Add fields in standard order
|
||||
for field_name in self.field_order:
|
||||
if field_name in entry['fields']:
|
||||
ordered_fields[field_name] = entry['fields'][field_name]
|
||||
|
||||
# Add any remaining fields
|
||||
for field_name, field_value in entry['fields'].items():
|
||||
if field_name not in ordered_fields:
|
||||
ordered_fields[field_name] = field_value
|
||||
|
||||
# Format each field
|
||||
max_field_len = max(len(f) for f in ordered_fields.keys()) if ordered_fields else 0
|
||||
|
||||
for field_name, field_value in ordered_fields.items():
|
||||
# Pad field name for alignment
|
||||
padded_field = field_name.ljust(max_field_len)
|
||||
lines.append(f' {padded_field} = {{{field_value}}},')
|
||||
|
||||
# Remove trailing comma from last field
|
||||
if lines[-1].endswith(','):
|
||||
lines[-1] = lines[-1][:-1]
|
||||
|
||||
lines.append('}')
|
||||
|
||||
return '\n'.join(lines)
|
||||
|
||||
def fix_common_issues(self, entry: Dict) -> Dict:
|
||||
"""
|
||||
Fix common formatting issues in entry.
|
||||
|
||||
Args:
|
||||
entry: Entry dictionary
|
||||
|
||||
Returns:
|
||||
Fixed entry dictionary
|
||||
"""
|
||||
fixed = entry.copy()
|
||||
fields = fixed['fields'].copy()
|
||||
|
||||
# Fix page ranges (single hyphen to double hyphen)
|
||||
if 'pages' in fields:
|
||||
pages = fields['pages']
|
||||
# Replace single hyphen with double hyphen if it's a range
|
||||
if re.search(r'\d-\d', pages) and '--' not in pages:
|
||||
pages = re.sub(r'(\d)-(\d)', r'\1--\2', pages)
|
||||
fields['pages'] = pages
|
||||
|
||||
# Remove "pp." from pages
|
||||
if 'pages' in fields:
|
||||
pages = fields['pages']
|
||||
pages = re.sub(r'^pp\.\s*', '', pages, flags=re.IGNORECASE)
|
||||
fields['pages'] = pages
|
||||
|
||||
# Fix DOI (remove URL prefix if present)
|
||||
if 'doi' in fields:
|
||||
doi = fields['doi']
|
||||
doi = doi.replace('https://doi.org/', '')
|
||||
doi = doi.replace('http://doi.org/', '')
|
||||
doi = doi.replace('doi:', '')
|
||||
fields['doi'] = doi
|
||||
|
||||
# Fix author separators (semicolon or ampersand to 'and')
|
||||
if 'author' in fields:
|
||||
author = fields['author']
|
||||
author = author.replace(';', ' and')
|
||||
author = author.replace(' & ', ' and ')
|
||||
# Clean up multiple 'and's
|
||||
author = re.sub(r'\s+and\s+and\s+', ' and ', author)
|
||||
fields['author'] = author
|
||||
|
||||
fixed['fields'] = fields
|
||||
return fixed
|
||||
|
||||
def deduplicate_entries(self, entries: List[Dict]) -> List[Dict]:
|
||||
"""
|
||||
Remove duplicate entries based on DOI or citation key.
|
||||
|
||||
Args:
|
||||
entries: List of entry dictionaries
|
||||
|
||||
Returns:
|
||||
List of unique entries
|
||||
"""
|
||||
seen_dois = set()
|
||||
seen_keys = set()
|
||||
unique_entries = []
|
||||
|
||||
for entry in entries:
|
||||
doi = entry['fields'].get('doi', '').strip()
|
||||
key = entry['key']
|
||||
|
||||
# Check DOI first (more reliable)
|
||||
if doi:
|
||||
if doi in seen_dois:
|
||||
print(f'Duplicate DOI found: {doi} (skipping {key})', file=sys.stderr)
|
||||
continue
|
||||
seen_dois.add(doi)
|
||||
|
||||
# Check citation key
|
||||
if key in seen_keys:
|
||||
print(f'Duplicate citation key found: {key} (skipping)', file=sys.stderr)
|
||||
continue
|
||||
seen_keys.add(key)
|
||||
|
||||
unique_entries.append(entry)
|
||||
|
||||
return unique_entries
|
||||
|
||||
def sort_entries(self, entries: List[Dict], sort_by: str = 'key', descending: bool = False) -> List[Dict]:
|
||||
"""
|
||||
Sort entries by specified field.
|
||||
|
||||
Args:
|
||||
entries: List of entry dictionaries
|
||||
sort_by: Field to sort by ('key', 'year', 'author', 'title')
|
||||
descending: Sort in descending order
|
||||
|
||||
Returns:
|
||||
Sorted list of entries
|
||||
"""
|
||||
def get_sort_key(entry: Dict) -> str:
|
||||
if sort_by == 'key':
|
||||
return entry['key'].lower()
|
||||
elif sort_by == 'year':
|
||||
year = entry['fields'].get('year', '9999')
|
||||
return year
|
||||
elif sort_by == 'author':
|
||||
author = entry['fields'].get('author', 'ZZZ')
|
||||
# Get last name of first author
|
||||
if ',' in author:
|
||||
return author.split(',')[0].lower()
|
||||
else:
|
||||
return author.split()[0].lower() if author else 'zzz'
|
||||
elif sort_by == 'title':
|
||||
return entry['fields'].get('title', '').lower()
|
||||
else:
|
||||
return entry['key'].lower()
|
||||
|
||||
return sorted(entries, key=get_sort_key, reverse=descending)
|
||||
|
||||
def format_file(self, filepath: str, output: str = None,
|
||||
deduplicate: bool = False, sort_by: str = None,
|
||||
descending: bool = False, fix_issues: bool = True) -> None:
|
||||
"""
|
||||
Format entire BibTeX file.
|
||||
|
||||
Args:
|
||||
filepath: Input BibTeX file
|
||||
output: Output file (None for in-place)
|
||||
deduplicate: Remove duplicates
|
||||
sort_by: Field to sort by
|
||||
descending: Sort in descending order
|
||||
fix_issues: Fix common formatting issues
|
||||
"""
|
||||
print(f'Parsing {filepath}...', file=sys.stderr)
|
||||
entries = self.parse_bibtex_file(filepath)
|
||||
|
||||
if not entries:
|
||||
print('No entries found', file=sys.stderr)
|
||||
return
|
||||
|
||||
print(f'Found {len(entries)} entries', file=sys.stderr)
|
||||
|
||||
# Fix common issues
|
||||
if fix_issues:
|
||||
print('Fixing common issues...', file=sys.stderr)
|
||||
entries = [self.fix_common_issues(e) for e in entries]
|
||||
|
||||
# Deduplicate
|
||||
if deduplicate:
|
||||
print('Removing duplicates...', file=sys.stderr)
|
||||
original_count = len(entries)
|
||||
entries = self.deduplicate_entries(entries)
|
||||
removed = original_count - len(entries)
|
||||
if removed > 0:
|
||||
print(f'Removed {removed} duplicate(s)', file=sys.stderr)
|
||||
|
||||
# Sort
|
||||
if sort_by:
|
||||
print(f'Sorting by {sort_by}...', file=sys.stderr)
|
||||
entries = self.sort_entries(entries, sort_by, descending)
|
||||
|
||||
# Format entries
|
||||
print('Formatting entries...', file=sys.stderr)
|
||||
formatted_entries = [self.format_entry(e) for e in entries]
|
||||
|
||||
# Write output
|
||||
output_content = '\n\n'.join(formatted_entries) + '\n'
|
||||
|
||||
output_file = output or filepath
|
||||
try:
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
f.write(output_content)
|
||||
print(f'Successfully wrote {len(entries)} entries to {output_file}', file=sys.stderr)
|
||||
except Exception as e:
|
||||
print(f'Error writing file: {e}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def main():
|
||||
"""Command-line interface."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Format, clean, sort, and deduplicate BibTeX files',
|
||||
epilog='Example: python format_bibtex.py references.bib --deduplicate --sort year'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'file',
|
||||
help='BibTeX file to format'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-o', '--output',
|
||||
help='Output file (default: overwrite input file)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--deduplicate',
|
||||
action='store_true',
|
||||
help='Remove duplicate entries'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--sort',
|
||||
choices=['key', 'year', 'author', 'title'],
|
||||
help='Sort entries by field'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--descending',
|
||||
action='store_true',
|
||||
help='Sort in descending order'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--no-fix',
|
||||
action='store_true',
|
||||
help='Do not fix common issues'
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Format file
|
||||
formatter = BibTeXFormatter()
|
||||
formatter.format_file(
|
||||
args.file,
|
||||
output=args.output,
|
||||
deduplicate=args.deduplicate,
|
||||
sort_by=args.sort,
|
||||
descending=args.descending,
|
||||
fix_issues=not args.no_fix
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
282
skills/citation-management/scripts/search_google_scholar.py
Executable file
282
skills/citation-management/scripts/search_google_scholar.py
Executable file
@@ -0,0 +1,282 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Google Scholar Search Tool
|
||||
Search Google Scholar and export results.
|
||||
|
||||
Note: This script requires the 'scholarly' library.
|
||||
Install with: pip install scholarly
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
import json
|
||||
import time
|
||||
import random
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
try:
|
||||
from scholarly import scholarly, ProxyGenerator
|
||||
SCHOLARLY_AVAILABLE = True
|
||||
except ImportError:
|
||||
SCHOLARLY_AVAILABLE = False
|
||||
print('Warning: scholarly library not installed. Install with: pip install scholarly', file=sys.stderr)
|
||||
|
||||
class GoogleScholarSearcher:
|
||||
"""Search Google Scholar using scholarly library."""
|
||||
|
||||
def __init__(self, use_proxy: bool = False):
|
||||
"""
|
||||
Initialize searcher.
|
||||
|
||||
Args:
|
||||
use_proxy: Use free proxy (helps avoid rate limiting)
|
||||
"""
|
||||
if not SCHOLARLY_AVAILABLE:
|
||||
raise ImportError('scholarly library required. Install with: pip install scholarly')
|
||||
|
||||
# Setup proxy if requested
|
||||
if use_proxy:
|
||||
try:
|
||||
pg = ProxyGenerator()
|
||||
pg.FreeProxies()
|
||||
scholarly.use_proxy(pg)
|
||||
print('Using free proxy', file=sys.stderr)
|
||||
except Exception as e:
|
||||
print(f'Warning: Could not setup proxy: {e}', file=sys.stderr)
|
||||
|
||||
def search(self, query: str, max_results: int = 50,
|
||||
year_start: Optional[int] = None, year_end: Optional[int] = None,
|
||||
sort_by: str = 'relevance') -> List[Dict]:
|
||||
"""
|
||||
Search Google Scholar.
|
||||
|
||||
Args:
|
||||
query: Search query
|
||||
max_results: Maximum number of results
|
||||
year_start: Start year filter
|
||||
year_end: End year filter
|
||||
sort_by: Sort order ('relevance' or 'citations')
|
||||
|
||||
Returns:
|
||||
List of result dictionaries
|
||||
"""
|
||||
if not SCHOLARLY_AVAILABLE:
|
||||
print('Error: scholarly library not installed', file=sys.stderr)
|
||||
return []
|
||||
|
||||
print(f'Searching Google Scholar: {query}', file=sys.stderr)
|
||||
print(f'Max results: {max_results}', file=sys.stderr)
|
||||
|
||||
results = []
|
||||
|
||||
try:
|
||||
# Perform search
|
||||
search_query = scholarly.search_pubs(query)
|
||||
|
||||
for i, result in enumerate(search_query):
|
||||
if i >= max_results:
|
||||
break
|
||||
|
||||
print(f'Retrieved {i+1}/{max_results}', file=sys.stderr)
|
||||
|
||||
# Extract metadata
|
||||
metadata = {
|
||||
'title': result.get('bib', {}).get('title', ''),
|
||||
'authors': ', '.join(result.get('bib', {}).get('author', [])),
|
||||
'year': result.get('bib', {}).get('pub_year', ''),
|
||||
'venue': result.get('bib', {}).get('venue', ''),
|
||||
'abstract': result.get('bib', {}).get('abstract', ''),
|
||||
'citations': result.get('num_citations', 0),
|
||||
'url': result.get('pub_url', ''),
|
||||
'eprint_url': result.get('eprint_url', ''),
|
||||
}
|
||||
|
||||
# Filter by year
|
||||
if year_start or year_end:
|
||||
try:
|
||||
pub_year = int(metadata['year']) if metadata['year'] else 0
|
||||
if year_start and pub_year < year_start:
|
||||
continue
|
||||
if year_end and pub_year > year_end:
|
||||
continue
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
results.append(metadata)
|
||||
|
||||
# Rate limiting to avoid blocking
|
||||
time.sleep(random.uniform(2, 5))
|
||||
|
||||
except Exception as e:
|
||||
print(f'Error during search: {e}', file=sys.stderr)
|
||||
|
||||
# Sort if requested
|
||||
if sort_by == 'citations' and results:
|
||||
results.sort(key=lambda x: x.get('citations', 0), reverse=True)
|
||||
|
||||
return results
|
||||
|
||||
def metadata_to_bibtex(self, metadata: Dict) -> str:
|
||||
"""Convert metadata to BibTeX format."""
|
||||
# Generate citation key
|
||||
if metadata.get('authors'):
|
||||
first_author = metadata['authors'].split(',')[0].strip()
|
||||
last_name = first_author.split()[-1] if first_author else 'Unknown'
|
||||
else:
|
||||
last_name = 'Unknown'
|
||||
|
||||
year = metadata.get('year', 'XXXX')
|
||||
|
||||
# Get keyword from title
|
||||
import re
|
||||
title = metadata.get('title', '')
|
||||
words = re.findall(r'\b[a-zA-Z]{4,}\b', title)
|
||||
keyword = words[0].lower() if words else 'paper'
|
||||
|
||||
citation_key = f'{last_name}{year}{keyword}'
|
||||
|
||||
# Determine entry type (guess based on venue)
|
||||
venue = metadata.get('venue', '').lower()
|
||||
if 'proceedings' in venue or 'conference' in venue:
|
||||
entry_type = 'inproceedings'
|
||||
venue_field = 'booktitle'
|
||||
else:
|
||||
entry_type = 'article'
|
||||
venue_field = 'journal'
|
||||
|
||||
# Build BibTeX
|
||||
lines = [f'@{entry_type}{{{citation_key},']
|
||||
|
||||
# Convert authors format
|
||||
if metadata.get('authors'):
|
||||
authors = metadata['authors'].replace(',', ' and')
|
||||
lines.append(f' author = {{{authors}}},')
|
||||
|
||||
if metadata.get('title'):
|
||||
lines.append(f' title = {{{metadata["title"]}}},')
|
||||
|
||||
if metadata.get('venue'):
|
||||
lines.append(f' {venue_field} = {{{metadata["venue"]}}},')
|
||||
|
||||
if metadata.get('year'):
|
||||
lines.append(f' year = {{{metadata["year"]}}},')
|
||||
|
||||
if metadata.get('url'):
|
||||
lines.append(f' url = {{{metadata["url"]}}},')
|
||||
|
||||
if metadata.get('citations'):
|
||||
lines.append(f' note = {{Cited by: {metadata["citations"]}}},')
|
||||
|
||||
# Remove trailing comma
|
||||
if lines[-1].endswith(','):
|
||||
lines[-1] = lines[-1][:-1]
|
||||
|
||||
lines.append('}')
|
||||
|
||||
return '\n'.join(lines)
|
||||
|
||||
|
||||
def main():
|
||||
"""Command-line interface."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Search Google Scholar (requires scholarly library)',
|
||||
epilog='Example: python search_google_scholar.py "machine learning" --limit 50'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'query',
|
||||
help='Search query'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--limit',
|
||||
type=int,
|
||||
default=50,
|
||||
help='Maximum number of results (default: 50)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--year-start',
|
||||
type=int,
|
||||
help='Start year for filtering'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--year-end',
|
||||
type=int,
|
||||
help='End year for filtering'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--sort-by',
|
||||
choices=['relevance', 'citations'],
|
||||
default='relevance',
|
||||
help='Sort order (default: relevance)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--use-proxy',
|
||||
action='store_true',
|
||||
help='Use free proxy to avoid rate limiting'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-o', '--output',
|
||||
help='Output file (default: stdout)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--format',
|
||||
choices=['json', 'bibtex'],
|
||||
default='json',
|
||||
help='Output format (default: json)'
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if not SCHOLARLY_AVAILABLE:
|
||||
print('\nError: scholarly library not installed', file=sys.stderr)
|
||||
print('Install with: pip install scholarly', file=sys.stderr)
|
||||
print('\nAlternatively, use PubMed search for biomedical literature:', file=sys.stderr)
|
||||
print(' python search_pubmed.py "your query"', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Search
|
||||
searcher = GoogleScholarSearcher(use_proxy=args.use_proxy)
|
||||
results = searcher.search(
|
||||
args.query,
|
||||
max_results=args.limit,
|
||||
year_start=args.year_start,
|
||||
year_end=args.year_end,
|
||||
sort_by=args.sort_by
|
||||
)
|
||||
|
||||
if not results:
|
||||
print('No results found', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Format output
|
||||
if args.format == 'json':
|
||||
output = json.dumps({
|
||||
'query': args.query,
|
||||
'count': len(results),
|
||||
'results': results
|
||||
}, indent=2)
|
||||
else: # bibtex
|
||||
bibtex_entries = [searcher.metadata_to_bibtex(r) for r in results]
|
||||
output = '\n\n'.join(bibtex_entries) + '\n'
|
||||
|
||||
# Write output
|
||||
if args.output:
|
||||
with open(args.output, 'w', encoding='utf-8') as f:
|
||||
f.write(output)
|
||||
print(f'Wrote {len(results)} results to {args.output}', file=sys.stderr)
|
||||
else:
|
||||
print(output)
|
||||
|
||||
print(f'\nRetrieved {len(results)} results', file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
398
skills/citation-management/scripts/search_pubmed.py
Executable file
398
skills/citation-management/scripts/search_pubmed.py
Executable file
@@ -0,0 +1,398 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PubMed Search Tool
|
||||
Search PubMed using E-utilities API and export results.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import requests
|
||||
import argparse
|
||||
import json
|
||||
import time
|
||||
import xml.etree.ElementTree as ET
|
||||
from typing import List, Dict, Optional
|
||||
from datetime import datetime
|
||||
|
||||
class PubMedSearcher:
|
||||
"""Search PubMed using NCBI E-utilities API."""
|
||||
|
||||
def __init__(self, api_key: Optional[str] = None, email: Optional[str] = None):
|
||||
"""
|
||||
Initialize searcher.
|
||||
|
||||
Args:
|
||||
api_key: NCBI API key (optional but recommended)
|
||||
email: Email for Entrez (optional but recommended)
|
||||
"""
|
||||
self.api_key = api_key or os.getenv('NCBI_API_KEY', '')
|
||||
self.email = email or os.getenv('NCBI_EMAIL', '')
|
||||
self.base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
|
||||
self.session = requests.Session()
|
||||
|
||||
# Rate limiting
|
||||
self.delay = 0.11 if self.api_key else 0.34 # 10/sec with key, 3/sec without
|
||||
|
||||
def search(self, query: str, max_results: int = 100,
|
||||
date_start: Optional[str] = None, date_end: Optional[str] = None,
|
||||
publication_types: Optional[List[str]] = None) -> List[str]:
|
||||
"""
|
||||
Search PubMed and return PMIDs.
|
||||
|
||||
Args:
|
||||
query: Search query
|
||||
max_results: Maximum number of results
|
||||
date_start: Start date (YYYY/MM/DD or YYYY)
|
||||
date_end: End date (YYYY/MM/DD or YYYY)
|
||||
publication_types: List of publication types to filter
|
||||
|
||||
Returns:
|
||||
List of PMIDs
|
||||
"""
|
||||
# Build query with filters
|
||||
full_query = query
|
||||
|
||||
# Add date range
|
||||
if date_start or date_end:
|
||||
start = date_start or '1900'
|
||||
end = date_end or datetime.now().strftime('%Y')
|
||||
full_query += f' AND {start}:{end}[Publication Date]'
|
||||
|
||||
# Add publication types
|
||||
if publication_types:
|
||||
pub_type_query = ' OR '.join([f'"{pt}"[Publication Type]' for pt in publication_types])
|
||||
full_query += f' AND ({pub_type_query})'
|
||||
|
||||
print(f'Searching PubMed: {full_query}', file=sys.stderr)
|
||||
|
||||
# ESearch to get PMIDs
|
||||
esearch_url = self.base_url + 'esearch.fcgi'
|
||||
params = {
|
||||
'db': 'pubmed',
|
||||
'term': full_query,
|
||||
'retmax': max_results,
|
||||
'retmode': 'json'
|
||||
}
|
||||
|
||||
if self.email:
|
||||
params['email'] = self.email
|
||||
if self.api_key:
|
||||
params['api_key'] = self.api_key
|
||||
|
||||
try:
|
||||
response = self.session.get(esearch_url, params=params, timeout=30)
|
||||
response.raise_for_status()
|
||||
|
||||
data = response.json()
|
||||
pmids = data['esearchresult']['idlist']
|
||||
count = int(data['esearchresult']['count'])
|
||||
|
||||
print(f'Found {count} results, retrieving {len(pmids)}', file=sys.stderr)
|
||||
|
||||
return pmids
|
||||
|
||||
except Exception as e:
|
||||
print(f'Error searching PubMed: {e}', file=sys.stderr)
|
||||
return []
|
||||
|
||||
def fetch_metadata(self, pmids: List[str]) -> List[Dict]:
|
||||
"""
|
||||
Fetch metadata for PMIDs.
|
||||
|
||||
Args:
|
||||
pmids: List of PubMed IDs
|
||||
|
||||
Returns:
|
||||
List of metadata dictionaries
|
||||
"""
|
||||
if not pmids:
|
||||
return []
|
||||
|
||||
metadata_list = []
|
||||
|
||||
# Fetch in batches of 200
|
||||
batch_size = 200
|
||||
for i in range(0, len(pmids), batch_size):
|
||||
batch = pmids[i:i+batch_size]
|
||||
print(f'Fetching metadata for PMIDs {i+1}-{min(i+batch_size, len(pmids))}...', file=sys.stderr)
|
||||
|
||||
efetch_url = self.base_url + 'efetch.fcgi'
|
||||
params = {
|
||||
'db': 'pubmed',
|
||||
'id': ','.join(batch),
|
||||
'retmode': 'xml',
|
||||
'rettype': 'abstract'
|
||||
}
|
||||
|
||||
if self.email:
|
||||
params['email'] = self.email
|
||||
if self.api_key:
|
||||
params['api_key'] = self.api_key
|
||||
|
||||
try:
|
||||
response = self.session.get(efetch_url, params=params, timeout=60)
|
||||
response.raise_for_status()
|
||||
|
||||
# Parse XML
|
||||
root = ET.fromstring(response.content)
|
||||
articles = root.findall('.//PubmedArticle')
|
||||
|
||||
for article in articles:
|
||||
metadata = self._extract_metadata_from_xml(article)
|
||||
if metadata:
|
||||
metadata_list.append(metadata)
|
||||
|
||||
# Rate limiting
|
||||
time.sleep(self.delay)
|
||||
|
||||
except Exception as e:
|
||||
print(f'Error fetching metadata for batch: {e}', file=sys.stderr)
|
||||
continue
|
||||
|
||||
return metadata_list
|
||||
|
||||
def _extract_metadata_from_xml(self, article: ET.Element) -> Optional[Dict]:
|
||||
"""Extract metadata from PubmedArticle XML element."""
|
||||
try:
|
||||
medline_citation = article.find('.//MedlineCitation')
|
||||
article_elem = medline_citation.find('.//Article')
|
||||
journal = article_elem.find('.//Journal')
|
||||
|
||||
# Get PMID
|
||||
pmid = medline_citation.findtext('.//PMID', '')
|
||||
|
||||
# Get DOI
|
||||
doi = None
|
||||
article_ids = article.findall('.//ArticleId')
|
||||
for article_id in article_ids:
|
||||
if article_id.get('IdType') == 'doi':
|
||||
doi = article_id.text
|
||||
break
|
||||
|
||||
# Get authors
|
||||
authors = []
|
||||
author_list = article_elem.find('.//AuthorList')
|
||||
if author_list is not None:
|
||||
for author in author_list.findall('.//Author'):
|
||||
last_name = author.findtext('.//LastName', '')
|
||||
fore_name = author.findtext('.//ForeName', '')
|
||||
if last_name:
|
||||
if fore_name:
|
||||
authors.append(f'{last_name}, {fore_name}')
|
||||
else:
|
||||
authors.append(last_name)
|
||||
|
||||
# Get year
|
||||
year = article_elem.findtext('.//Journal/JournalIssue/PubDate/Year', '')
|
||||
if not year:
|
||||
medline_date = article_elem.findtext('.//Journal/JournalIssue/PubDate/MedlineDate', '')
|
||||
if medline_date:
|
||||
import re
|
||||
year_match = re.search(r'\d{4}', medline_date)
|
||||
if year_match:
|
||||
year = year_match.group()
|
||||
|
||||
metadata = {
|
||||
'pmid': pmid,
|
||||
'doi': doi,
|
||||
'title': article_elem.findtext('.//ArticleTitle', ''),
|
||||
'authors': ' and '.join(authors),
|
||||
'journal': journal.findtext('.//Title', ''),
|
||||
'year': year,
|
||||
'volume': journal.findtext('.//JournalIssue/Volume', ''),
|
||||
'issue': journal.findtext('.//JournalIssue/Issue', ''),
|
||||
'pages': article_elem.findtext('.//Pagination/MedlinePgn', ''),
|
||||
'abstract': article_elem.findtext('.//Abstract/AbstractText', '')
|
||||
}
|
||||
|
||||
return metadata
|
||||
|
||||
except Exception as e:
|
||||
print(f'Error extracting metadata: {e}', file=sys.stderr)
|
||||
return None
|
||||
|
||||
def metadata_to_bibtex(self, metadata: Dict) -> str:
|
||||
"""Convert metadata to BibTeX format."""
|
||||
# Generate citation key
|
||||
if metadata.get('authors'):
|
||||
first_author = metadata['authors'].split(' and ')[0]
|
||||
if ',' in first_author:
|
||||
last_name = first_author.split(',')[0].strip()
|
||||
else:
|
||||
last_name = first_author.split()[0]
|
||||
else:
|
||||
last_name = 'Unknown'
|
||||
|
||||
year = metadata.get('year', 'XXXX')
|
||||
citation_key = f'{last_name}{year}pmid{metadata.get("pmid", "")}'
|
||||
|
||||
# Build BibTeX entry
|
||||
lines = [f'@article{{{citation_key},']
|
||||
|
||||
if metadata.get('authors'):
|
||||
lines.append(f' author = {{{metadata["authors"]}}},')
|
||||
|
||||
if metadata.get('title'):
|
||||
lines.append(f' title = {{{metadata["title"]}}},')
|
||||
|
||||
if metadata.get('journal'):
|
||||
lines.append(f' journal = {{{metadata["journal"]}}},')
|
||||
|
||||
if metadata.get('year'):
|
||||
lines.append(f' year = {{{metadata["year"]}}},')
|
||||
|
||||
if metadata.get('volume'):
|
||||
lines.append(f' volume = {{{metadata["volume"]}}},')
|
||||
|
||||
if metadata.get('issue'):
|
||||
lines.append(f' number = {{{metadata["issue"]}}},')
|
||||
|
||||
if metadata.get('pages'):
|
||||
pages = metadata['pages'].replace('-', '--')
|
||||
lines.append(f' pages = {{{pages}}},')
|
||||
|
||||
if metadata.get('doi'):
|
||||
lines.append(f' doi = {{{metadata["doi"]}}},')
|
||||
|
||||
if metadata.get('pmid'):
|
||||
lines.append(f' note = {{PMID: {metadata["pmid"]}}},')
|
||||
|
||||
# Remove trailing comma
|
||||
if lines[-1].endswith(','):
|
||||
lines[-1] = lines[-1][:-1]
|
||||
|
||||
lines.append('}')
|
||||
|
||||
return '\n'.join(lines)
|
||||
|
||||
|
||||
def main():
|
||||
"""Command-line interface."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Search PubMed using E-utilities API',
|
||||
epilog='Example: python search_pubmed.py "CRISPR gene editing" --limit 100'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'query',
|
||||
nargs='?',
|
||||
help='Search query (PubMed syntax)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--query',
|
||||
dest='query_arg',
|
||||
help='Search query (alternative to positional argument)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--query-file',
|
||||
help='File containing search query'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--limit',
|
||||
type=int,
|
||||
default=100,
|
||||
help='Maximum number of results (default: 100)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--date-start',
|
||||
help='Start date (YYYY/MM/DD or YYYY)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--date-end',
|
||||
help='End date (YYYY/MM/DD or YYYY)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--publication-types',
|
||||
help='Comma-separated publication types (e.g., "Review,Clinical Trial")'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-o', '--output',
|
||||
help='Output file (default: stdout)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--format',
|
||||
choices=['json', 'bibtex'],
|
||||
default='json',
|
||||
help='Output format (default: json)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--api-key',
|
||||
help='NCBI API key (or set NCBI_API_KEY env var)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--email',
|
||||
help='Email for Entrez (or set NCBI_EMAIL env var)'
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Get query
|
||||
query = args.query or args.query_arg
|
||||
|
||||
if args.query_file:
|
||||
try:
|
||||
with open(args.query_file, 'r', encoding='utf-8') as f:
|
||||
query = f.read().strip()
|
||||
except Exception as e:
|
||||
print(f'Error reading query file: {e}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
if not query:
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
# Parse publication types
|
||||
pub_types = None
|
||||
if args.publication_types:
|
||||
pub_types = [pt.strip() for pt in args.publication_types.split(',')]
|
||||
|
||||
# Search PubMed
|
||||
searcher = PubMedSearcher(api_key=args.api_key, email=args.email)
|
||||
pmids = searcher.search(
|
||||
query,
|
||||
max_results=args.limit,
|
||||
date_start=args.date_start,
|
||||
date_end=args.date_end,
|
||||
publication_types=pub_types
|
||||
)
|
||||
|
||||
if not pmids:
|
||||
print('No results found', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Fetch metadata
|
||||
metadata_list = searcher.fetch_metadata(pmids)
|
||||
|
||||
# Format output
|
||||
if args.format == 'json':
|
||||
output = json.dumps({
|
||||
'query': query,
|
||||
'count': len(metadata_list),
|
||||
'results': metadata_list
|
||||
}, indent=2)
|
||||
else: # bibtex
|
||||
bibtex_entries = [searcher.metadata_to_bibtex(m) for m in metadata_list]
|
||||
output = '\n\n'.join(bibtex_entries) + '\n'
|
||||
|
||||
    # Write output
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f'Wrote {len(metadata_list)} results to {args.output}', file=sys.stderr)
    else:
        print(output)


if __name__ == '__main__':
    main()

497
skills/citation-management/scripts/validate_citations.py
Executable file
@@ -0,0 +1,497 @@
#!/usr/bin/env python3
"""
Citation Validation Tool
Validate BibTeX files for accuracy, completeness, and format compliance.
"""

import sys
import re
import requests
import argparse
import json
from typing import Dict, List, Tuple, Optional
from collections import defaultdict

class CitationValidator:
    """Validate BibTeX entries for errors and inconsistencies."""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'CitationValidator/1.0 (Citation Management Tool)'
        })

        # Required fields by entry type
        self.required_fields = {
            'article': ['author', 'title', 'journal', 'year'],
            'book': ['author', 'title', 'publisher', 'year'],  # author OR editor (handled in validate_entry)
            'inproceedings': ['author', 'title', 'booktitle', 'year'],
            'incollection': ['author', 'title', 'booktitle', 'publisher', 'year'],
            'phdthesis': ['author', 'title', 'school', 'year'],
            'mastersthesis': ['author', 'title', 'school', 'year'],
            'techreport': ['author', 'title', 'institution', 'year'],
            'misc': ['title', 'year']
        }

        # Recommended fields
        self.recommended_fields = {
            'article': ['volume', 'pages', 'doi'],
            'book': ['isbn'],
            'inproceedings': ['pages'],
        }

    def parse_bibtex_file(self, filepath: str) -> List[Dict]:
        """
        Parse BibTeX file and extract entries.

        Args:
            filepath: Path to BibTeX file

        Returns:
            List of entry dictionaries
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            print(f'Error reading file: {e}', file=sys.stderr)
            return []

        entries = []

        # Match BibTeX entries
        pattern = r'@(\w+)\s*\{\s*([^,\s]+)\s*,(.*?)\n\}'
        matches = re.finditer(pattern, content, re.DOTALL | re.IGNORECASE)

        for match in matches:
            entry_type = match.group(1).lower()
            citation_key = match.group(2).strip()
            fields_text = match.group(3)

            # Parse fields
            fields = {}
            field_pattern = r'(\w+)\s*=\s*\{([^}]*)\}|(\w+)\s*=\s*"([^"]*)"'
            field_matches = re.finditer(field_pattern, fields_text)

            for field_match in field_matches:
                if field_match.group(1):
                    field_name = field_match.group(1).lower()
                    field_value = field_match.group(2)
                else:
                    field_name = field_match.group(3).lower()
                    field_value = field_match.group(4)

                fields[field_name] = field_value.strip()

            entries.append({
                'type': entry_type,
                'key': citation_key,
                'fields': fields,
                'raw': match.group(0)
            })

        return entries

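    # For a hypothetical input file, each parsed entry is a plain dict such as:
    #
    #   {'type': 'article',
    #    'key': 'Smith2021',
    #    'fields': {'author': 'Smith, Jane', 'title': 'An example title',
    #               'journal': 'Example Journal', 'year': '2021'},
    #    'raw': '@article{Smith2021, ...}'}
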
    def validate_entry(self, entry: Dict) -> Tuple[List[Dict], List[Dict]]:
        """
        Validate a single BibTeX entry.

        Args:
            entry: Entry dictionary

        Returns:
            Tuple of (errors, warnings)
        """
        errors = []
        warnings = []

        entry_type = entry['type']
        key = entry['key']
        fields = entry['fields']

        # Check required fields
        if entry_type in self.required_fields:
            for req_field in self.required_fields[entry_type]:
                if req_field not in fields or not fields[req_field]:
                    # Special case: book can have author OR editor
                    if entry_type == 'book' and req_field == 'author':
                        if 'editor' not in fields or not fields['editor']:
                            errors.append({
                                'type': 'missing_required_field',
                                'field': 'author or editor',
                                'severity': 'high',
                                'message': f'Entry {key}: Missing required field "author" or "editor"'
                            })
                    else:
                        errors.append({
                            'type': 'missing_required_field',
                            'field': req_field,
                            'severity': 'high',
                            'message': f'Entry {key}: Missing required field "{req_field}"'
                        })

        # Check recommended fields
        if entry_type in self.recommended_fields:
            for rec_field in self.recommended_fields[entry_type]:
                if rec_field not in fields or not fields[rec_field]:
                    warnings.append({
                        'type': 'missing_recommended_field',
                        'field': rec_field,
                        'severity': 'medium',
                        'message': f'Entry {key}: Missing recommended field "{rec_field}"'
                    })

        # Validate year
        if 'year' in fields:
            year = fields['year']
            if not re.match(r'^\d{4}$', year):
                errors.append({
                    'type': 'invalid_year',
                    'field': 'year',
                    'value': year,
                    'severity': 'high',
                    'message': f'Entry {key}: Invalid year format "{year}" (should be 4 digits)'
                })
            elif int(year) < 1600 or int(year) > 2030:
                warnings.append({
                    'type': 'suspicious_year',
                    'field': 'year',
                    'value': year,
                    'severity': 'medium',
                    'message': f'Entry {key}: Suspicious year "{year}" (outside reasonable range)'
                })

        # Validate DOI format
        if 'doi' in fields:
            doi = fields['doi']
            if not re.match(r'^10\.\d{4,}/[^\s]+$', doi):
                warnings.append({
                    'type': 'invalid_doi_format',
                    'field': 'doi',
                    'value': doi,
                    'severity': 'medium',
                    'message': f'Entry {key}: Invalid DOI format "{doi}"'
                })

        # Check for single hyphen in pages (should be --)
        if 'pages' in fields:
            pages = fields['pages']
            if re.search(r'\d-\d', pages) and '--' not in pages:
                warnings.append({
                    'type': 'page_range_format',
                    'field': 'pages',
                    'value': pages,
                    'severity': 'low',
                    'message': f'Entry {key}: Page range uses single hyphen, should use -- (en-dash)'
                })

        # Check author format
        if 'author' in fields:
            author = fields['author']
            if ';' in author or '&' in author:
                errors.append({
                    'type': 'invalid_author_format',
                    'field': 'author',
                    'severity': 'high',
                    'message': f'Entry {key}: Authors should be separated by " and ", not ";" or "&"'
                })

        return errors, warnings

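    # validate_entry() returns ([errors], [warnings]); each item is a dict, e.g.
    # (hypothetical):
    #
    #   {'type': 'missing_required_field', 'field': 'journal',
    #    'severity': 'high',
    #    'message': 'Entry Smith2021: Missing required field "journal"'}
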
    def verify_doi(self, doi: str) -> Tuple[bool, Optional[Dict]]:
        """
        Verify DOI resolves correctly and get metadata.

        Args:
            doi: Digital Object Identifier

        Returns:
            Tuple of (is_valid, metadata)
        """
        try:
            url = f'https://doi.org/{doi}'
            response = self.session.head(url, timeout=10, allow_redirects=True)

            if response.status_code < 400:
                # DOI resolves, now get metadata from CrossRef
                crossref_url = f'https://api.crossref.org/works/{doi}'
                metadata_response = self.session.get(crossref_url, timeout=10)

                if metadata_response.status_code == 200:
                    data = metadata_response.json()
                    message = data.get('message', {})

                    # Extract key metadata
                    metadata = {
                        'title': message.get('title', [''])[0],
                        'year': self._extract_year_crossref(message),
                        'authors': self._format_authors_crossref(message.get('author', [])),
                    }
                    return True, metadata
                else:
                    return True, None  # DOI resolves but no CrossRef metadata
            else:
                return False, None

        except Exception:
            return False, None

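    # Minimal usage sketch (needs network access; the DOI below is illustrative):
    #
    #   ok, meta = CitationValidator().verify_doi('10.1234/example-doi')
    #   # ok is False when the DOI does not resolve; meta can still be None when
    #   # it resolves but CrossRef has no metadata for it.
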
    def detect_duplicates(self, entries: List[Dict]) -> List[Dict]:
        """
        Detect duplicate entries.

        Args:
            entries: List of entry dictionaries

        Returns:
            List of duplicate groups
        """
        duplicates = []

        # Check for duplicate DOIs
        doi_map = defaultdict(list)
        for entry in entries:
            doi = entry['fields'].get('doi', '').strip()
            if doi:
                doi_map[doi].append(entry['key'])

        for doi, keys in doi_map.items():
            if len(keys) > 1:
                duplicates.append({
                    'type': 'duplicate_doi',
                    'doi': doi,
                    'entries': keys,
                    'severity': 'high',
                    'message': f'Duplicate DOI {doi} found in entries: {", ".join(keys)}'
                })

        # Check for duplicate citation keys
        key_counts = defaultdict(int)
        for entry in entries:
            key_counts[entry['key']] += 1

        for key, count in key_counts.items():
            if count > 1:
                duplicates.append({
                    'type': 'duplicate_key',
                    'key': key,
                    'count': count,
                    'severity': 'high',
                    'message': f'Citation key "{key}" appears {count} times'
                })

        # Check for similar titles (possible duplicates)
        titles = {}
        for entry in entries:
            title = entry['fields'].get('title', '').lower()
            title = re.sub(r'[^\w\s]', '', title)  # Remove punctuation
            title = ' '.join(title.split())  # Normalize whitespace

            if title:
                if title in titles:
                    duplicates.append({
                        'type': 'similar_title',
                        'entries': [titles[title], entry['key']],
                        'severity': 'medium',
                        'message': f'Possible duplicate: "{titles[title]}" and "{entry["key"]}" have identical titles'
                    })
                else:
                    titles[title] = entry['key']

        return duplicates

    def validate_file(self, filepath: str, check_dois: bool = False) -> Dict:
        """
        Validate entire BibTeX file.

        Args:
            filepath: Path to BibTeX file
            check_dois: Whether to verify DOIs (slow)

        Returns:
            Validation report dictionary
        """
        print(f'Parsing {filepath}...', file=sys.stderr)
        entries = self.parse_bibtex_file(filepath)

        if not entries:
            return {
                'total_entries': 0,
                'errors': [],
                'warnings': [],
                'duplicates': []
            }

        print(f'Found {len(entries)} entries', file=sys.stderr)

        all_errors = []
        all_warnings = []

        # Validate each entry
        for i, entry in enumerate(entries):
            print(f'Validating entry {i+1}/{len(entries)}: {entry["key"]}', file=sys.stderr)
            errors, warnings = self.validate_entry(entry)

            for error in errors:
                error['entry'] = entry['key']
                all_errors.append(error)

            for warning in warnings:
                warning['entry'] = entry['key']
                all_warnings.append(warning)

        # Check for duplicates
        print('Checking for duplicates...', file=sys.stderr)
        duplicates = self.detect_duplicates(entries)

        # Verify DOIs if requested
        doi_errors = []
        if check_dois:
            print('Verifying DOIs...', file=sys.stderr)
            for i, entry in enumerate(entries):
                doi = entry['fields'].get('doi', '')
                if doi:
                    print(f'Verifying DOI {i+1}: {doi}', file=sys.stderr)
                    is_valid, metadata = self.verify_doi(doi)

                    if not is_valid:
                        doi_errors.append({
                            'type': 'invalid_doi',
                            'entry': entry['key'],
                            'doi': doi,
                            'severity': 'high',
                            'message': f'Entry {entry["key"]}: DOI does not resolve: {doi}'
                        })

            all_errors.extend(doi_errors)

        return {
            'filepath': filepath,
            'total_entries': len(entries),
            # Count distinct entries with high-severity errors so an entry with
            # several errors is only subtracted once
            'valid_entries': len(entries) - len({e['entry'] for e in all_errors if e['severity'] == 'high'}),
            'errors': all_errors,
            'warnings': all_warnings,
            'duplicates': duplicates
        }

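    # The report returned by validate_file() is JSON-serializable, e.g.
    # (hypothetical counts):
    #
    #   {'filepath': 'references.bib', 'total_entries': 42, 'valid_entries': 40,
    #    'errors': [...], 'warnings': [...], 'duplicates': [...]}
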
    def _extract_year_crossref(self, message: Dict) -> str:
        """Extract year from CrossRef message."""
        date_parts = message.get('published-print', {}).get('date-parts', [[]])
        if not date_parts or not date_parts[0]:
            date_parts = message.get('published-online', {}).get('date-parts', [[]])

        if date_parts and date_parts[0]:
            return str(date_parts[0][0])
        return ''

    def _format_authors_crossref(self, authors: List[Dict]) -> str:
        """Format author list from CrossRef."""
        if not authors:
            return ''

        formatted = []
        for author in authors[:3]:  # First 3 authors
            given = author.get('given', '')
            family = author.get('family', '')
            if family:
                formatted.append(f'{family}, {given}' if given else family)

        if len(authors) > 3:
            formatted.append('et al.')

        return ', '.join(formatted)


def main():
    """Command-line interface."""
    parser = argparse.ArgumentParser(
        description='Validate BibTeX files for errors and inconsistencies',
        epilog='Example: python validate_citations.py references.bib'
    )

    parser.add_argument(
        'file',
        help='BibTeX file to validate'
    )

    parser.add_argument(
        '--check-dois',
        action='store_true',
        help='Verify DOIs resolve correctly (slow)'
    )

    parser.add_argument(
        '--auto-fix',
        action='store_true',
        help='Attempt to auto-fix common issues (not implemented yet)'
    )

    parser.add_argument(
        '--report',
        help='Output file for JSON validation report'
    )

    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed output'
    )

    args = parser.parse_args()

    # Validate file
    validator = CitationValidator()
    report = validator.validate_file(args.file, check_dois=args.check_dois)

    # Print summary
    print('\n' + '='*60)
    print('CITATION VALIDATION REPORT')
    print('='*60)
    print(f'\nFile: {args.file}')
    print(f'Total entries: {report["total_entries"]}')
    print(f'Valid entries: {report["valid_entries"]}')
    print(f'Errors: {len(report["errors"])}')
    print(f'Warnings: {len(report["warnings"])}')
    print(f'Duplicates: {len(report["duplicates"])}')

    # Print errors
    if report['errors']:
        print('\n' + '-'*60)
        print('ERRORS (must fix):')
        print('-'*60)
        for error in report['errors']:
            print(f'\n{error["message"]}')
            if args.verbose:
                print(f' Type: {error["type"]}')
                print(f' Severity: {error["severity"]}')

    # Print warnings
    if report['warnings'] and args.verbose:
        print('\n' + '-'*60)
        print('WARNINGS (should fix):')
        print('-'*60)
        for warning in report['warnings']:
            print(f'\n{warning["message"]}')

    # Print duplicates
    if report['duplicates']:
        print('\n' + '-'*60)
        print('DUPLICATES:')
        print('-'*60)
        for dup in report['duplicates']:
            print(f'\n{dup["message"]}')

    # Save report
    if args.report:
        with open(args.report, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)
        print(f'\nDetailed report saved to: {args.report}')

    # Exit with error code if there are errors
    if report['errors']:
        sys.exit(1)


if __name__ == '__main__':
    main()
