Initial commit

Zhongwei Li
2025-11-30 08:30:14 +08:00
commit 1dd5bee3b4
335 changed files with 147360 additions and 0 deletions

doi_to_bibtex.py

@@ -0,0 +1,204 @@
#!/usr/bin/env python3
"""
DOI to BibTeX Converter
Quick utility to convert DOIs to BibTeX format using CrossRef API.
"""
import sys
import requests
import argparse
import time
import json
from typing import Optional, List
class DOIConverter:
"""Convert DOIs to BibTeX entries using CrossRef API."""
def __init__(self):
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'DOIConverter/1.0 (Citation Management Tool; mailto:support@example.com)'
})
def doi_to_bibtex(self, doi: str) -> Optional[str]:
"""
Convert a single DOI to BibTeX format.
Args:
doi: Digital Object Identifier
Returns:
BibTeX string or None if conversion fails
"""
# Clean DOI (remove URL prefix if present)
doi = doi.strip()
if doi.startswith('https://doi.org/'):
doi = doi.replace('https://doi.org/', '')
elif doi.startswith('http://doi.org/'):
doi = doi.replace('http://doi.org/', '')
elif doi.startswith('doi:'):
doi = doi.replace('doi:', '')
# Request BibTeX from CrossRef content negotiation
url = f'https://doi.org/{doi}'
headers = {
'Accept': 'application/x-bibtex',
'User-Agent': 'DOIConverter/1.0 (Citation Management Tool)'
}
try:
response = self.session.get(url, headers=headers, timeout=15)
if response.status_code == 200:
bibtex = response.text.strip()
# CrossRef sometimes returns entries with @data type, convert to @misc
if bibtex.startswith('@data{'):
bibtex = bibtex.replace('@data{', '@misc{', 1)
return bibtex
elif response.status_code == 404:
print(f'Error: DOI not found: {doi}', file=sys.stderr)
return None
else:
print(f'Error: Failed to retrieve BibTeX for {doi} (status {response.status_code})', file=sys.stderr)
return None
except requests.exceptions.Timeout:
print(f'Error: Request timeout for DOI: {doi}', file=sys.stderr)
return None
except requests.exceptions.RequestException as e:
print(f'Error: Request failed for {doi}: {e}', file=sys.stderr)
return None
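# Example (hypothetical DOI): doi_to_bibtex('https://doi.org/10.1000/xyz123') strips
# the URL prefix, then requests https://doi.org/10.1000/xyz123 with the header
# Accept: application/x-bibtex.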
def convert_multiple(self, dois: List[str], delay: float = 0.5) -> List[str]:
"""
Convert multiple DOIs to BibTeX.
Args:
dois: List of DOIs
delay: Delay between requests (seconds) for rate limiting
Returns:
List of BibTeX entries (excludes failed conversions)
"""
bibtex_entries = []
for i, doi in enumerate(dois):
print(f'Converting DOI {i+1}/{len(dois)}: {doi}', file=sys.stderr)
bibtex = self.doi_to_bibtex(doi)
if bibtex:
bibtex_entries.append(bibtex)
# Rate limiting
if i < len(dois) - 1: # Don't delay after last request
time.sleep(delay)
return bibtex_entries
def main():
"""Command-line interface."""
parser = argparse.ArgumentParser(
description='Convert DOIs to BibTeX format using CrossRef API',
epilog='Example: python doi_to_bibtex.py 10.1038/s41586-021-03819-2'
)
parser.add_argument(
'dois',
nargs='*',
help='DOI(s) to convert (can provide multiple)'
)
parser.add_argument(
'-i', '--input',
help='Input file with DOIs (one per line)'
)
parser.add_argument(
'-o', '--output',
help='Output file for BibTeX (default: stdout)'
)
parser.add_argument(
'--delay',
type=float,
default=0.5,
help='Delay between requests in seconds (default: 0.5)'
)
parser.add_argument(
'--format',
choices=['bibtex', 'json'],
default='bibtex',
help='Output format (default: bibtex)'
)
args = parser.parse_args()
# Collect DOIs from command line and/or file
dois = []
if args.dois:
dois.extend(args.dois)
if args.input:
try:
with open(args.input, 'r', encoding='utf-8') as f:
file_dois = [line.strip() for line in f if line.strip()]
dois.extend(file_dois)
except FileNotFoundError:
print(f'Error: Input file not found: {args.input}', file=sys.stderr)
sys.exit(1)
except Exception as e:
print(f'Error reading input file: {e}', file=sys.stderr)
sys.exit(1)
if not dois:
parser.print_help()
sys.exit(1)
# Convert DOIs
converter = DOIConverter()
if len(dois) == 1:
bibtex = converter.doi_to_bibtex(dois[0])
if bibtex:
bibtex_entries = [bibtex]
else:
sys.exit(1)
else:
bibtex_entries = converter.convert_multiple(dois, delay=args.delay)
if not bibtex_entries:
print('Error: No successful conversions', file=sys.stderr)
sys.exit(1)
# Format output
if args.format == 'bibtex':
output = '\n\n'.join(bibtex_entries) + '\n'
else: # json
output = json.dumps({
'count': len(bibtex_entries),
'entries': bibtex_entries
}, indent=2)
# Write output
if args.output:
try:
with open(args.output, 'w', encoding='utf-8') as f:
f.write(output)
print(f'Successfully wrote {len(bibtex_entries)} entries to {args.output}', file=sys.stderr)
except Exception as e:
print(f'Error writing output file: {e}', file=sys.stderr)
sys.exit(1)
else:
print(output)
# Summary
if len(dois) > 1:
success_rate = len(bibtex_entries) / len(dois) * 100
print(f'\nConverted {len(bibtex_entries)}/{len(dois)} DOIs ({success_rate:.1f}%)', file=sys.stderr)
if __name__ == '__main__':
main()
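
A minimal usage sketch for the converter above, imported as a module (assuming it is saved as doi_to_bibtex.py, the name used in its epilog; the second DOI is a hypothetical placeholder):

from doi_to_bibtex import DOIConverter

converter = DOIConverter()

# Single conversion; returns None on failure
bibtex = converter.doi_to_bibtex('10.1038/s41586-021-03819-2')
if bibtex:
    print(bibtex)

# Batch conversion with a delay between requests; URL and 'doi:' prefixes
# are stripped automatically
entries = converter.convert_multiple([
    '10.1038/s41586-021-03819-2',
    'doi:10.1000/xyz123',  # hypothetical placeholder DOI
], delay=0.5)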

extract_metadata.py

@@ -0,0 +1,569 @@
#!/usr/bin/env python3
"""
Metadata Extraction Tool
Extract citation metadata from DOI, PMID, arXiv ID, or URL using various APIs.
"""
import sys
import os
import requests
import argparse
import time
import re
import json
import xml.etree.ElementTree as ET
from typing import Optional, Dict, List, Tuple
from urllib.parse import urlparse
class MetadataExtractor:
"""Extract metadata from various sources and generate BibTeX."""
def __init__(self, email: Optional[str] = None):
"""
Initialize extractor.
Args:
email: Email for Entrez API (recommended for PubMed)
"""
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'MetadataExtractor/1.0 (Citation Management Tool)'
})
self.email = email or os.getenv('NCBI_EMAIL', '')
def identify_type(self, identifier: str) -> Tuple[str, str]:
"""
Identify the type of identifier.
Args:
identifier: DOI, PMID, arXiv ID, or URL
Returns:
Tuple of (type, cleaned_identifier)
"""
identifier = identifier.strip()
# Check if URL
if identifier.startswith('http://') or identifier.startswith('https://'):
return self._parse_url(identifier)
# Check for DOI
if identifier.startswith('10.'):
return ('doi', identifier)
# Check for arXiv ID
if re.match(r'^\d{4}\.\d{4,5}(v\d+)?$', identifier):
return ('arxiv', identifier)
if identifier.startswith('arXiv:'):
return ('arxiv', identifier.replace('arXiv:', ''))
# Check for PMID (numeric ID, typically 7 or 8 digits)
if identifier.isdigit() and len(identifier) >= 7:
return ('pmid', identifier)
# Check for PMCID
if identifier.upper().startswith('PMC') and identifier[3:].isdigit():
return ('pmcid', identifier.upper())
return ('unknown', identifier)
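# Examples (hypothetical identifiers):
#   '10.1000/xyz123'   -> ('doi', '10.1000/xyz123')
#   'arXiv:2301.12345' -> ('arxiv', '2301.12345')
#   'PMC1234567'       -> ('pmcid', 'PMC1234567')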
def _parse_url(self, url: str) -> Tuple[str, str]:
"""Parse URL to extract identifier type and value."""
parsed = urlparse(url)
# DOI URLs
if 'doi.org' in parsed.netloc:
doi = parsed.path.lstrip('/')
return ('doi', doi)
# PubMed URLs
if 'pubmed.ncbi.nlm.nih.gov' in parsed.netloc or 'ncbi.nlm.nih.gov/pubmed' in url:
pmid = re.search(r'/(\d+)', parsed.path)
if pmid:
return ('pmid', pmid.group(1))
# arXiv URLs
if 'arxiv.org' in parsed.netloc:
arxiv_id = re.search(r'/abs/(\d{4}\.\d{4,5}(?:v\d+)?)', parsed.path)
if arxiv_id:
return ('arxiv', arxiv_id.group(1))
# Nature, Science, Cell, etc. - try to extract DOI from URL
doi_match = re.search(r'10\.\d{4,}/[^\s/]+', url)
if doi_match:
return ('doi', doi_match.group())
return ('url', url)
def extract_from_doi(self, doi: str) -> Optional[Dict]:
"""
Extract metadata from DOI using CrossRef API.
Args:
doi: Digital Object Identifier
Returns:
Metadata dictionary or None
"""
url = f'https://api.crossref.org/works/{doi}'
try:
response = self.session.get(url, timeout=15)
if response.status_code == 200:
data = response.json()
message = data.get('message', {})
metadata = {
'type': 'doi',
'entry_type': self._crossref_type_to_bibtex(message.get('type')),
'doi': doi,
'title': message.get('title', [''])[0],
'authors': self._format_authors_crossref(message.get('author', [])),
'year': self._extract_year_crossref(message),
'journal': message.get('container-title', [''])[0] if message.get('container-title') else '',
'volume': str(message.get('volume', '')) if message.get('volume') else '',
'issue': str(message.get('issue', '')) if message.get('issue') else '',
'pages': message.get('page', ''),
'publisher': message.get('publisher', ''),
'url': f'https://doi.org/{doi}'
}
return metadata
else:
print(f'Error: CrossRef API returned status {response.status_code} for DOI: {doi}', file=sys.stderr)
return None
except Exception as e:
print(f'Error extracting metadata from DOI {doi}: {e}', file=sys.stderr)
return None
def extract_from_pmid(self, pmid: str) -> Optional[Dict]:
"""
Extract metadata from PMID using PubMed E-utilities.
Args:
pmid: PubMed ID
Returns:
Metadata dictionary or None
"""
url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
params = {
'db': 'pubmed',
'id': pmid,
'retmode': 'xml',
'rettype': 'abstract'
}
if self.email:
params['email'] = self.email
api_key = os.getenv('NCBI_API_KEY')
if api_key:
params['api_key'] = api_key
try:
response = self.session.get(url, params=params, timeout=15)
if response.status_code == 200:
root = ET.fromstring(response.content)
article = root.find('.//PubmedArticle')
if article is None:
print(f'Error: No article found for PMID: {pmid}', file=sys.stderr)
return None
# Extract metadata from XML
medline_citation = article.find('.//MedlineCitation')
article_elem = medline_citation.find('.//Article')
journal = article_elem.find('.//Journal')
# Get DOI if available
doi = None
article_ids = article.findall('.//ArticleId')
for article_id in article_ids:
if article_id.get('IdType') == 'doi':
doi = article_id.text
break
metadata = {
'type': 'pmid',
'entry_type': 'article',
'pmid': pmid,
'title': article_elem.findtext('.//ArticleTitle', ''),
'authors': self._format_authors_pubmed(article_elem.findall('.//Author')),
'year': self._extract_year_pubmed(article_elem),
'journal': journal.findtext('.//Title', ''),
'volume': journal.findtext('.//JournalIssue/Volume', ''),
'issue': journal.findtext('.//JournalIssue/Issue', ''),
'pages': article_elem.findtext('.//Pagination/MedlinePgn', ''),
'doi': doi
}
return metadata
else:
print(f'Error: PubMed API returned status {response.status_code} for PMID: {pmid}', file=sys.stderr)
return None
except Exception as e:
print(f'Error extracting metadata from PMID {pmid}: {e}', file=sys.stderr)
return None
def extract_from_arxiv(self, arxiv_id: str) -> Optional[Dict]:
"""
Extract metadata from arXiv ID using arXiv API.
Args:
arxiv_id: arXiv identifier
Returns:
Metadata dictionary or None
"""
url = 'https://export.arxiv.org/api/query'
params = {
'id_list': arxiv_id,
'max_results': 1
}
try:
response = self.session.get(url, params=params, timeout=15)
if response.status_code == 200:
# Parse Atom XML
root = ET.fromstring(response.content)
ns = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
entry = root.find('atom:entry', ns)
if entry is None:
print(f'Error: No entry found for arXiv ID: {arxiv_id}', file=sys.stderr)
return None
# Extract DOI if published
doi_elem = entry.find('arxiv:doi', ns)
doi = doi_elem.text if doi_elem is not None else None
# Extract journal reference if published
journal_ref_elem = entry.find('arxiv:journal_ref', ns)
journal_ref = journal_ref_elem.text if journal_ref_elem is not None else None
# Get publication date
published = entry.findtext('atom:published', '', ns)
year = published[:4] if published else ''
# Get authors
authors = []
for author in entry.findall('atom:author', ns):
name = author.findtext('atom:name', '', ns)
if name:
authors.append(name)
metadata = {
'type': 'arxiv',
'entry_type': 'misc' if not doi else 'article',
'arxiv_id': arxiv_id,
'title': entry.findtext('atom:title', '', ns).strip().replace('\n', ' '),
'authors': ' and '.join(authors),
'year': year,
'doi': doi,
'journal_ref': journal_ref,
'abstract': entry.findtext('atom:summary', '', ns).strip().replace('\n', ' '),
'url': f'https://arxiv.org/abs/{arxiv_id}'
}
return metadata
else:
print(f'Error: arXiv API returned status {response.status_code} for ID: {arxiv_id}', file=sys.stderr)
return None
except Exception as e:
print(f'Error extracting metadata from arXiv {arxiv_id}: {e}', file=sys.stderr)
return None
def metadata_to_bibtex(self, metadata: Dict, citation_key: Optional[str] = None) -> str:
"""
Convert metadata dictionary to BibTeX format.
Args:
metadata: Metadata dictionary
citation_key: Optional custom citation key
Returns:
BibTeX string
"""
if not citation_key:
citation_key = self._generate_citation_key(metadata)
entry_type = metadata.get('entry_type', 'misc')
# Build BibTeX entry
lines = [f'@{entry_type}{{{citation_key},']
# Add fields
if metadata.get('authors'):
lines.append(f' author = {{{metadata["authors"]}}},')
if metadata.get('title'):
# Protect capitalization
title = self._protect_title(metadata['title'])
lines.append(f' title = {{{title}}},')
if entry_type == 'article' and metadata.get('journal'):
lines.append(f' journal = {{{metadata["journal"]}}},')
elif entry_type == 'misc' and metadata.get('type') == 'arxiv':
lines.append(f' howpublished = {{arXiv}},')
if metadata.get('year'):
lines.append(f' year = {{{metadata["year"]}}},')
if metadata.get('volume'):
lines.append(f' volume = {{{metadata["volume"]}}},')
if metadata.get('issue'):
lines.append(f' number = {{{metadata["issue"]}}},')
if metadata.get('pages'):
pages = metadata['pages']
if '--' not in pages:
    pages = pages.replace('-', '--')  # single hyphen to en-dash range
lines.append(f' pages = {{{pages}}},')
if metadata.get('doi'):
lines.append(f' doi = {{{metadata["doi"]}}},')
elif metadata.get('url'):
lines.append(f' url = {{{metadata["url"]}}},')
if metadata.get('pmid'):
lines.append(f' note = {{PMID: {metadata["pmid"]}}},')
if metadata.get('type') == 'arxiv' and not metadata.get('doi'):
lines.append(f' note = {{Preprint}},')
# Remove trailing comma from last field
if lines[-1].endswith(','):
lines[-1] = lines[-1][:-1]
lines.append('}')
return '\n'.join(lines)
def _crossref_type_to_bibtex(self, crossref_type: str) -> str:
"""Map CrossRef type to BibTeX entry type."""
type_map = {
'journal-article': 'article',
'book': 'book',
'book-chapter': 'incollection',
'proceedings-article': 'inproceedings',
'posted-content': 'misc',
'dataset': 'misc',
'report': 'techreport'
}
return type_map.get(crossref_type, 'misc')
def _format_authors_crossref(self, authors: List[Dict]) -> str:
"""Format author list from CrossRef data."""
if not authors:
return ''
formatted = []
for author in authors:
given = author.get('given', '')
family = author.get('family', '')
if family:
if given:
formatted.append(f'{family}, {given}')
else:
formatted.append(family)
return ' and '.join(formatted)
def _format_authors_pubmed(self, authors: List) -> str:
"""Format author list from PubMed XML."""
formatted = []
for author in authors:
last_name = author.findtext('.//LastName', '')
fore_name = author.findtext('.//ForeName', '')
if last_name:
if fore_name:
formatted.append(f'{last_name}, {fore_name}')
else:
formatted.append(last_name)
return ' and '.join(formatted)
def _extract_year_crossref(self, message: Dict) -> str:
"""Extract year from CrossRef message."""
# Try published-print first, then published-online
date_parts = message.get('published-print', {}).get('date-parts', [[]])
if not date_parts or not date_parts[0]:
date_parts = message.get('published-online', {}).get('date-parts', [[]])
if date_parts and date_parts[0]:
return str(date_parts[0][0])
return ''
def _extract_year_pubmed(self, article: ET.Element) -> str:
"""Extract year from PubMed XML."""
year = article.findtext('.//Journal/JournalIssue/PubDate/Year', '')
if not year:
medline_date = article.findtext('.//Journal/JournalIssue/PubDate/MedlineDate', '')
if medline_date:
year_match = re.search(r'\d{4}', medline_date)
if year_match:
year = year_match.group()
return year
def _generate_citation_key(self, metadata: Dict) -> str:
"""Generate a citation key from metadata."""
# Get first author last name
authors = metadata.get('authors', '')
if authors:
first_author = authors.split(' and ')[0]
if ',' in first_author:
last_name = first_author.split(',')[0].strip()
else:
last_name = first_author.split()[-1] if first_author else 'Unknown'
else:
last_name = 'Unknown'
# Get year
year = metadata.get('year', '').strip()
if not year:
year = 'XXXX'
# Clean last name (remove special characters)
last_name = re.sub(r'[^a-zA-Z]', '', last_name)
# Get keyword from title
title = metadata.get('title', '')
words = re.findall(r'\b[a-zA-Z]{4,}\b', title)
keyword = words[0].lower() if words else 'paper'
return f'{last_name}{year}{keyword}'
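# Example (hypothetical metadata): authors 'Doe, Jane and Roe, Richard', year '2021',
# title 'Deep learning for protein folding' -> citation key 'Doe2021deep'.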
def _protect_title(self, title: str) -> str:
"""Protect capitalization in title for BibTeX."""
# Protect common acronyms and proper nouns
protected_words = [
'DNA', 'RNA', 'CRISPR', 'COVID', 'HIV', 'AIDS', 'AlphaFold',
'Python', 'AI', 'ML', 'GPU', 'CPU', 'USA', 'UK', 'EU'
]
for word in protected_words:
title = re.sub(rf'\b{word}\b', f'{{{word}}}', title, flags=re.IGNORECASE)
return title
def extract(self, identifier: str) -> Optional[str]:
"""
Extract metadata and return BibTeX.
Args:
identifier: DOI, PMID, arXiv ID, or URL
Returns:
BibTeX string or None
"""
id_type, clean_id = self.identify_type(identifier)
print(f'Identified as {id_type}: {clean_id}', file=sys.stderr)
metadata = None
if id_type == 'doi':
metadata = self.extract_from_doi(clean_id)
elif id_type == 'pmid':
metadata = self.extract_from_pmid(clean_id)
elif id_type == 'arxiv':
metadata = self.extract_from_arxiv(clean_id)
else:
print(f'Error: Unknown identifier type: {identifier}', file=sys.stderr)
return None
if metadata:
return self.metadata_to_bibtex(metadata)
else:
return None
def main():
"""Command-line interface."""
parser = argparse.ArgumentParser(
description='Extract citation metadata from DOI, PMID, arXiv ID, or URL',
epilog='Example: python extract_metadata.py --doi 10.1038/s41586-021-03819-2'
)
parser.add_argument('--doi', help='Digital Object Identifier')
parser.add_argument('--pmid', help='PubMed ID')
parser.add_argument('--arxiv', help='arXiv ID')
parser.add_argument('--url', help='URL to article')
parser.add_argument('-i', '--input', help='Input file with identifiers (one per line)')
parser.add_argument('-o', '--output', help='Output file for BibTeX (default: stdout)')
parser.add_argument('--format', choices=['bibtex', 'json'], default='bibtex', help='Output format')
parser.add_argument('--email', help='Email for NCBI E-utilities (recommended)')
args = parser.parse_args()
# Collect identifiers
identifiers = []
if args.doi:
identifiers.append(args.doi)
if args.pmid:
identifiers.append(args.pmid)
if args.arxiv:
identifiers.append(args.arxiv)
if args.url:
identifiers.append(args.url)
if args.input:
try:
with open(args.input, 'r', encoding='utf-8') as f:
file_ids = [line.strip() for line in f if line.strip()]
identifiers.extend(file_ids)
except Exception as e:
print(f'Error reading input file: {e}', file=sys.stderr)
sys.exit(1)
if not identifiers:
parser.print_help()
sys.exit(1)
# Extract metadata
extractor = MetadataExtractor(email=args.email)
bibtex_entries = []
for i, identifier in enumerate(identifiers):
print(f'\nProcessing {i+1}/{len(identifiers)}...', file=sys.stderr)
bibtex = extractor.extract(identifier)
if bibtex:
bibtex_entries.append(bibtex)
# Rate limiting
if i < len(identifiers) - 1:
time.sleep(0.5)
if not bibtex_entries:
print('Error: No successful extractions', file=sys.stderr)
sys.exit(1)
# Format output
if args.format == 'bibtex':
output = '\n\n'.join(bibtex_entries) + '\n'
else: # json
output = json.dumps({
'count': len(bibtex_entries),
'entries': bibtex_entries
}, indent=2)
# Write output
if args.output:
with open(args.output, 'w', encoding='utf-8') as f:
f.write(output)
print(f'\nSuccessfully wrote {len(bibtex_entries)} entries to {args.output}', file=sys.stderr)
else:
print(output)
print(f'\nExtracted {len(bibtex_entries)}/{len(identifiers)} entries', file=sys.stderr)
if __name__ == '__main__':
main()
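
A minimal usage sketch for the extractor above (assuming the module is saved as extract_metadata.py, the name used in its epilog; the email and arXiv ID are hypothetical, the DOI is the epilog example):

from extract_metadata import MetadataExtractor

extractor = MetadataExtractor(email='you@example.com')  # recommended for PubMed lookups

# extract() routes DOIs, PMIDs, arXiv IDs, and URLs through identify_type()
for identifier in ('10.1038/s41586-021-03819-2', 'arXiv:2301.12345'):
    bibtex = extractor.extract(identifier)
    if bibtex:
        print(bibtex, end='\n\n')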

format_bibtex.py

@@ -0,0 +1,349 @@
#!/usr/bin/env python3
"""
BibTeX Formatter and Cleaner
Format, clean, sort, and deduplicate BibTeX files.
"""
import sys
import re
import argparse
from typing import List, Dict, Tuple
from collections import OrderedDict
class BibTeXFormatter:
"""Format and clean BibTeX entries."""
def __init__(self):
# Standard field order for readability
self.field_order = [
'author', 'editor', 'title', 'booktitle', 'journal',
'year', 'month', 'volume', 'number', 'pages',
'publisher', 'address', 'edition', 'series',
'school', 'institution', 'organization',
'howpublished', 'doi', 'url', 'isbn', 'issn',
'note', 'abstract', 'keywords'
]
def parse_bibtex_file(self, filepath: str) -> List[Dict]:
"""
Parse BibTeX file and extract entries.
Args:
filepath: Path to BibTeX file
Returns:
List of entry dictionaries
"""
try:
with open(filepath, 'r', encoding='utf-8') as f:
content = f.read()
except Exception as e:
print(f'Error reading file: {e}', file=sys.stderr)
return []
entries = []
# Match BibTeX entries (lightweight regex parser: assumes no nested braces in
# field values and a closing brace on its own line)
pattern = r'@(\w+)\s*\{\s*([^,\s]+)\s*,(.*?)\n\}'
matches = re.finditer(pattern, content, re.DOTALL | re.IGNORECASE)
for match in matches:
entry_type = match.group(1).lower()
citation_key = match.group(2).strip()
fields_text = match.group(3)
# Parse fields
fields = OrderedDict()
field_pattern = r'(\w+)\s*=\s*\{([^}]*)\}|(\w+)\s*=\s*"([^"]*)"'
field_matches = re.finditer(field_pattern, fields_text)
for field_match in field_matches:
if field_match.group(1):
field_name = field_match.group(1).lower()
field_value = field_match.group(2)
else:
field_name = field_match.group(3).lower()
field_value = field_match.group(4)
fields[field_name] = field_value.strip()
entries.append({
'type': entry_type,
'key': citation_key,
'fields': fields
})
return entries
def format_entry(self, entry: Dict) -> str:
"""
Format a single BibTeX entry.
Args:
entry: Entry dictionary
Returns:
Formatted BibTeX string
"""
lines = [f'@{entry["type"]}{{{entry["key"]},']
# Order fields according to standard order
ordered_fields = OrderedDict()
# Add fields in standard order
for field_name in self.field_order:
if field_name in entry['fields']:
ordered_fields[field_name] = entry['fields'][field_name]
# Add any remaining fields
for field_name, field_value in entry['fields'].items():
if field_name not in ordered_fields:
ordered_fields[field_name] = field_value
# Format each field
max_field_len = max(len(f) for f in ordered_fields.keys()) if ordered_fields else 0
for field_name, field_value in ordered_fields.items():
# Pad field name for alignment
padded_field = field_name.ljust(max_field_len)
lines.append(f' {padded_field} = {{{field_value}}},')
# Remove trailing comma from last field
if lines[-1].endswith(','):
lines[-1] = lines[-1][:-1]
lines.append('}')
return '\n'.join(lines)
def fix_common_issues(self, entry: Dict) -> Dict:
"""
Fix common formatting issues in entry.
Args:
entry: Entry dictionary
Returns:
Fixed entry dictionary
"""
fixed = entry.copy()
fields = fixed['fields'].copy()
# Fix page ranges (single hyphen to double hyphen)
if 'pages' in fields:
pages = fields['pages']
# Replace single hyphen with double hyphen if it's a range
if re.search(r'\d-\d', pages) and '--' not in pages:
pages = re.sub(r'(\d)-(\d)', r'\1--\2', pages)
fields['pages'] = pages
# Remove "pp." from pages
if 'pages' in fields:
pages = fields['pages']
pages = re.sub(r'^pp\.\s*', '', pages, flags=re.IGNORECASE)
fields['pages'] = pages
# Fix DOI (remove URL prefix if present)
if 'doi' in fields:
doi = fields['doi']
doi = doi.replace('https://doi.org/', '')
doi = doi.replace('http://doi.org/', '')
doi = doi.replace('doi:', '')
fields['doi'] = doi
# Fix author separators (semicolon or ampersand to 'and')
if 'author' in fields:
author = fields['author']
author = author.replace(';', ' and')
author = author.replace(' & ', ' and ')
# Clean up multiple 'and's
author = re.sub(r'\s+and\s+and\s+', ' and ', author)
fields['author'] = author
fixed['fields'] = fields
return fixed
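# Example (hypothetical entry): pages 'pp. 12-19' becomes '12--19', and an author
# string 'Smith, J.; Doe, A' becomes 'Smith, J. and Doe, A'.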
def deduplicate_entries(self, entries: List[Dict]) -> List[Dict]:
"""
Remove duplicate entries based on DOI or citation key.
Args:
entries: List of entry dictionaries
Returns:
List of unique entries
"""
seen_dois = set()
seen_keys = set()
unique_entries = []
for entry in entries:
doi = entry['fields'].get('doi', '').strip()
key = entry['key']
# Check DOI first (more reliable)
if doi:
if doi in seen_dois:
print(f'Duplicate DOI found: {doi} (skipping {key})', file=sys.stderr)
continue
seen_dois.add(doi)
# Check citation key
if key in seen_keys:
print(f'Duplicate citation key found: {key} (skipping)', file=sys.stderr)
continue
seen_keys.add(key)
unique_entries.append(entry)
return unique_entries
def sort_entries(self, entries: List[Dict], sort_by: str = 'key', descending: bool = False) -> List[Dict]:
"""
Sort entries by specified field.
Args:
entries: List of entry dictionaries
sort_by: Field to sort by ('key', 'year', 'author', 'title')
descending: Sort in descending order
Returns:
Sorted list of entries
"""
def get_sort_key(entry: Dict) -> str:
if sort_by == 'key':
return entry['key'].lower()
elif sort_by == 'year':
year = entry['fields'].get('year', '9999')
return year
elif sort_by == 'author':
author = entry['fields'].get('author', 'ZZZ')
# Get last name of first author
if ',' in author:
return author.split(',')[0].lower()
else:
return author.split()[0].lower() if author else 'zzz'
elif sort_by == 'title':
return entry['fields'].get('title', '').lower()
else:
return entry['key'].lower()
return sorted(entries, key=get_sort_key, reverse=descending)
def format_file(self, filepath: str, output: str = None,
deduplicate: bool = False, sort_by: str = None,
descending: bool = False, fix_issues: bool = True) -> None:
"""
Format entire BibTeX file.
Args:
filepath: Input BibTeX file
output: Output file (None for in-place)
deduplicate: Remove duplicates
sort_by: Field to sort by
descending: Sort in descending order
fix_issues: Fix common formatting issues
"""
print(f'Parsing {filepath}...', file=sys.stderr)
entries = self.parse_bibtex_file(filepath)
if not entries:
print('No entries found', file=sys.stderr)
return
print(f'Found {len(entries)} entries', file=sys.stderr)
# Fix common issues
if fix_issues:
print('Fixing common issues...', file=sys.stderr)
entries = [self.fix_common_issues(e) for e in entries]
# Deduplicate
if deduplicate:
print('Removing duplicates...', file=sys.stderr)
original_count = len(entries)
entries = self.deduplicate_entries(entries)
removed = original_count - len(entries)
if removed > 0:
print(f'Removed {removed} duplicate(s)', file=sys.stderr)
# Sort
if sort_by:
print(f'Sorting by {sort_by}...', file=sys.stderr)
entries = self.sort_entries(entries, sort_by, descending)
# Format entries
print('Formatting entries...', file=sys.stderr)
formatted_entries = [self.format_entry(e) for e in entries]
# Write output
output_content = '\n\n'.join(formatted_entries) + '\n'
output_file = output or filepath
try:
with open(output_file, 'w', encoding='utf-8') as f:
f.write(output_content)
print(f'Successfully wrote {len(entries)} entries to {output_file}', file=sys.stderr)
except Exception as e:
print(f'Error writing file: {e}', file=sys.stderr)
sys.exit(1)
def main():
"""Command-line interface."""
parser = argparse.ArgumentParser(
description='Format, clean, sort, and deduplicate BibTeX files',
epilog='Example: python format_bibtex.py references.bib --deduplicate --sort year'
)
parser.add_argument(
'file',
help='BibTeX file to format'
)
parser.add_argument(
'-o', '--output',
help='Output file (default: overwrite input file)'
)
parser.add_argument(
'--deduplicate',
action='store_true',
help='Remove duplicate entries'
)
parser.add_argument(
'--sort',
choices=['key', 'year', 'author', 'title'],
help='Sort entries by field'
)
parser.add_argument(
'--descending',
action='store_true',
help='Sort in descending order'
)
parser.add_argument(
'--no-fix',
action='store_true',
help='Do not fix common issues'
)
args = parser.parse_args()
# Format file
formatter = BibTeXFormatter()
formatter.format_file(
args.file,
output=args.output,
deduplicate=args.deduplicate,
sort_by=args.sort,
descending=args.descending,
fix_issues=not args.no_fix
)
if __name__ == '__main__':
main()
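
A minimal usage sketch for the formatter above (assuming the module is saved as format_bibtex.py, the name used in its epilog; the file names are hypothetical):

from format_bibtex import BibTeXFormatter

formatter = BibTeXFormatter()
formatter.format_file(
    'references.bib',               # hypothetical input
    output='references_clean.bib',  # omit to overwrite in place
    deduplicate=True,
    sort_by='year',
    descending=True,                # newest first
    fix_issues=True,
)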

search_google_scholar.py

@@ -0,0 +1,282 @@
#!/usr/bin/env python3
"""
Google Scholar Search Tool
Search Google Scholar and export results.
Note: This script requires the 'scholarly' library.
Install with: pip install scholarly
"""
import sys
import argparse
import json
import time
import random
import re
from typing import List, Dict, Optional
try:
from scholarly import scholarly, ProxyGenerator
SCHOLARLY_AVAILABLE = True
except ImportError:
SCHOLARLY_AVAILABLE = False
print('Warning: scholarly library not installed. Install with: pip install scholarly', file=sys.stderr)
class GoogleScholarSearcher:
"""Search Google Scholar using scholarly library."""
def __init__(self, use_proxy: bool = False):
"""
Initialize searcher.
Args:
use_proxy: Use free proxy (helps avoid rate limiting)
"""
if not SCHOLARLY_AVAILABLE:
raise ImportError('scholarly library required. Install with: pip install scholarly')
# Setup proxy if requested
if use_proxy:
try:
pg = ProxyGenerator()
pg.FreeProxies()
scholarly.use_proxy(pg)
print('Using free proxy', file=sys.stderr)
except Exception as e:
print(f'Warning: Could not setup proxy: {e}', file=sys.stderr)
def search(self, query: str, max_results: int = 50,
year_start: Optional[int] = None, year_end: Optional[int] = None,
sort_by: str = 'relevance') -> List[Dict]:
"""
Search Google Scholar.
Args:
query: Search query
max_results: Maximum number of results
year_start: Start year filter
year_end: End year filter
sort_by: Sort order ('relevance' or 'citations')
Returns:
List of result dictionaries
"""
if not SCHOLARLY_AVAILABLE:
print('Error: scholarly library not installed', file=sys.stderr)
return []
print(f'Searching Google Scholar: {query}', file=sys.stderr)
print(f'Max results: {max_results}', file=sys.stderr)
results = []
try:
# Perform search
search_query = scholarly.search_pubs(query)
for i, result in enumerate(search_query):
if i >= max_results:
break
print(f'Retrieved {i+1}/{max_results}', file=sys.stderr)
# Extract metadata
metadata = {
'title': result.get('bib', {}).get('title', ''),
'authors': ', '.join(result.get('bib', {}).get('author', [])),
'year': result.get('bib', {}).get('pub_year', ''),
'venue': result.get('bib', {}).get('venue', ''),
'abstract': result.get('bib', {}).get('abstract', ''),
'citations': result.get('num_citations', 0),
'url': result.get('pub_url', ''),
'eprint_url': result.get('eprint_url', ''),
}
# Filter by year
if year_start or year_end:
try:
pub_year = int(metadata['year']) if metadata['year'] else 0
if year_start and pub_year < year_start:
continue
if year_end and pub_year > year_end:
continue
except ValueError:
pass
results.append(metadata)
# Rate limiting to avoid blocking
time.sleep(random.uniform(2, 5))
except Exception as e:
print(f'Error during search: {e}', file=sys.stderr)
# Sort if requested
if sort_by == 'citations' and results:
results.sort(key=lambda x: x.get('citations', 0), reverse=True)
return results
def metadata_to_bibtex(self, metadata: Dict) -> str:
"""Convert metadata to BibTeX format."""
# Generate citation key
if metadata.get('authors'):
first_author = metadata['authors'].split(',')[0].strip()
last_name = first_author.split()[-1] if first_author else 'Unknown'
else:
last_name = 'Unknown'
year = metadata.get('year') or 'XXXX'
# Get keyword from title
title = metadata.get('title', '')
words = re.findall(r'\b[a-zA-Z]{4,}\b', title)
keyword = words[0].lower() if words else 'paper'
citation_key = f'{last_name}{year}{keyword}'
# Determine entry type (guess based on venue)
venue = metadata.get('venue', '').lower()
if 'proceedings' in venue or 'conference' in venue:
entry_type = 'inproceedings'
venue_field = 'booktitle'
else:
entry_type = 'article'
venue_field = 'journal'
# Build BibTeX
lines = [f'@{entry_type}{{{citation_key},']
# Convert authors format
if metadata.get('authors'):
authors = metadata['authors'].replace(',', ' and')
lines.append(f' author = {{{authors}}},')
if metadata.get('title'):
lines.append(f' title = {{{metadata["title"]}}},')
if metadata.get('venue'):
lines.append(f' {venue_field} = {{{metadata["venue"]}}},')
if metadata.get('year'):
lines.append(f' year = {{{metadata["year"]}}},')
if metadata.get('url'):
lines.append(f' url = {{{metadata["url"]}}},')
if metadata.get('citations'):
lines.append(f' note = {{Cited by: {metadata["citations"]}}},')
# Remove trailing comma
if lines[-1].endswith(','):
lines[-1] = lines[-1][:-1]
lines.append('}')
return '\n'.join(lines)
def main():
"""Command-line interface."""
parser = argparse.ArgumentParser(
description='Search Google Scholar (requires scholarly library)',
epilog='Example: python search_google_scholar.py "machine learning" --limit 50'
)
parser.add_argument(
'query',
help='Search query'
)
parser.add_argument(
'--limit',
type=int,
default=50,
help='Maximum number of results (default: 50)'
)
parser.add_argument(
'--year-start',
type=int,
help='Start year for filtering'
)
parser.add_argument(
'--year-end',
type=int,
help='End year for filtering'
)
parser.add_argument(
'--sort-by',
choices=['relevance', 'citations'],
default='relevance',
help='Sort order (default: relevance)'
)
parser.add_argument(
'--use-proxy',
action='store_true',
help='Use free proxy to avoid rate limiting'
)
parser.add_argument(
'-o', '--output',
help='Output file (default: stdout)'
)
parser.add_argument(
'--format',
choices=['json', 'bibtex'],
default='json',
help='Output format (default: json)'
)
args = parser.parse_args()
if not SCHOLARLY_AVAILABLE:
print('\nError: scholarly library not installed', file=sys.stderr)
print('Install with: pip install scholarly', file=sys.stderr)
print('\nAlternatively, use PubMed search for biomedical literature:', file=sys.stderr)
print(' python search_pubmed.py "your query"', file=sys.stderr)
sys.exit(1)
# Search
searcher = GoogleScholarSearcher(use_proxy=args.use_proxy)
results = searcher.search(
args.query,
max_results=args.limit,
year_start=args.year_start,
year_end=args.year_end,
sort_by=args.sort_by
)
if not results:
print('No results found', file=sys.stderr)
sys.exit(1)
# Format output
if args.format == 'json':
output = json.dumps({
'query': args.query,
'count': len(results),
'results': results
}, indent=2)
else: # bibtex
bibtex_entries = [searcher.metadata_to_bibtex(r) for r in results]
output = '\n\n'.join(bibtex_entries) + '\n'
# Write output
if args.output:
with open(args.output, 'w', encoding='utf-8') as f:
f.write(output)
print(f'Wrote {len(results)} results to {args.output}', file=sys.stderr)
else:
print(output)
print(f'\nRetrieved {len(results)} results', file=sys.stderr)
if __name__ == '__main__':
main()
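
A minimal usage sketch for the searcher above (assuming the module is saved as search_google_scholar.py, the name used in its epilog; it requires the scholarly library, and Google Scholar may still throttle requests):

from search_google_scholar import GoogleScholarSearcher

searcher = GoogleScholarSearcher(use_proxy=True)  # free proxy, best effort
results = searcher.search('machine learning', max_results=10,
                          year_start=2020, sort_by='citations')
for r in results:
    print(f"{r['year']}  {r['citations']:>6}  {r['title']}")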

search_pubmed.py

@@ -0,0 +1,398 @@
#!/usr/bin/env python3
"""
PubMed Search Tool
Search PubMed using E-utilities API and export results.
"""
import sys
import os
import requests
import argparse
import json
import time
import re
import xml.etree.ElementTree as ET
from typing import List, Dict, Optional
from datetime import datetime
class PubMedSearcher:
"""Search PubMed using NCBI E-utilities API."""
def __init__(self, api_key: Optional[str] = None, email: Optional[str] = None):
"""
Initialize searcher.
Args:
api_key: NCBI API key (optional but recommended)
email: Email for Entrez (optional but recommended)
"""
self.api_key = api_key or os.getenv('NCBI_API_KEY', '')
self.email = email or os.getenv('NCBI_EMAIL', '')
self.base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
self.session = requests.Session()
# Rate limiting
self.delay = 0.11 if self.api_key else 0.34 # 10/sec with key, 3/sec without
def search(self, query: str, max_results: int = 100,
date_start: Optional[str] = None, date_end: Optional[str] = None,
publication_types: Optional[List[str]] = None) -> List[str]:
"""
Search PubMed and return PMIDs.
Args:
query: Search query
max_results: Maximum number of results
date_start: Start date (YYYY/MM/DD or YYYY)
date_end: End date (YYYY/MM/DD or YYYY)
publication_types: List of publication types to filter
Returns:
List of PMIDs
"""
# Build query with filters
full_query = query
# Add date range
if date_start or date_end:
start = date_start or '1900'
end = date_end or datetime.now().strftime('%Y')
full_query += f' AND {start}:{end}[Publication Date]'
# Add publication types
if publication_types:
pub_type_query = ' OR '.join([f'"{pt}"[Publication Type]' for pt in publication_types])
full_query += f' AND ({pub_type_query})'
print(f'Searching PubMed: {full_query}', file=sys.stderr)
# ESearch to get PMIDs
esearch_url = self.base_url + 'esearch.fcgi'
params = {
'db': 'pubmed',
'term': full_query,
'retmax': max_results,
'retmode': 'json'
}
if self.email:
params['email'] = self.email
if self.api_key:
params['api_key'] = self.api_key
try:
response = self.session.get(esearch_url, params=params, timeout=30)
response.raise_for_status()
data = response.json()
pmids = data['esearchresult']['idlist']
count = int(data['esearchresult']['count'])
print(f'Found {count} results, retrieving {len(pmids)}', file=sys.stderr)
return pmids
except Exception as e:
print(f'Error searching PubMed: {e}', file=sys.stderr)
return []
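# Example (hypothetical): query 'CRISPR' with date_start='2020' and
# publication_types=['Review'] expands to:
#   CRISPR AND 2020:<current year>[Publication Date] AND ("Review"[Publication Type])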
def fetch_metadata(self, pmids: List[str]) -> List[Dict]:
"""
Fetch metadata for PMIDs.
Args:
pmids: List of PubMed IDs
Returns:
List of metadata dictionaries
"""
if not pmids:
return []
metadata_list = []
# Fetch in batches of 200
batch_size = 200
for i in range(0, len(pmids), batch_size):
batch = pmids[i:i+batch_size]
print(f'Fetching metadata for PMIDs {i+1}-{min(i+batch_size, len(pmids))}...', file=sys.stderr)
efetch_url = self.base_url + 'efetch.fcgi'
params = {
'db': 'pubmed',
'id': ','.join(batch),
'retmode': 'xml',
'rettype': 'abstract'
}
if self.email:
params['email'] = self.email
if self.api_key:
params['api_key'] = self.api_key
try:
response = self.session.get(efetch_url, params=params, timeout=60)
response.raise_for_status()
# Parse XML
root = ET.fromstring(response.content)
articles = root.findall('.//PubmedArticle')
for article in articles:
metadata = self._extract_metadata_from_xml(article)
if metadata:
metadata_list.append(metadata)
# Rate limiting
time.sleep(self.delay)
except Exception as e:
print(f'Error fetching metadata for batch: {e}', file=sys.stderr)
continue
return metadata_list
def _extract_metadata_from_xml(self, article: ET.Element) -> Optional[Dict]:
"""Extract metadata from PubmedArticle XML element."""
try:
medline_citation = article.find('.//MedlineCitation')
article_elem = medline_citation.find('.//Article')
journal = article_elem.find('.//Journal')
# Get PMID
pmid = medline_citation.findtext('.//PMID', '')
# Get DOI
doi = None
article_ids = article.findall('.//ArticleId')
for article_id in article_ids:
if article_id.get('IdType') == 'doi':
doi = article_id.text
break
# Get authors
authors = []
author_list = article_elem.find('.//AuthorList')
if author_list is not None:
for author in author_list.findall('.//Author'):
last_name = author.findtext('.//LastName', '')
fore_name = author.findtext('.//ForeName', '')
if last_name:
if fore_name:
authors.append(f'{last_name}, {fore_name}')
else:
authors.append(last_name)
# Get year
year = article_elem.findtext('.//Journal/JournalIssue/PubDate/Year', '')
if not year:
medline_date = article_elem.findtext('.//Journal/JournalIssue/PubDate/MedlineDate', '')
if medline_date:
year_match = re.search(r'\d{4}', medline_date)
if year_match:
year = year_match.group()
metadata = {
'pmid': pmid,
'doi': doi,
'title': article_elem.findtext('.//ArticleTitle', ''),
'authors': ' and '.join(authors),
'journal': journal.findtext('.//Title', ''),
'year': year,
'volume': journal.findtext('.//JournalIssue/Volume', ''),
'issue': journal.findtext('.//JournalIssue/Issue', ''),
'pages': article_elem.findtext('.//Pagination/MedlinePgn', ''),
'abstract': article_elem.findtext('.//Abstract/AbstractText', '')
}
return metadata
except Exception as e:
print(f'Error extracting metadata: {e}', file=sys.stderr)
return None
def metadata_to_bibtex(self, metadata: Dict) -> str:
"""Convert metadata to BibTeX format."""
# Generate citation key
if metadata.get('authors'):
first_author = metadata['authors'].split(' and ')[0]
if ',' in first_author:
last_name = first_author.split(',')[0].strip()
else:
last_name = first_author.split()[-1] if first_author else 'Unknown'
else:
last_name = 'Unknown'
year = metadata.get('year', 'XXXX')
citation_key = f'{last_name}{year}pmid{metadata.get("pmid", "")}'
# Build BibTeX entry
lines = [f'@article{{{citation_key},']
if metadata.get('authors'):
lines.append(f' author = {{{metadata["authors"]}}},')
if metadata.get('title'):
lines.append(f' title = {{{metadata["title"]}}},')
if metadata.get('journal'):
lines.append(f' journal = {{{metadata["journal"]}}},')
if metadata.get('year'):
lines.append(f' year = {{{metadata["year"]}}},')
if metadata.get('volume'):
lines.append(f' volume = {{{metadata["volume"]}}},')
if metadata.get('issue'):
lines.append(f' number = {{{metadata["issue"]}}},')
if metadata.get('pages'):
pages = metadata['pages']
if '--' not in pages:
    pages = pages.replace('-', '--')
lines.append(f' pages = {{{pages}}},')
if metadata.get('doi'):
lines.append(f' doi = {{{metadata["doi"]}}},')
if metadata.get('pmid'):
lines.append(f' note = {{PMID: {metadata["pmid"]}}},')
# Remove trailing comma
if lines[-1].endswith(','):
lines[-1] = lines[-1][:-1]
lines.append('}')
return '\n'.join(lines)
def main():
"""Command-line interface."""
parser = argparse.ArgumentParser(
description='Search PubMed using E-utilities API',
epilog='Example: python search_pubmed.py "CRISPR gene editing" --limit 100'
)
parser.add_argument(
'query',
nargs='?',
help='Search query (PubMed syntax)'
)
parser.add_argument(
'--query',
dest='query_arg',
help='Search query (alternative to positional argument)'
)
parser.add_argument(
'--query-file',
help='File containing search query'
)
parser.add_argument(
'--limit',
type=int,
default=100,
help='Maximum number of results (default: 100)'
)
parser.add_argument(
'--date-start',
help='Start date (YYYY/MM/DD or YYYY)'
)
parser.add_argument(
'--date-end',
help='End date (YYYY/MM/DD or YYYY)'
)
parser.add_argument(
'--publication-types',
help='Comma-separated publication types (e.g., "Review,Clinical Trial")'
)
parser.add_argument(
'-o', '--output',
help='Output file (default: stdout)'
)
parser.add_argument(
'--format',
choices=['json', 'bibtex'],
default='json',
help='Output format (default: json)'
)
parser.add_argument(
'--api-key',
help='NCBI API key (or set NCBI_API_KEY env var)'
)
parser.add_argument(
'--email',
help='Email for Entrez (or set NCBI_EMAIL env var)'
)
args = parser.parse_args()
# Get query
query = args.query or args.query_arg
if args.query_file:
try:
with open(args.query_file, 'r', encoding='utf-8') as f:
query = f.read().strip()
except Exception as e:
print(f'Error reading query file: {e}', file=sys.stderr)
sys.exit(1)
if not query:
parser.print_help()
sys.exit(1)
# Parse publication types
pub_types = None
if args.publication_types:
pub_types = [pt.strip() for pt in args.publication_types.split(',')]
# Search PubMed
searcher = PubMedSearcher(api_key=args.api_key, email=args.email)
pmids = searcher.search(
query,
max_results=args.limit,
date_start=args.date_start,
date_end=args.date_end,
publication_types=pub_types
)
if not pmids:
print('No results found', file=sys.stderr)
sys.exit(1)
# Fetch metadata
metadata_list = searcher.fetch_metadata(pmids)
# Format output
if args.format == 'json':
output = json.dumps({
'query': query,
'count': len(metadata_list),
'results': metadata_list
}, indent=2)
else: # bibtex
bibtex_entries = [searcher.metadata_to_bibtex(m) for m in metadata_list]
output = '\n\n'.join(bibtex_entries) + '\n'
# Write output
if args.output:
with open(args.output, 'w', encoding='utf-8') as f:
f.write(output)
print(f'Wrote {len(metadata_list)} results to {args.output}', file=sys.stderr)
else:
print(output)
if __name__ == '__main__':
main()
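
A minimal usage sketch for the searcher above (assuming the module is saved as search_pubmed.py, the name used in its epilog; NCBI credentials are read from the NCBI_API_KEY / NCBI_EMAIL environment variables if set):

from search_pubmed import PubMedSearcher

searcher = PubMedSearcher()
pmids = searcher.search('CRISPR gene editing',  # query from the epilog example
                        max_results=20,
                        date_start='2020',
                        publication_types=['Review'])
for record in searcher.fetch_metadata(pmids):
    print(searcher.metadata_to_bibtex(record), end='\n\n')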

validate_citations.py

@@ -0,0 +1,497 @@
#!/usr/bin/env python3
"""
Citation Validation Tool
Validate BibTeX files for accuracy, completeness, and format compliance.
"""
import sys
import re
import requests
import argparse
import json
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
class CitationValidator:
"""Validate BibTeX entries for errors and inconsistencies."""
def __init__(self):
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'CitationValidator/1.0 (Citation Management Tool)'
})
# Required fields by entry type
self.required_fields = {
'article': ['author', 'title', 'journal', 'year'],
'book': ['title', 'publisher', 'year'], # author OR editor
'inproceedings': ['author', 'title', 'booktitle', 'year'],
'incollection': ['author', 'title', 'booktitle', 'publisher', 'year'],
'phdthesis': ['author', 'title', 'school', 'year'],
'mastersthesis': ['author', 'title', 'school', 'year'],
'techreport': ['author', 'title', 'institution', 'year'],
'misc': ['title', 'year']
}
# Recommended fields
self.recommended_fields = {
'article': ['volume', 'pages', 'doi'],
'book': ['isbn'],
'inproceedings': ['pages'],
}
def parse_bibtex_file(self, filepath: str) -> List[Dict]:
"""
Parse BibTeX file and extract entries.
Args:
filepath: Path to BibTeX file
Returns:
List of entry dictionaries
"""
try:
with open(filepath, 'r', encoding='utf-8') as f:
content = f.read()
except Exception as e:
print(f'Error reading file: {e}', file=sys.stderr)
return []
entries = []
# Match BibTeX entries (lightweight regex parser: assumes no nested braces in
# field values and a closing brace on its own line)
pattern = r'@(\w+)\s*\{\s*([^,\s]+)\s*,(.*?)\n\}'
matches = re.finditer(pattern, content, re.DOTALL | re.IGNORECASE)
for match in matches:
entry_type = match.group(1).lower()
citation_key = match.group(2).strip()
fields_text = match.group(3)
# Parse fields
fields = {}
field_pattern = r'(\w+)\s*=\s*\{([^}]*)\}|(\w+)\s*=\s*"([^"]*)"'
field_matches = re.finditer(field_pattern, fields_text)
for field_match in field_matches:
if field_match.group(1):
field_name = field_match.group(1).lower()
field_value = field_match.group(2)
else:
field_name = field_match.group(3).lower()
field_value = field_match.group(4)
fields[field_name] = field_value.strip()
entries.append({
'type': entry_type,
'key': citation_key,
'fields': fields,
'raw': match.group(0)
})
return entries
def validate_entry(self, entry: Dict) -> Tuple[List[Dict], List[Dict]]:
"""
Validate a single BibTeX entry.
Args:
entry: Entry dictionary
Returns:
Tuple of (errors, warnings)
"""
errors = []
warnings = []
entry_type = entry['type']
key = entry['key']
fields = entry['fields']
# Check required fields
if entry_type in self.required_fields:
for req_field in self.required_fields[entry_type]:
if req_field not in fields or not fields[req_field]:
# Special case: book can have author OR editor
if entry_type == 'book' and req_field == 'author':
if 'editor' not in fields or not fields['editor']:
errors.append({
'type': 'missing_required_field',
'field': 'author or editor',
'severity': 'high',
'message': f'Entry {key}: Missing required field "author" or "editor"'
})
else:
errors.append({
'type': 'missing_required_field',
'field': req_field,
'severity': 'high',
'message': f'Entry {key}: Missing required field "{req_field}"'
})
# Check recommended fields
if entry_type in self.recommended_fields:
for rec_field in self.recommended_fields[entry_type]:
if rec_field not in fields or not fields[rec_field]:
warnings.append({
'type': 'missing_recommended_field',
'field': rec_field,
'severity': 'medium',
'message': f'Entry {key}: Missing recommended field "{rec_field}"'
})
# Validate year
if 'year' in fields:
year = fields['year']
if not re.match(r'^\d{4}$', year):
errors.append({
'type': 'invalid_year',
'field': 'year',
'value': year,
'severity': 'high',
'message': f'Entry {key}: Invalid year format "{year}" (should be 4 digits)'
})
elif int(year) < 1600 or int(year) > 2030:
warnings.append({
'type': 'suspicious_year',
'field': 'year',
'value': year,
'severity': 'medium',
'message': f'Entry {key}: Suspicious year "{year}" (outside reasonable range)'
})
# Validate DOI format
if 'doi' in fields:
doi = fields['doi']
if not re.match(r'^10\.\d{4,}/[^\s]+$', doi):
warnings.append({
'type': 'invalid_doi_format',
'field': 'doi',
'value': doi,
'severity': 'medium',
'message': f'Entry {key}: Invalid DOI format "{doi}"'
})
# Check for single hyphen in pages (should be --)
if 'pages' in fields:
pages = fields['pages']
if re.search(r'\d-\d', pages) and '--' not in pages:
warnings.append({
'type': 'page_range_format',
'field': 'pages',
'value': pages,
'severity': 'low',
'message': f'Entry {key}: Page range uses single hyphen, should use -- (en-dash)'
})
# Check author format
if 'author' in fields:
author = fields['author']
if ';' in author or '&' in author:
errors.append({
'type': 'invalid_author_format',
'field': 'author',
'severity': 'high',
'message': f'Entry {key}: Authors should be separated by " and ", not ";" or "&"'
})
return errors, warnings
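# Example (hypothetical entry): an @article missing 'journal' with pages '12-19' yields a
# high-severity error (missing required field), medium warnings for missing recommended
# fields (volume, doi), and a low-severity warning for the single-hyphen page range.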
def verify_doi(self, doi: str) -> Tuple[bool, Optional[Dict]]:
"""
Verify DOI resolves correctly and get metadata.
Args:
doi: Digital Object Identifier
Returns:
Tuple of (is_valid, metadata)
"""
try:
url = f'https://doi.org/{doi}'
response = self.session.head(url, timeout=10, allow_redirects=True)
if response.status_code < 400:
# DOI resolves, now get metadata from CrossRef
crossref_url = f'https://api.crossref.org/works/{doi}'
metadata_response = self.session.get(crossref_url, timeout=10)
if metadata_response.status_code == 200:
data = metadata_response.json()
message = data.get('message', {})
# Extract key metadata
metadata = {
'title': message.get('title', [''])[0],
'year': self._extract_year_crossref(message),
'authors': self._format_authors_crossref(message.get('author', [])),
}
return True, metadata
else:
return True, None # DOI resolves but no CrossRef metadata
else:
return False, None
except Exception:
return False, None
def detect_duplicates(self, entries: List[Dict]) -> List[Dict]:
"""
Detect duplicate entries.
Args:
entries: List of entry dictionaries
Returns:
List of duplicate groups
"""
duplicates = []
# Check for duplicate DOIs
doi_map = defaultdict(list)
for entry in entries:
doi = entry['fields'].get('doi', '').strip()
if doi:
doi_map[doi].append(entry['key'])
for doi, keys in doi_map.items():
if len(keys) > 1:
duplicates.append({
'type': 'duplicate_doi',
'doi': doi,
'entries': keys,
'severity': 'high',
'message': f'Duplicate DOI {doi} found in entries: {", ".join(keys)}'
})
# Check for duplicate citation keys
key_counts = defaultdict(int)
for entry in entries:
key_counts[entry['key']] += 1
for key, count in key_counts.items():
if count > 1:
duplicates.append({
'type': 'duplicate_key',
'key': key,
'count': count,
'severity': 'high',
'message': f'Citation key "{key}" appears {count} times'
})
# Check for similar titles (possible duplicates)
titles = {}
for entry in entries:
title = entry['fields'].get('title', '').lower()
title = re.sub(r'[^\w\s]', '', title) # Remove punctuation
title = ' '.join(title.split()) # Normalize whitespace
if title:
if title in titles:
duplicates.append({
'type': 'similar_title',
'entries': [titles[title], entry['key']],
'severity': 'medium',
'message': f'Possible duplicate: "{titles[title]}" and "{entry["key"]}" have identical titles'
})
else:
titles[title] = entry['key']
return duplicates
def validate_file(self, filepath: str, check_dois: bool = False) -> Dict:
"""
Validate entire BibTeX file.
Args:
filepath: Path to BibTeX file
check_dois: Whether to verify DOIs (slow)
Returns:
Validation report dictionary
"""
print(f'Parsing {filepath}...', file=sys.stderr)
entries = self.parse_bibtex_file(filepath)
if not entries:
return {
'total_entries': 0,
'errors': [],
'warnings': [],
'duplicates': []
}
print(f'Found {len(entries)} entries', file=sys.stderr)
all_errors = []
all_warnings = []
# Validate each entry
for i, entry in enumerate(entries):
print(f'Validating entry {i+1}/{len(entries)}: {entry["key"]}', file=sys.stderr)
errors, warnings = self.validate_entry(entry)
for error in errors:
error['entry'] = entry['key']
all_errors.append(error)
for warning in warnings:
warning['entry'] = entry['key']
all_warnings.append(warning)
# Check for duplicates
print('Checking for duplicates...', file=sys.stderr)
duplicates = self.detect_duplicates(entries)
# Verify DOIs if requested
doi_errors = []
if check_dois:
print('Verifying DOIs...', file=sys.stderr)
for i, entry in enumerate(entries):
doi = entry['fields'].get('doi', '')
if doi:
print(f'Verifying DOI {i+1}: {doi}', file=sys.stderr)
is_valid, metadata = self.verify_doi(doi)
if not is_valid:
doi_errors.append({
'type': 'invalid_doi',
'entry': entry['key'],
'doi': doi,
'severity': 'high',
'message': f'Entry {entry["key"]}: DOI does not resolve: {doi}'
})
all_errors.extend(doi_errors)
return {
'filepath': filepath,
'total_entries': len(entries),
'valid_entries': len(entries) - len({e['entry'] for e in all_errors if e['severity'] == 'high'}),
'errors': all_errors,
'warnings': all_warnings,
'duplicates': duplicates
}
def _extract_year_crossref(self, message: Dict) -> str:
"""Extract year from CrossRef message."""
date_parts = message.get('published-print', {}).get('date-parts', [[]])
if not date_parts or not date_parts[0]:
date_parts = message.get('published-online', {}).get('date-parts', [[]])
if date_parts and date_parts[0]:
return str(date_parts[0][0])
return ''
def _format_authors_crossref(self, authors: List[Dict]) -> str:
"""Format author list from CrossRef."""
if not authors:
return ''
formatted = []
for author in authors[:3]: # First 3 authors
given = author.get('given', '')
family = author.get('family', '')
if family:
formatted.append(f'{family}, {given}' if given else family)
if len(authors) > 3:
formatted.append('et al.')
return ', '.join(formatted)
def main():
"""Command-line interface."""
parser = argparse.ArgumentParser(
description='Validate BibTeX files for errors and inconsistencies',
epilog='Example: python validate_citations.py references.bib'
)
parser.add_argument(
'file',
help='BibTeX file to validate'
)
parser.add_argument(
'--check-dois',
action='store_true',
help='Verify DOIs resolve correctly (slow)'
)
parser.add_argument(
'--auto-fix',
action='store_true',
help='Attempt to auto-fix common issues (not implemented yet)'
)
parser.add_argument(
'--report',
help='Output file for JSON validation report'
)
parser.add_argument(
'--verbose',
action='store_true',
help='Show detailed output'
)
args = parser.parse_args()
# Validate file
validator = CitationValidator()
report = validator.validate_file(args.file, check_dois=args.check_dois)
# Print summary
print('\n' + '='*60)
print('CITATION VALIDATION REPORT')
print('='*60)
print(f'\nFile: {args.file}')
print(f'Total entries: {report["total_entries"]}')
print(f'Valid entries: {report["valid_entries"]}')
print(f'Errors: {len(report["errors"])}')
print(f'Warnings: {len(report["warnings"])}')
print(f'Duplicates: {len(report["duplicates"])}')
# Print errors
if report['errors']:
print('\n' + '-'*60)
print('ERRORS (must fix):')
print('-'*60)
for error in report['errors']:
print(f'\n{error["message"]}')
if args.verbose:
print(f' Type: {error["type"]}')
print(f' Severity: {error["severity"]}')
# Print warnings
if report['warnings'] and args.verbose:
print('\n' + '-'*60)
print('WARNINGS (should fix):')
print('-'*60)
for warning in report['warnings']:
print(f'\n{warning["message"]}')
# Print duplicates
if report['duplicates']:
print('\n' + '-'*60)
print('DUPLICATES:')
print('-'*60)
for dup in report['duplicates']:
print(f'\n{dup["message"]}')
# Save report
if args.report:
with open(args.report, 'w', encoding='utf-8') as f:
json.dump(report, f, indent=2)
print(f'\nDetailed report saved to: {args.report}')
# Exit with error code if there are errors
if report['errors']:
sys.exit(1)
if __name__ == '__main__':
main()
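
A minimal usage sketch for the validator above (assuming the module is saved as validate_citations.py, the name used in its epilog; the input file is hypothetical):

from validate_citations import CitationValidator

validator = CitationValidator()
report = validator.validate_file('references.bib', check_dois=False)

print(f"{report['total_entries']} entries: "
      f"{len(report['errors'])} errors, "
      f"{len(report['warnings'])} warnings, "
      f"{len(report['duplicates'])} possible duplicates")
for problem in report['errors'] + report['duplicates']:
    print(problem['message'])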