#!/usr/bin/env python3
"""
Citation Validation Tool

Validate BibTeX files for accuracy, completeness, and format compliance.
"""
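# Example invocation (illustrative; the file name is a placeholder):
#
#   python validate_citations.py references.bib --check-dois --report report.json
#
# The script exits with a nonzero status when errors are found, so it can
# also serve as a CI or pre-commit check.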
import argparse
import json
import re
import sys
from collections import defaultdict
from typing import Dict, List, Tuple, Optional

import requests

class CitationValidator:
    """Validate BibTeX entries for errors and inconsistencies."""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'CitationValidator/1.0 (Citation Management Tool)'
        })

        # Required fields by entry type. 'author' is listed for book so that
        # validate_entry can apply its author-OR-editor special case.
        self.required_fields = {
            'article': ['author', 'title', 'journal', 'year'],
            'book': ['author', 'title', 'publisher', 'year'],  # author OR editor
            'inproceedings': ['author', 'title', 'booktitle', 'year'],
            'incollection': ['author', 'title', 'booktitle', 'publisher', 'year'],
            'phdthesis': ['author', 'title', 'school', 'year'],
            'mastersthesis': ['author', 'title', 'school', 'year'],
            'techreport': ['author', 'title', 'institution', 'year'],
            'misc': ['title', 'year']
        }

        # Recommended (but not required) fields by entry type
        self.recommended_fields = {
            'article': ['volume', 'pages', 'doi'],
            'book': ['isbn'],
            'inproceedings': ['pages'],
        }
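    # For reference, a minimal article entry that passes the required-field
    # checks above (all values illustrative):
    #
    #   @article{smith2020,
    #     author  = {Smith, Jane and Doe, John},
    #     title   = {An Example Title},
    #     journal = {Journal of Examples},
    #     year    = {2020},
    #   }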
    def parse_bibtex_file(self, filepath: str) -> List[Dict]:
        """
        Parse a BibTeX file and extract entries.

        Args:
            filepath: Path to BibTeX file

        Returns:
            List of entry dictionaries
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            print(f'Error reading file: {e}', file=sys.stderr)
            return []

        entries = []

        # Match BibTeX entries of the form @type{key, ...}.
        # Note: this regex-based parser does not handle nested braces in field
        # values or @string/@comment directives; a dedicated parser (e.g. the
        # bibtexparser library) is more robust for complex files.
        pattern = r'@(\w+)\s*\{\s*([^,\s]+)\s*,(.*?)\n\}'
        matches = re.finditer(pattern, content, re.DOTALL | re.IGNORECASE)

        for match in matches:
            entry_type = match.group(1).lower()
            citation_key = match.group(2).strip()
            fields_text = match.group(3)

            # Parse fields: accepts both {braced} and "quoted" values
            fields = {}
            field_pattern = r'(\w+)\s*=\s*\{([^}]*)\}|(\w+)\s*=\s*"([^"]*)"'
            field_matches = re.finditer(field_pattern, fields_text)

            for field_match in field_matches:
                if field_match.group(1):
                    field_name = field_match.group(1).lower()
                    field_value = field_match.group(2)
                else:
                    field_name = field_match.group(3).lower()
                    field_value = field_match.group(4)

                fields[field_name] = field_value.strip()

            entries.append({
                'type': entry_type,
                'key': citation_key,
                'fields': fields,
                'raw': match.group(0)
            })

        return entries
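    # Each parsed entry is a plain dict of this shape (values illustrative):
    #   {'type': 'article', 'key': 'smith2020',
    #    'fields': {'author': 'Smith, Jane', 'year': '2020', ...},
    #    'raw': '@article{smith2020, ...}'}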
    def validate_entry(self, entry: Dict) -> Tuple[List[Dict], List[Dict]]:
        """
        Validate a single BibTeX entry.

        Args:
            entry: Entry dictionary

        Returns:
            Tuple of (errors, warnings)
        """
        errors = []
        warnings = []

        entry_type = entry['type']
        key = entry['key']
        fields = entry['fields']

        # Check required fields
        if entry_type in self.required_fields:
            for req_field in self.required_fields[entry_type]:
                if req_field not in fields or not fields[req_field]:
                    # Special case: a book may have an author OR an editor
                    if entry_type == 'book' and req_field == 'author':
                        if 'editor' not in fields or not fields['editor']:
                            errors.append({
                                'type': 'missing_required_field',
                                'field': 'author or editor',
                                'severity': 'high',
                                'message': f'Entry {key}: Missing required field "author" or "editor"'
                            })
                    else:
                        errors.append({
                            'type': 'missing_required_field',
                            'field': req_field,
                            'severity': 'high',
                            'message': f'Entry {key}: Missing required field "{req_field}"'
                        })

        # Check recommended fields
        if entry_type in self.recommended_fields:
            for rec_field in self.recommended_fields[entry_type]:
                if rec_field not in fields or not fields[rec_field]:
                    warnings.append({
                        'type': 'missing_recommended_field',
                        'field': rec_field,
                        'severity': 'medium',
                        'message': f'Entry {key}: Missing recommended field "{rec_field}"'
                    })

        # Validate year
        if 'year' in fields:
            year = fields['year']
            if not re.match(r'^\d{4}$', year):
                errors.append({
                    'type': 'invalid_year',
                    'field': 'year',
                    'value': year,
                    'severity': 'high',
                    'message': f'Entry {key}: Invalid year format "{year}" (should be 4 digits)'
                })
            elif int(year) < 1600 or int(year) > 2030:
                warnings.append({
                    'type': 'suspicious_year',
                    'field': 'year',
                    'value': year,
                    'severity': 'medium',
                    'message': f'Entry {key}: Suspicious year "{year}" (outside reasonable range)'
                })

        # Validate DOI format
        if 'doi' in fields:
            doi = fields['doi']
            if not re.match(r'^10\.\d{4,}/[^\s]+$', doi):
                warnings.append({
                    'type': 'invalid_doi_format',
                    'field': 'doi',
                    'value': doi,
                    'severity': 'medium',
                    'message': f'Entry {key}: Invalid DOI format "{doi}"'
                })

        # Check for a single hyphen in page ranges (BibTeX expects --)
        if 'pages' in fields:
            pages = fields['pages']
            if re.search(r'\d-\d', pages) and '--' not in pages:
                warnings.append({
                    'type': 'page_range_format',
                    'field': 'pages',
                    'value': pages,
                    'severity': 'low',
                    'message': f'Entry {key}: Page range uses single hyphen, should use -- (en-dash)'
                })

        # Check author format
        if 'author' in fields:
            author = fields['author']
            if ';' in author or '&' in author:
                errors.append({
                    'type': 'invalid_author_format',
                    'field': 'author',
                    'severity': 'high',
                    'message': f'Entry {key}: Authors should be separated by " and ", not ";" or "&"'
                })

        return errors, warnings
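    # Each error or warning produced above is a dict such as (illustrative):
    #   {'type': 'missing_required_field', 'field': 'journal', 'severity': 'high',
    #    'message': 'Entry smith2020: Missing required field "journal"'}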
    def verify_doi(self, doi: str) -> Tuple[bool, Optional[Dict]]:
        """
        Verify that a DOI resolves and fetch its metadata.

        Args:
            doi: Digital Object Identifier

        Returns:
            Tuple of (is_valid, metadata)
        """
        try:
            # A HEAD request is enough to see whether the DOI resolves
            url = f'https://doi.org/{doi}'
            response = self.session.head(url, timeout=10, allow_redirects=True)

            if response.status_code < 400:
                # DOI resolves; now fetch metadata from CrossRef
                crossref_url = f'https://api.crossref.org/works/{doi}'
                metadata_response = self.session.get(crossref_url, timeout=10)

                if metadata_response.status_code == 200:
                    data = metadata_response.json()
                    message = data.get('message', {})

                    # Extract key metadata; 'title' can be missing or an
                    # empty list in CrossRef records
                    metadata = {
                        'title': (message.get('title') or [''])[0],
                        'year': self._extract_year_crossref(message),
                        'authors': self._format_authors_crossref(message.get('author', [])),
                    }
                    return True, metadata
                else:
                    return True, None  # DOI resolves but has no CrossRef metadata
            else:
                return False, None

        except Exception:
            # Note: network failures and timeouts are reported the same way
            # as a non-resolving DOI
            return False, None
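    # Illustrative use (requires network access; the DOI is a placeholder):
    #   ok, meta = CitationValidator().verify_doi('10.1000/xyz123')
    #   if ok and meta:
    #       print(meta['title'], meta['year'], meta['authors'])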
    def detect_duplicates(self, entries: List[Dict]) -> List[Dict]:
        """
        Detect duplicate entries.

        Args:
            entries: List of entry dictionaries

        Returns:
            List of duplicate groups
        """
        duplicates = []

        # Check for duplicate DOIs
        doi_map = defaultdict(list)
        for entry in entries:
            doi = entry['fields'].get('doi', '').strip()
            if doi:
                doi_map[doi].append(entry['key'])

        for doi, keys in doi_map.items():
            if len(keys) > 1:
                duplicates.append({
                    'type': 'duplicate_doi',
                    'doi': doi,
                    'entries': keys,
                    'severity': 'high',
                    'message': f'Duplicate DOI {doi} found in entries: {", ".join(keys)}'
                })

        # Check for duplicate citation keys
        key_counts = defaultdict(int)
        for entry in entries:
            key_counts[entry['key']] += 1

        for key, count in key_counts.items():
            if count > 1:
                duplicates.append({
                    'type': 'duplicate_key',
                    'key': key,
                    'count': count,
                    'severity': 'high',
                    'message': f'Citation key "{key}" appears {count} times'
                })

        # Check for identical normalized titles (possible duplicates)
        titles = {}
        for entry in entries:
            title = entry['fields'].get('title', '').lower()
            title = re.sub(r'[^\w\s]', '', title)  # Remove punctuation
            title = ' '.join(title.split())  # Normalize whitespace

            if title:
                if title in titles:
                    duplicates.append({
                        'type': 'similar_title',
                        'entries': [titles[title], entry['key']],
                        'severity': 'medium',
                        'message': f'Possible duplicate: "{titles[title]}" and "{entry["key"]}" have identical titles after normalization'
                    })
                else:
                    titles[title] = entry['key']

        return duplicates
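    # Illustrative normalization: 'The TeX Book!' and 'the tex book' both
    # reduce to 'the tex book', so the pair would be flagged as a possible
    # duplicate.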
    def validate_file(self, filepath: str, check_dois: bool = False) -> Dict:
        """
        Validate an entire BibTeX file.

        Args:
            filepath: Path to BibTeX file
            check_dois: Whether to verify DOIs (slow)

        Returns:
            Validation report dictionary
        """
        print(f'Parsing {filepath}...', file=sys.stderr)
        entries = self.parse_bibtex_file(filepath)

        if not entries:
            # Keep the report shape consistent so callers can always read
            # the same keys
            return {
                'filepath': filepath,
                'total_entries': 0,
                'valid_entries': 0,
                'errors': [],
                'warnings': [],
                'duplicates': []
            }

        print(f'Found {len(entries)} entries', file=sys.stderr)

        all_errors = []
        all_warnings = []

        # Validate each entry
        for i, entry in enumerate(entries):
            print(f'Validating entry {i+1}/{len(entries)}: {entry["key"]}', file=sys.stderr)
            errors, warnings = self.validate_entry(entry)

            for error in errors:
                error['entry'] = entry['key']
                all_errors.append(error)

            for warning in warnings:
                warning['entry'] = entry['key']
                all_warnings.append(warning)

        # Check for duplicates
        print('Checking for duplicates...', file=sys.stderr)
        duplicates = self.detect_duplicates(entries)

        # Verify DOIs if requested
        doi_errors = []
        if check_dois:
            print('Verifying DOIs...', file=sys.stderr)
            for i, entry in enumerate(entries):
                doi = entry['fields'].get('doi', '')
                if doi:
                    print(f'Verifying DOI {i+1}: {doi}', file=sys.stderr)
                    is_valid, metadata = self.verify_doi(doi)

                    if not is_valid:
                        doi_errors.append({
                            'type': 'invalid_doi',
                            'entry': entry['key'],
                            'doi': doi,
                            'severity': 'high',
                            'message': f'Entry {entry["key"]}: DOI does not resolve: {doi}'
                        })

        all_errors.extend(doi_errors)

        # An entry is valid if it has no high-severity errors; a set avoids
        # double-counting entries that carry several errors
        invalid_keys = {e['entry'] for e in all_errors if e['severity'] == 'high'}

        return {
            'filepath': filepath,
            'total_entries': len(entries),
            'valid_entries': len(entries) - len(invalid_keys),
            'errors': all_errors,
            'warnings': all_warnings,
            'duplicates': duplicates
        }
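    # The returned report (what --report serializes to JSON) has this shape
    # (counts illustrative):
    #   {'filepath': 'references.bib', 'total_entries': 42, 'valid_entries': 40,
    #    'errors': [...], 'warnings': [...], 'duplicates': [...]}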
    def _extract_year_crossref(self, message: Dict) -> str:
        """Extract the publication year from a CrossRef message."""
        date_parts = message.get('published-print', {}).get('date-parts', [[]])
        if not date_parts or not date_parts[0]:
            date_parts = message.get('published-online', {}).get('date-parts', [[]])

        if date_parts and date_parts[0]:
            return str(date_parts[0][0])
        return ''

    def _format_authors_crossref(self, authors: List[Dict]) -> str:
        """Format the author list from a CrossRef message."""
        if not authors:
            return ''

        formatted = []
        for author in authors[:3]:  # First 3 authors only
            given = author.get('given', '')
            family = author.get('family', '')
            if family:
                formatted.append(f'{family}, {given}' if given else family)

        if len(authors) > 3:
            formatted.append('et al.')

        return ', '.join(formatted)
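    # Illustrative output for four CrossRef authors:
    #   'Doe, John, Smith, Jane, Roe, Richard, et al.'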
def main():
    """Command-line interface."""
    parser = argparse.ArgumentParser(
        description='Validate BibTeX files for errors and inconsistencies',
        epilog='Example: python validate_citations.py references.bib'
    )

    parser.add_argument(
        'file',
        help='BibTeX file to validate'
    )

    parser.add_argument(
        '--check-dois',
        action='store_true',
        help='Verify DOIs resolve correctly (slow)'
    )

    parser.add_argument(
        '--auto-fix',
        action='store_true',
        help='Attempt to auto-fix common issues (not implemented yet)'
    )

    parser.add_argument(
        '--report',
        help='Output file for JSON validation report'
    )

    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed output'
    )

    args = parser.parse_args()

    # Validate file
    validator = CitationValidator()
    report = validator.validate_file(args.file, check_dois=args.check_dois)

    # Print summary
    print('\n' + '=' * 60)
    print('CITATION VALIDATION REPORT')
    print('=' * 60)
    print(f'\nFile: {args.file}')
    print(f'Total entries: {report["total_entries"]}')
    print(f'Valid entries: {report["valid_entries"]}')
    print(f'Errors: {len(report["errors"])}')
    print(f'Warnings: {len(report["warnings"])}')
    print(f'Duplicates: {len(report["duplicates"])}')

    # Print errors
    if report['errors']:
        print('\n' + '-' * 60)
        print('ERRORS (must fix):')
        print('-' * 60)
        for error in report['errors']:
            print(f'\n{error["message"]}')
            if args.verbose:
                print(f'  Type: {error["type"]}')
                print(f'  Severity: {error["severity"]}')

    # Print warnings (only with --verbose)
    if report['warnings'] and args.verbose:
        print('\n' + '-' * 60)
        print('WARNINGS (should fix):')
        print('-' * 60)
        for warning in report['warnings']:
            print(f'\n{warning["message"]}')

    # Print duplicates
    if report['duplicates']:
        print('\n' + '-' * 60)
        print('DUPLICATES:')
        print('-' * 60)
        for dup in report['duplicates']:
            print(f'\n{dup["message"]}')

    # Save JSON report if requested
    if args.report:
        with open(args.report, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)
        print(f'\nDetailed report saved to: {args.report}')

    # Exit nonzero if there are errors (warnings and duplicates alone do
    # not affect the exit status)
    if report['errors']:
        sys.exit(1)


if __name__ == '__main__':
    main()