Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:14 +08:00
commit 1dd5bee3b4
335 changed files with 147360 additions and 0 deletions

View File

@@ -0,0 +1,497 @@
#!/usr/bin/env python3
"""
Citation Validation Tool
Validate BibTeX files for accuracy, completeness, and format compliance.
"""
import sys
import re
import requests
import argparse
import json
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
class CitationValidator:
"""Validate BibTeX entries for errors and inconsistencies."""
def __init__(self):
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'CitationValidator/1.0 (Citation Management Tool)'
})
# Required fields by entry type
self.required_fields = {
'article': ['author', 'title', 'journal', 'year'],
'book': ['title', 'publisher', 'year'], # author OR editor
'inproceedings': ['author', 'title', 'booktitle', 'year'],
'incollection': ['author', 'title', 'booktitle', 'publisher', 'year'],
'phdthesis': ['author', 'title', 'school', 'year'],
'mastersthesis': ['author', 'title', 'school', 'year'],
'techreport': ['author', 'title', 'institution', 'year'],
'misc': ['title', 'year']
}
# Recommended fields
self.recommended_fields = {
'article': ['volume', 'pages', 'doi'],
'book': ['isbn'],
'inproceedings': ['pages'],
}
def parse_bibtex_file(self, filepath: str) -> List[Dict]:
"""
Parse BibTeX file and extract entries.
Args:
filepath: Path to BibTeX file
Returns:
List of entry dictionaries
"""
try:
with open(filepath, 'r', encoding='utf-8') as f:
content = f.read()
except Exception as e:
print(f'Error reading file: {e}', file=sys.stderr)
return []
entries = []
# Match BibTeX entries
pattern = r'@(\w+)\s*\{\s*([^,\s]+)\s*,(.*?)\n\}'
matches = re.finditer(pattern, content, re.DOTALL | re.IGNORECASE)
for match in matches:
entry_type = match.group(1).lower()
citation_key = match.group(2).strip()
fields_text = match.group(3)
# Parse fields
fields = {}
field_pattern = r'(\w+)\s*=\s*\{([^}]*)\}|(\w+)\s*=\s*"([^"]*)"'
field_matches = re.finditer(field_pattern, fields_text)
for field_match in field_matches:
if field_match.group(1):
field_name = field_match.group(1).lower()
field_value = field_match.group(2)
else:
field_name = field_match.group(3).lower()
field_value = field_match.group(4)
fields[field_name] = field_value.strip()
entries.append({
'type': entry_type,
'key': citation_key,
'fields': fields,
'raw': match.group(0)
})
return entries
def validate_entry(self, entry: Dict) -> Tuple[List[Dict], List[Dict]]:
"""
Validate a single BibTeX entry.
Args:
entry: Entry dictionary
Returns:
Tuple of (errors, warnings)
"""
errors = []
warnings = []
entry_type = entry['type']
key = entry['key']
fields = entry['fields']
# Check required fields
if entry_type in self.required_fields:
for req_field in self.required_fields[entry_type]:
if req_field not in fields or not fields[req_field]:
# Special case: book can have author OR editor
if entry_type == 'book' and req_field == 'author':
if 'editor' not in fields or not fields['editor']:
errors.append({
'type': 'missing_required_field',
'field': 'author or editor',
'severity': 'high',
'message': f'Entry {key}: Missing required field "author" or "editor"'
})
else:
errors.append({
'type': 'missing_required_field',
'field': req_field,
'severity': 'high',
'message': f'Entry {key}: Missing required field "{req_field}"'
})
# Check recommended fields
if entry_type in self.recommended_fields:
for rec_field in self.recommended_fields[entry_type]:
if rec_field not in fields or not fields[rec_field]:
warnings.append({
'type': 'missing_recommended_field',
'field': rec_field,
'severity': 'medium',
'message': f'Entry {key}: Missing recommended field "{rec_field}"'
})
# Validate year
if 'year' in fields:
year = fields['year']
if not re.match(r'^\d{4}$', year):
errors.append({
'type': 'invalid_year',
'field': 'year',
'value': year,
'severity': 'high',
'message': f'Entry {key}: Invalid year format "{year}" (should be 4 digits)'
})
elif int(year) < 1600 or int(year) > 2030:
warnings.append({
'type': 'suspicious_year',
'field': 'year',
'value': year,
'severity': 'medium',
'message': f'Entry {key}: Suspicious year "{year}" (outside reasonable range)'
})
# Validate DOI format
if 'doi' in fields:
doi = fields['doi']
if not re.match(r'^10\.\d{4,}/[^\s]+$', doi):
warnings.append({
'type': 'invalid_doi_format',
'field': 'doi',
'value': doi,
'severity': 'medium',
'message': f'Entry {key}: Invalid DOI format "{doi}"'
})
# Check for single hyphen in pages (should be --)
if 'pages' in fields:
pages = fields['pages']
if re.search(r'\d-\d', pages) and '--' not in pages:
warnings.append({
'type': 'page_range_format',
'field': 'pages',
'value': pages,
'severity': 'low',
'message': f'Entry {key}: Page range uses single hyphen, should use -- (en-dash)'
})
# Check author format
if 'author' in fields:
author = fields['author']
if ';' in author or '&' in author:
errors.append({
'type': 'invalid_author_format',
'field': 'author',
'severity': 'high',
'message': f'Entry {key}: Authors should be separated by " and ", not ";" or "&"'
})
return errors, warnings
def verify_doi(self, doi: str) -> Tuple[bool, Optional[Dict]]:
"""
Verify DOI resolves correctly and get metadata.
Args:
doi: Digital Object Identifier
Returns:
Tuple of (is_valid, metadata)
"""
try:
url = f'https://doi.org/{doi}'
response = self.session.head(url, timeout=10, allow_redirects=True)
if response.status_code < 400:
# DOI resolves, now get metadata from CrossRef
crossref_url = f'https://api.crossref.org/works/{doi}'
metadata_response = self.session.get(crossref_url, timeout=10)
if metadata_response.status_code == 200:
data = metadata_response.json()
message = data.get('message', {})
# Extract key metadata
metadata = {
'title': message.get('title', [''])[0],
'year': self._extract_year_crossref(message),
'authors': self._format_authors_crossref(message.get('author', [])),
}
return True, metadata
else:
return True, None # DOI resolves but no CrossRef metadata
else:
return False, None
except Exception:
return False, None
def detect_duplicates(self, entries: List[Dict]) -> List[Dict]:
"""
Detect duplicate entries.
Args:
entries: List of entry dictionaries
Returns:
List of duplicate groups
"""
duplicates = []
# Check for duplicate DOIs
doi_map = defaultdict(list)
for entry in entries:
doi = entry['fields'].get('doi', '').strip()
if doi:
doi_map[doi].append(entry['key'])
for doi, keys in doi_map.items():
if len(keys) > 1:
duplicates.append({
'type': 'duplicate_doi',
'doi': doi,
'entries': keys,
'severity': 'high',
'message': f'Duplicate DOI {doi} found in entries: {", ".join(keys)}'
})
# Check for duplicate citation keys
key_counts = defaultdict(int)
for entry in entries:
key_counts[entry['key']] += 1
for key, count in key_counts.items():
if count > 1:
duplicates.append({
'type': 'duplicate_key',
'key': key,
'count': count,
'severity': 'high',
'message': f'Citation key "{key}" appears {count} times'
})
# Check for similar titles (possible duplicates)
titles = {}
for entry in entries:
title = entry['fields'].get('title', '').lower()
title = re.sub(r'[^\w\s]', '', title) # Remove punctuation
title = ' '.join(title.split()) # Normalize whitespace
if title:
if title in titles:
duplicates.append({
'type': 'similar_title',
'entries': [titles[title], entry['key']],
'severity': 'medium',
'message': f'Possible duplicate: "{titles[title]}" and "{entry["key"]}" have identical titles'
})
else:
titles[title] = entry['key']
return duplicates
def validate_file(self, filepath: str, check_dois: bool = False) -> Dict:
"""
Validate entire BibTeX file.
Args:
filepath: Path to BibTeX file
check_dois: Whether to verify DOIs (slow)
Returns:
Validation report dictionary
"""
print(f'Parsing {filepath}...', file=sys.stderr)
entries = self.parse_bibtex_file(filepath)
if not entries:
return {
'total_entries': 0,
'errors': [],
'warnings': [],
'duplicates': []
}
print(f'Found {len(entries)} entries', file=sys.stderr)
all_errors = []
all_warnings = []
# Validate each entry
for i, entry in enumerate(entries):
print(f'Validating entry {i+1}/{len(entries)}: {entry["key"]}', file=sys.stderr)
errors, warnings = self.validate_entry(entry)
for error in errors:
error['entry'] = entry['key']
all_errors.append(error)
for warning in warnings:
warning['entry'] = entry['key']
all_warnings.append(warning)
# Check for duplicates
print('Checking for duplicates...', file=sys.stderr)
duplicates = self.detect_duplicates(entries)
# Verify DOIs if requested
doi_errors = []
if check_dois:
print('Verifying DOIs...', file=sys.stderr)
for i, entry in enumerate(entries):
doi = entry['fields'].get('doi', '')
if doi:
print(f'Verifying DOI {i+1}: {doi}', file=sys.stderr)
is_valid, metadata = self.verify_doi(doi)
if not is_valid:
doi_errors.append({
'type': 'invalid_doi',
'entry': entry['key'],
'doi': doi,
'severity': 'high',
'message': f'Entry {entry["key"]}: DOI does not resolve: {doi}'
})
all_errors.extend(doi_errors)
return {
'filepath': filepath,
'total_entries': len(entries),
'valid_entries': len(entries) - len([e for e in all_errors if e['severity'] == 'high']),
'errors': all_errors,
'warnings': all_warnings,
'duplicates': duplicates
}
def _extract_year_crossref(self, message: Dict) -> str:
"""Extract year from CrossRef message."""
date_parts = message.get('published-print', {}).get('date-parts', [[]])
if not date_parts or not date_parts[0]:
date_parts = message.get('published-online', {}).get('date-parts', [[]])
if date_parts and date_parts[0]:
return str(date_parts[0][0])
return ''
def _format_authors_crossref(self, authors: List[Dict]) -> str:
"""Format author list from CrossRef."""
if not authors:
return ''
formatted = []
for author in authors[:3]: # First 3 authors
given = author.get('given', '')
family = author.get('family', '')
if family:
formatted.append(f'{family}, {given}' if given else family)
if len(authors) > 3:
formatted.append('et al.')
return ', '.join(formatted)
def main():
"""Command-line interface."""
parser = argparse.ArgumentParser(
description='Validate BibTeX files for errors and inconsistencies',
epilog='Example: python validate_citations.py references.bib'
)
parser.add_argument(
'file',
help='BibTeX file to validate'
)
parser.add_argument(
'--check-dois',
action='store_true',
help='Verify DOIs resolve correctly (slow)'
)
parser.add_argument(
'--auto-fix',
action='store_true',
help='Attempt to auto-fix common issues (not implemented yet)'
)
parser.add_argument(
'--report',
help='Output file for JSON validation report'
)
parser.add_argument(
'--verbose',
action='store_true',
help='Show detailed output'
)
args = parser.parse_args()
# Validate file
validator = CitationValidator()
report = validator.validate_file(args.file, check_dois=args.check_dois)
# Print summary
print('\n' + '='*60)
print('CITATION VALIDATION REPORT')
print('='*60)
print(f'\nFile: {args.file}')
print(f'Total entries: {report["total_entries"]}')
print(f'Valid entries: {report["valid_entries"]}')
print(f'Errors: {len(report["errors"])}')
print(f'Warnings: {len(report["warnings"])}')
print(f'Duplicates: {len(report["duplicates"])}')
# Print errors
if report['errors']:
print('\n' + '-'*60)
print('ERRORS (must fix):')
print('-'*60)
for error in report['errors']:
print(f'\n{error["message"]}')
if args.verbose:
print(f' Type: {error["type"]}')
print(f' Severity: {error["severity"]}')
# Print warnings
if report['warnings'] and args.verbose:
print('\n' + '-'*60)
print('WARNINGS (should fix):')
print('-'*60)
for warning in report['warnings']:
print(f'\n{warning["message"]}')
# Print duplicates
if report['duplicates']:
print('\n' + '-'*60)
print('DUPLICATES:')
print('-'*60)
for dup in report['duplicates']:
print(f'\n{dup["message"]}')
# Save report
if args.report:
with open(args.report, 'w', encoding='utf-8') as f:
json.dump(report, f, indent=2)
print(f'\nDetailed report saved to: {args.report}')
# Exit with error code if there are errors
if report['errors']:
sys.exit(1)
if __name__ == '__main__':
main()