#!/usr/bin/env python3
"""
BibTeX Formatter and Cleaner

Format, clean, sort, and deduplicate BibTeX files.
"""

import argparse
import re
import sys
from collections import OrderedDict
from typing import Dict, List, Optional, Tuple


class BibTeXFormatter:
    """Format and clean BibTeX entries.

    An entry is a dictionary with three keys: ``'type'`` (lower-cased entry
    type), ``'key'`` (citation key) and ``'fields'`` (an ``OrderedDict``
    mapping lower-cased field names to string values).
    """

    class _RawValue(str):
        """Field value that is written back verbatim, without braces.

        Used for anything that is not a single braced/quoted literal or a
        plain number, i.e. macros (``jan``) and ``#`` concatenations.
        Wrapping those in braces would turn them into literal text.
        """

    # Blocks that use entry syntax but are not bibliography entries.
    _NON_ENTRY_TYPES = frozenset(('comment', 'preamble', 'string'))

    _ENTRY_START = re.compile(r'@(\w+)\s*([{(])')
    _KEY = re.compile(r'[^\s,{}]+')
    _FIELD_NAME = re.compile(r'[^\s{}"#,=]+')
    # Resolver URL and/or "doi:" label in front of the actual identifier.
    _DOI_PREFIX = re.compile(
        r'^\s*(?:(?:https?://(?:dx\.)?doi\.org/|doi:)\s*)+', re.IGNORECASE
    )
    # A single hyphen / en dash / em dash between two digits. The lookarounds
    # leave an existing "--" untouched.
    _PAGE_RANGE = re.compile(r'(?<=\d)\s*[-\u2013\u2014]\s*(?=\d)')

    def __init__(self):
        # Standard field order for readability
        self.field_order = [
            'author', 'editor', 'title', 'booktitle', 'journal',
            'year', 'month', 'volume', 'number', 'pages',
            'publisher', 'address', 'edition', 'series',
            'school', 'institution', 'organization',
            'howpublished', 'doi', 'url', 'isbn', 'issn',
            'note', 'abstract', 'keywords'
        ]

    # ------------------------------------------------------------------
    # Parsing
    # ------------------------------------------------------------------

    def parse_bibtex_file(self, filepath: str) -> List[Dict]:
        """
        Parse BibTeX file and extract entries.

        Parsing is brace-aware: nested braces, quoted values, bare numbers,
        macros and entries closed on the same line as their last field are
        all handled. ``@comment``, ``@preamble`` and ``@string`` blocks are
        skipped.

        Args:
            filepath: Path to BibTeX file

        Returns:
            List of entry dictionaries. An empty list is returned (and the
            error is reported on stderr) when the file cannot be read.
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f'Error reading file: {e}', file=sys.stderr)
            return []

        return self._parse_entries(content)

    def _parse_entries(self, content: str) -> List[Dict]:
        """Extract every well-formed entry from the text of a BibTeX file."""
        entries = []
        position = 0

        while True:
            start = self._ENTRY_START.search(content, position)
            if start is None:
                break

            entry_type = start.group(1).lower()
            open_index = start.end() - 1
            close_index = self._find_entry_end(content, open_index)

            if close_index < 0:
                # No matching delimiter: report it and keep scanning after
                # the header so later entries are still recovered.
                print(f'Warning: skipping unterminated @{entry_type} entry',
                      file=sys.stderr)
                position = start.end()
                continue

            position = close_index + 1
            if entry_type in self._NON_ENTRY_TYPES:
                continue

            body = content[open_index + 1:close_index]
            key_text, *field_chunks = self._split_top_level(body)
            citation_key = key_text.strip()
            if not self._KEY.fullmatch(citation_key):
                continue

            entries.append({
                'type': entry_type,
                'key': citation_key,
                'fields': self._parse_fields(field_chunks)
            })

        return entries

    @staticmethod
    def _find_entry_end(text: str, open_index: int) -> int:
        """Return the index of the delimiter closing the entry, or -1."""
        closer = '}' if text[open_index] == '{' else ')'
        depth = 0
        in_quotes = False

        for index in range(open_index + 1, len(text)):
            char = text[index]
            if char == '{':
                depth += 1
            elif char == '}':
                if depth == 0:
                    # Closes a brace-delimited entry; a stray brace inside a
                    # parenthesised entry means the entry is malformed.
                    return index if closer == '}' else -1
                depth -= 1
            elif depth == 0:
                if char == '"':
                    in_quotes = not in_quotes
                elif char == ')' and closer == ')' and not in_quotes:
                    return index

        return -1

    @staticmethod
    def _split_top_level(text: str) -> List[str]:
        """Split on commas that are outside braces and quoted strings."""
        parts = []
        depth = 0
        in_quotes = False
        start = 0

        for index, char in enumerate(text):
            if char == '{':
                depth += 1
            elif char == '}':
                depth = max(depth - 1, 0)
            elif char == '"' and depth == 0:
                in_quotes = not in_quotes
            elif char == ',' and depth == 0 and not in_quotes:
                parts.append(text[start:index])
                start = index + 1

        parts.append(text[start:])
        return parts

    def _parse_fields(self, chunks: List[str]) -> 'OrderedDict':
        """Turn ``name = value`` chunks into an ordered field mapping."""
        fields = OrderedDict()

        for chunk in chunks:
            name, separator, raw = chunk.partition('=')
            name = name.strip().lower()
            raw = raw.strip()
            # Skips the empty chunk left by a trailing comma and anything
            # that is not a "name = value" pair.
            if not separator or not raw or not self._FIELD_NAME.fullmatch(name):
                continue
            fields[name] = self._decode_value(raw)

        return fields

    def _decode_value(self, raw: str) -> str:
        """Return the text of a literal value, or a ``_RawValue`` otherwise."""
        if raw[0] in '{"' and self._literal_end(raw) == len(raw) - 1:
            return raw[1:-1].strip()
        if raw.isdigit():
            return raw
        return self._RawValue(raw)

    @staticmethod
    def _literal_end(raw: str) -> int:
        """Return the index closing the literal opened at ``raw[0]``, or -1."""
        opener = raw[0]
        depth = 0

        for index, char in enumerate(raw):
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
                if depth == 0 and opener == '{':
                    return index
            elif char == '"' and opener == '"' and depth == 0 and index > 0:
                return index

        return -1

    # ------------------------------------------------------------------
    # Formatting
    # ------------------------------------------------------------------

    def format_entry(self, entry: Dict) -> str:
        """
        Format a single BibTeX entry.

        Fields listed in ``self.field_order`` come first, in that order,
        followed by the remaining fields in their existing order. Field names
        are padded so the ``=`` signs line up.

        Args:
            entry: Entry dictionary

        Returns:
            Formatted BibTeX string
        """
        fields = entry['fields']

        names = [name for name in self.field_order if name in fields]
        placed = set(names)
        names.extend(name for name in fields if name not in placed)

        width = max(map(len, names), default=0)
        lines = [f'@{entry["type"]}{{{entry["key"]},']

        for name in names:
            value = fields[name]
            if isinstance(value, self._RawValue):
                rendered = str(value)  # macro / concatenation: keep verbatim
            else:
                rendered = f'{{{value}}}'
            lines.append(f' {name.ljust(width)} = {rendered},')

        # Remove trailing comma from the last line
        if lines[-1].endswith(','):
            lines[-1] = lines[-1][:-1]

        lines.append('}')
        return '\n'.join(lines)

    # ------------------------------------------------------------------
    # Cleaning
    # ------------------------------------------------------------------

    def fix_common_issues(self, entry: Dict) -> Dict:
        """
        Fix common formatting issues in entry.

        Normalises page ranges to ``--``, removes a leading "pp.", strips
        resolver/label prefixes from DOIs, and converts ``;`` / ``&`` author
        separators to ``and``. The input entry is not modified. Values that
        are macros or concatenations are left untouched.

        Args:
            entry: Entry dictionary

        Returns:
            Fixed entry dictionary
        """
        fixed = entry.copy()
        fields = fixed['fields'].copy()

        def is_literal(name: str) -> bool:
            return name in fields and not isinstance(fields[name], self._RawValue)

        if is_literal('pages'):
            pages = re.sub(r'^pp\.\s*', '', fields['pages'], flags=re.IGNORECASE)
            fields['pages'] = self._PAGE_RANGE.sub('--', pages)

        if is_literal('doi'):
            # Only a leading prefix is removed; the identifier is kept as is.
            fields['doi'] = self._DOI_PREFIX.sub('', fields['doi']).strip()

        if is_literal('author'):
            # Leading/trailing semicolons separate nothing, so drop them
            # before they become a dangling "and".
            author = fields['author'].strip(' ;')
            author = re.sub(r'\s*;\s*', ' and ', author)
            author = re.sub(r'\s+&\s+', ' and ', author)
            # Clean up repeated 'and's
            author = re.sub(r'\s+and(?:\s+and)+\s+', ' and ', author)
            fields['author'] = author

        fixed['fields'] = fields
        return fixed

    def _normalize_doi(self, doi: str) -> str:
        """Return a DOI in a form suitable for equality comparison."""
        # DOIs are case-insensitive, so compare them lower-cased.
        return self._DOI_PREFIX.sub('', str(doi)).strip().lower()

    def deduplicate_entries(self, entries: List[Dict]) -> List[Dict]:
        """
        Remove duplicate entries based on DOI or citation key.

        The first occurrence is kept. DOIs are compared case-insensitively
        and without resolver prefixes. An entry's DOI and key are only
        recorded once the entry is actually kept, so a discarded entry never
        causes a later one to be discarded.

        Args:
            entries: List of entry dictionaries

        Returns:
            List of unique entries
        """
        seen_dois = set()
        seen_keys = set()
        unique_entries = []

        for entry in entries:
            key = entry['key']
            raw_doi = str(entry['fields'].get('doi', '')).strip()
            doi = self._normalize_doi(raw_doi)

            # Check DOI first (more reliable)
            if doi and doi in seen_dois:
                print(f'Duplicate DOI found: {raw_doi} (skipping {key})', file=sys.stderr)
                continue

            if key in seen_keys:
                print(f'Duplicate citation key found: {key} (skipping)', file=sys.stderr)
                continue

            if doi:
                seen_dois.add(doi)
            seen_keys.add(key)
            unique_entries.append(entry)

        return unique_entries

    # ------------------------------------------------------------------
    # Sorting
    # ------------------------------------------------------------------

    @staticmethod
    def _first_author(author: str) -> str:
        """Return the text before the first ``and`` that is outside braces."""
        depth = 0
        for token in re.finditer(r'[{}]|\s+and\s+', author):
            text = token.group()
            if text == '{':
                depth += 1
            elif text == '}':
                depth -= 1
            elif depth == 0:
                return author[:token.start()]
        return author

    def sort_entries(self, entries: List[Dict], sort_by: str = 'key', descending: bool = False) -> List[Dict]:
        """
        Sort entries by specified field.

        Years are compared numerically; entries without a numeric year sort
        as 9999. Authors are compared by the surname of the first author
        ("Last, First" and "First Last" forms; a fully braced name is used
        whole); entries without an author sort as "zzz". Braces are ignored
        when comparing titles.

        Args:
            entries: List of entry dictionaries
            sort_by: Field to sort by ('key', 'year', 'author', 'title');
                any other value sorts by key
            descending: Sort in descending order

        Returns:
            Sorted list of entries
        """
        def strip_braces(text: str) -> str:
            return text.replace('{', '').replace('}', '')

        def by_key(entry: Dict) -> str:
            return entry['key'].lower()

        def by_year(entry: Dict) -> int:
            digits = re.search(r'\d+', str(entry['fields'].get('year', '')))
            return int(digits.group()) if digits else 9999

        def by_author(entry: Dict) -> str:
            first = self._first_author(str(entry['fields'].get('author', ''))).strip()
            if first.startswith('{') and first.endswith('}'):
                # Braced (e.g. corporate) names are a single unit.
                return strip_braces(first).strip().lower() or 'zzz'
            first = strip_braces(first).strip()
            if not first:
                return 'zzz'
            if ',' in first:
                return first.split(',')[0].strip().lower()
            return first.split()[-1].lower()

        def by_title(entry: Dict) -> str:
            return strip_braces(str(entry['fields'].get('title', ''))).lower()

        extractors = {
            'key': by_key,
            'year': by_year,
            'author': by_author,
            'title': by_title,
        }
        return sorted(entries, key=extractors.get(sort_by, by_key), reverse=descending)

    # ------------------------------------------------------------------
    # Whole-file pipeline
    # ------------------------------------------------------------------

    def format_file(self, filepath: str, output: Optional[str] = None,
                    deduplicate: bool = False, sort_by: Optional[str] = None,
                    descending: bool = False, fix_issues: bool = True) -> None:
        """
        Format entire BibTeX file.

        Progress is reported on stderr. Nothing is written when no entries
        are found. The process exits with status 1 if the output cannot be
        written.

        Args:
            filepath: Input BibTeX file
            output: Output file (None for in-place)
            deduplicate: Remove duplicates
            sort_by: Field to sort by
            descending: Sort in descending order
            fix_issues: Fix common formatting issues
        """
        print(f'Parsing {filepath}...', file=sys.stderr)
        entries = self.parse_bibtex_file(filepath)

        if not entries:
            print('No entries found', file=sys.stderr)
            return

        print(f'Found {len(entries)} entries', file=sys.stderr)

        # Fix common issues
        if fix_issues:
            print('Fixing common issues...', file=sys.stderr)
            entries = [self.fix_common_issues(e) for e in entries]

        # Deduplicate
        if deduplicate:
            print('Removing duplicates...', file=sys.stderr)
            original_count = len(entries)
            entries = self.deduplicate_entries(entries)
            removed = original_count - len(entries)
            if removed > 0:
                print(f'Removed {removed} duplicate(s)', file=sys.stderr)

        # Sort
        if sort_by:
            print(f'Sorting by {sort_by}...', file=sys.stderr)
            entries = self.sort_entries(entries, sort_by, descending)

        # Format entries
        print('Formatting entries...', file=sys.stderr)
        formatted_entries = [self.format_entry(e) for e in entries]

        # Write output
        output_content = '\n\n'.join(formatted_entries) + '\n'

        output_file = output or filepath
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(output_content)
        except OSError as e:
            print(f'Error writing file: {e}', file=sys.stderr)
            sys.exit(1)

        print(f'Successfully wrote {len(entries)} entries to {output_file}', file=sys.stderr)
def main():
    """Read the command-line options and format the requested BibTeX file."""
    cli = argparse.ArgumentParser(
        description='Format, clean, sort, and deduplicate BibTeX files',
        epilog='Example: python format_bibtex.py references.bib --deduplicate --sort year',
    )

    # Input file and optional output destination.
    cli.add_argument('file', help='BibTeX file to format')
    cli.add_argument('-o', '--output', help='Output file (default: overwrite input file)')

    # Processing switches.
    cli.add_argument('--deduplicate', action='store_true', help='Remove duplicate entries')
    cli.add_argument('--sort', choices=['key', 'year', 'author', 'title'], help='Sort entries by field')
    cli.add_argument('--descending', action='store_true', help='Sort in descending order')
    cli.add_argument('--no-fix', action='store_true', help='Do not fix common issues')

    options = cli.parse_args()

    # The flag is negative on the command line but positive in the API.
    apply_fixes = not options.no_fix

    BibTeXFormatter().format_file(
        options.file,
        output=options.output,
        deduplicate=options.deduplicate,
        sort_by=options.sort,
        descending=options.descending,
        fix_issues=apply_fixes,
    )


# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()