Initial commit
349
skills/citation-management/scripts/format_bibtex.py
Executable file
@@ -0,0 +1,349 @@
#!/usr/bin/env python3
"""
BibTeX Formatter and Cleaner
Format, clean, sort, and deduplicate BibTeX files.
"""

import sys
import re
import argparse
from typing import List, Dict, Tuple
from collections import OrderedDict

class BibTeXFormatter:
    """Format and clean BibTeX entries."""

    def __init__(self):
        # Standard field order for readability
        self.field_order = [
            'author', 'editor', 'title', 'booktitle', 'journal',
            'year', 'month', 'volume', 'number', 'pages',
            'publisher', 'address', 'edition', 'series',
            'school', 'institution', 'organization',
            'howpublished', 'doi', 'url', 'isbn', 'issn',
            'note', 'abstract', 'keywords'
        ]

    def parse_bibtex_file(self, filepath: str) -> List[Dict]:
        """
        Parse a BibTeX file and extract its entries.

        Args:
            filepath: Path to BibTeX file

        Returns:
            List of entry dictionaries
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            print(f'Error reading file: {e}', file=sys.stderr)
            return []

        entries = []

        # Match BibTeX entries; assumes each entry's closing brace starts a line
        pattern = r'@(\w+)\s*\{\s*([^,\s]+)\s*,(.*?)\n\}'
        matches = re.finditer(pattern, content, re.DOTALL | re.IGNORECASE)

        for match in matches:
            entry_type = match.group(1).lower()
            citation_key = match.group(2).strip()
            fields_text = match.group(3)

            # Parse fields delimited by {braces} or "quotes"
            # (values containing nested braces are not handled)
            fields = OrderedDict()
            field_pattern = r'(\w+)\s*=\s*\{([^}]*)\}|(\w+)\s*=\s*"([^"]*)"'
            field_matches = re.finditer(field_pattern, fields_text)

            for field_match in field_matches:
                if field_match.group(1):
                    field_name = field_match.group(1).lower()
                    field_value = field_match.group(2)
                else:
                    field_name = field_match.group(3).lower()
                    field_value = field_match.group(4)

                fields[field_name] = field_value.strip()

            entries.append({
                'type': entry_type,
                'key': citation_key,
                'fields': fields
            })

        return entries

    def format_entry(self, entry: Dict) -> str:
        """
        Format a single BibTeX entry.

        Args:
            entry: Entry dictionary

        Returns:
            Formatted BibTeX string
        """
        lines = [f'@{entry["type"]}{{{entry["key"]},']

        # Order fields according to standard order
        ordered_fields = OrderedDict()

        # Add fields in standard order
        for field_name in self.field_order:
            if field_name in entry['fields']:
                ordered_fields[field_name] = entry['fields'][field_name]

        # Add any remaining fields
        for field_name, field_value in entry['fields'].items():
            if field_name not in ordered_fields:
                ordered_fields[field_name] = field_value

        # Format each field
        max_field_len = max(len(f) for f in ordered_fields.keys()) if ordered_fields else 0

        for field_name, field_value in ordered_fields.items():
            # Pad field name for alignment
            padded_field = field_name.ljust(max_field_len)
            lines.append(f' {padded_field} = {{{field_value}}},')

        # Remove trailing comma from last field
        if lines[-1].endswith(','):
            lines[-1] = lines[-1][:-1]

        lines.append('}')

        return '\n'.join(lines)

    def fix_common_issues(self, entry: Dict) -> Dict:
        """
        Fix common formatting issues in an entry.

        Args:
            entry: Entry dictionary

        Returns:
            Fixed entry dictionary
        """
        fixed = entry.copy()
        fields = fixed['fields'].copy()

        # Fix page ranges (single hyphen to double hyphen)
        if 'pages' in fields:
            pages = fields['pages']
            # Replace single hyphen with double hyphen if it's a range
            if re.search(r'\d-\d', pages) and '--' not in pages:
                pages = re.sub(r'(\d)-(\d)', r'\1--\2', pages)
            fields['pages'] = pages

        # Remove "pp." prefix from pages
        if 'pages' in fields:
            pages = fields['pages']
            pages = re.sub(r'^pp\.\s*', '', pages, flags=re.IGNORECASE)
            fields['pages'] = pages

        # Fix DOI (remove URL prefix if present)
        if 'doi' in fields:
            doi = fields['doi']
            doi = doi.replace('https://doi.org/', '')
            doi = doi.replace('http://doi.org/', '')
            doi = doi.replace('doi:', '')
            fields['doi'] = doi

        # Fix author separators (semicolon or ampersand to 'and')
        if 'author' in fields:
            author = fields['author']
            author = re.sub(r'\s*;\s*', ' and ', author)
            author = author.replace(' & ', ' and ')
            # Clean up multiple 'and's
            author = re.sub(r'\s+and\s+and\s+', ' and ', author)
            fields['author'] = author

        fixed['fields'] = fields
        return fixed

    def deduplicate_entries(self, entries: List[Dict]) -> List[Dict]:
        """
        Remove duplicate entries based on DOI or citation key.

        Args:
            entries: List of entry dictionaries

        Returns:
            List of unique entries
        """
        seen_dois = set()
        seen_keys = set()
        unique_entries = []

        for entry in entries:
            doi = entry['fields'].get('doi', '').strip()
            key = entry['key']

            # Check DOI first (more reliable)
            if doi:
                if doi in seen_dois:
                    print(f'Duplicate DOI found: {doi} (skipping {key})', file=sys.stderr)
                    continue
                seen_dois.add(doi)

            # Check citation key
            if key in seen_keys:
                print(f'Duplicate citation key found: {key} (skipping)', file=sys.stderr)
                continue
            seen_keys.add(key)

            unique_entries.append(entry)

        return unique_entries

    def sort_entries(self, entries: List[Dict], sort_by: str = 'key', descending: bool = False) -> List[Dict]:
        """
        Sort entries by the specified field.

        Args:
            entries: List of entry dictionaries
            sort_by: Field to sort by ('key', 'year', 'author', 'title')
            descending: Sort in descending order

        Returns:
            Sorted list of entries
        """
        def get_sort_key(entry: Dict) -> str:
            if sort_by == 'key':
                return entry['key'].lower()
            elif sort_by == 'year':
                # String comparison is fine for four-digit years;
                # entries without a year sort last
                year = entry['fields'].get('year', '9999')
                return year
            elif sort_by == 'author':
                author = entry['fields'].get('author', 'ZZZ')
                # Use the surname when the first author is written 'Last, First';
                # otherwise fall back to the first word of the field
                if ',' in author:
                    return author.split(',')[0].lower()
                else:
                    return author.split()[0].lower() if author else 'zzz'
            elif sort_by == 'title':
                return entry['fields'].get('title', '').lower()
            else:
                return entry['key'].lower()

        return sorted(entries, key=get_sort_key, reverse=descending)

    def format_file(self, filepath: str, output: str = None,
                    deduplicate: bool = False, sort_by: str = None,
                    descending: bool = False, fix_issues: bool = True) -> None:
        """
        Format an entire BibTeX file.

        Args:
            filepath: Input BibTeX file
            output: Output file (None for in-place)
            deduplicate: Remove duplicates
            sort_by: Field to sort by
            descending: Sort in descending order
            fix_issues: Fix common formatting issues
        """
        print(f'Parsing {filepath}...', file=sys.stderr)
        entries = self.parse_bibtex_file(filepath)

        if not entries:
            print('No entries found', file=sys.stderr)
            return

        print(f'Found {len(entries)} entries', file=sys.stderr)

        # Fix common issues
        if fix_issues:
            print('Fixing common issues...', file=sys.stderr)
            entries = [self.fix_common_issues(e) for e in entries]

        # Deduplicate
        if deduplicate:
            print('Removing duplicates...', file=sys.stderr)
            original_count = len(entries)
            entries = self.deduplicate_entries(entries)
            removed = original_count - len(entries)
            if removed > 0:
                print(f'Removed {removed} duplicate(s)', file=sys.stderr)

        # Sort
        if sort_by:
            print(f'Sorting by {sort_by}...', file=sys.stderr)
            entries = self.sort_entries(entries, sort_by, descending)

        # Format entries
        print('Formatting entries...', file=sys.stderr)
        formatted_entries = [self.format_entry(e) for e in entries]

        # Write output (in-place when no output path is given)
        output_content = '\n\n'.join(formatted_entries) + '\n'

        output_file = output or filepath
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(output_content)
            print(f'Successfully wrote {len(entries)} entries to {output_file}', file=sys.stderr)
        except Exception as e:
            print(f'Error writing file: {e}', file=sys.stderr)
            sys.exit(1)


def main():
    """Command-line interface."""
    parser = argparse.ArgumentParser(
        description='Format, clean, sort, and deduplicate BibTeX files',
        epilog='Example: python format_bibtex.py references.bib --deduplicate --sort year'
    )

    parser.add_argument(
        'file',
        help='BibTeX file to format'
    )

    parser.add_argument(
        '-o', '--output',
        help='Output file (default: overwrite input file)'
    )

    parser.add_argument(
        '--deduplicate',
        action='store_true',
        help='Remove duplicate entries'
    )

    parser.add_argument(
        '--sort',
        choices=['key', 'year', 'author', 'title'],
        help='Sort entries by field'
    )

    parser.add_argument(
        '--descending',
        action='store_true',
        help='Sort in descending order'
    )

    parser.add_argument(
        '--no-fix',
        action='store_true',
        help='Do not fix common issues'
    )

    args = parser.parse_args()

    # Format file
    formatter = BibTeXFormatter()
    formatter.format_file(
        args.file,
        output=args.output,
        deduplicate=args.deduplicate,
        sort_by=args.sort,
        descending=args.descending,
        fix_issues=not args.no_fix
    )


if __name__ == '__main__':
    main()
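For reference, a minimal sketch of driving the formatter from Python instead of the command line; it assumes the script's directory is on the import path, and references.bib is an illustrative filename:

from format_bibtex import BibTeXFormatter

formatter = BibTeXFormatter()
# Parse, clean, deduplicate, sort by year, then print the formatted entries
entries = [formatter.fix_common_issues(e)
           for e in formatter.parse_bibtex_file('references.bib')]
entries = formatter.deduplicate_entries(entries)
for entry in formatter.sort_entries(entries, sort_by='year'):
    print(formatter.format_entry(entry))
    print()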