Initial commit

2025-11-30 08:30:14 +08:00
commit 1dd5bee3b4
335 changed files with 147360 additions and 0 deletions
--- a/skills/citation-management/scripts/validate_citations.py
+++ b/skills/citation-management/scripts/validate_citations.py
@@ -0,0 +1,497 @@
+#!/usr/bin/env python3
+"""
+Citation Validation Tool
+Validate BibTeX files for accuracy, completeness, and format compliance.
+"""
+
+import sys
+import re
+import requests
+import argparse
+import json
+from typing import Dict, List, Tuple, Optional
+from collections import defaultdict
+
+class CitationValidator:
+    """Validate BibTeX entries for errors and inconsistencies.
+
+    Entries are read with a small brace-aware scanner (not a full BibTeX
+    implementation), checked field by field, compared with each other to
+    find duplicates and, on request, checked against doi.org and CrossRef.
+    """
+
+    # Inclusive range of years that is accepted without a warning.
+    MIN_YEAR = 1600
+    MAX_YEAR = 2030
+
+    # "@name{...}" directives that are not bibliography entries.
+    NON_ENTRY_TYPES = frozenset({'comment', 'preamble', 'string'})
+
+    # "@type{key," -- the key may not contain braces, commas or whitespace.
+    _ENTRY_HEADER = re.compile(r'@(\w+)\s*\{\s*([^,\s{}]+)\s*,')
+    # "name =" introducing a field value.
+    _FIELD_NAME = re.compile(r'(\w[\w-]*)\s*=\s*')
+    # Undelimited value such as a number (year = 2020) or a macro (jan).
+    _BARE_VALUE = re.compile(r'[^\s,#]+')
+    # "#" joins several value pieces into one field value.
+    _CONCAT = re.compile(r'\s*#\s*')
+    _BRACE = re.compile(r'[{}]')
+    _BRACE_OR_QUOTE = re.compile(r'[{}"]')
+
+    def __init__(self):
+        """Create the HTTP session and the per-type field tables."""
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': 'CitationValidator/1.0 (Citation Management Tool)'
+        })
+
+        # Required fields by entry type. A book additionally needs an
+        # author OR an editor; validate_entry checks that separately.
+        self.required_fields = {
+            'article': ['author', 'title', 'journal', 'year'],
+            'book': ['title', 'publisher', 'year'],
+            'inproceedings': ['author', 'title', 'booktitle', 'year'],
+            'incollection': ['author', 'title', 'booktitle', 'publisher', 'year'],
+            'phdthesis': ['author', 'title', 'school', 'year'],
+            'mastersthesis': ['author', 'title', 'school', 'year'],
+            'techreport': ['author', 'title', 'institution', 'year'],
+            'misc': ['title', 'year']
+        }
+
+        # Recommended fields: their absence is a warning, not an error.
+        self.recommended_fields = {
+            'article': ['volume', 'pages', 'doi'],
+            'book': ['isbn'],
+            'inproceedings': ['pages'],
+        }
+
+    @classmethod
+    def _find_closing_brace(cls, text: str, start: int) -> Optional[int]:
+        """Return the index of the brace closing a group opened before start.
+
+        Scanning begins at ``start`` with one brace already open. Returns
+        None when the braces never balance.
+        """
+        depth = 1
+        for brace in cls._BRACE.finditer(text, start):
+            depth += 1 if brace.group() == '{' else -1
+            if depth == 0:
+                return brace.start()
+        return None
+
+    @classmethod
+    def _find_closing_quote(cls, text: str, start: int) -> Optional[int]:
+        """Return the index of the quote ending a string that began before start.
+
+        A double quote nested inside braces (e.g. ``{\\"u}``) does not end
+        the string. Returns None when no closing quote is found.
+        """
+        depth = 0
+        for token in cls._BRACE_OR_QUOTE.finditer(text, start):
+            char = token.group()
+            if char == '{':
+                depth += 1
+            elif char == '}':
+                depth = max(depth - 1, 0)
+            elif depth == 0:
+                return token.start()
+        return None
+
+    def _read_value(self, text: str, pos: int) -> Tuple[Optional[str], int]:
+        """Read one value piece ({...}, "..." or a bare token) at pos.
+
+        Returns (value, position after the value), or (None, pos) when no
+        complete value starts at ``pos``.
+        """
+        opener = text[pos:pos + 1]
+        if opener == '{':
+            end = self._find_closing_brace(text, pos + 1)
+        elif opener == '"':
+            end = self._find_closing_quote(text, pos + 1)
+        else:
+            bare = self._BARE_VALUE.match(text, pos)
+            if bare is None:
+                return None, pos
+            return bare.group(), bare.end()
+
+        if end is None:
+            return None, pos
+        return text[pos + 1:end], end + 1
+
+    def _parse_fields(self, fields_text: str) -> Dict[str, str]:
+        """Extract ``name = value`` pairs from the body of one entry.
+
+        Field names are lower-cased and values are stripped of surrounding
+        whitespace. Nested braces are kept in the value. Pieces joined with
+        ``#`` are concatenated as written (macros are not expanded). When a
+        field occurs twice, the last occurrence wins.
+        """
+        fields = {}
+        pos = 0
+        while True:
+            name_match = self._FIELD_NAME.search(fields_text, pos)
+            if name_match is None:
+                break
+            # The name match is never empty, so pos always advances.
+            pos = name_match.end()
+
+            pieces = []
+            while True:
+                piece, pos = self._read_value(fields_text, pos)
+                if piece is None:
+                    break
+                pieces.append(piece)
+                concat = self._CONCAT.match(fields_text, pos)
+                if concat is None:
+                    break
+                pos = concat.end()
+
+            if pieces:
+                fields[name_match.group(1).lower()] = ''.join(pieces).strip()
+        return fields
+
+    def parse_bibtex_file(self, filepath: str) -> List[Dict]:
+        """
+        Parse BibTeX file and extract entries.
+
+        Each entry runs from its ``@type{key,`` header to the matching
+        closing brace, so nested braces and single-line entries are
+        handled. ``@comment``, ``@preamble`` and ``@string`` blocks are
+        skipped. An entry whose braces never balance is skipped with a
+        warning on stderr.
+
+        Args:
+            filepath: Path to BibTeX file
+
+        Returns:
+            List of entry dictionaries with the keys 'type' (lower-cased),
+            'key', 'fields' (dict of lower-cased name -> value) and 'raw'
+            (the entry's source text). Empty if the file cannot be read.
+        """
+        try:
+            with open(filepath, 'r', encoding='utf-8') as f:
+                content = f.read()
+        except (OSError, UnicodeError) as e:
+            print(f'Error reading file: {e}', file=sys.stderr)
+            return []
+
+        entries = []
+        pos = 0
+        while True:
+            header = self._ENTRY_HEADER.search(content, pos)
+            if header is None:
+                break
+
+            entry_type = header.group(1).lower()
+            citation_key = header.group(2).strip()
+            body_start = header.end()
+            body_end = self._find_closing_brace(content, body_start)
+
+            if body_end is None:
+                print(
+                    f'Warning: skipping entry "{citation_key}": unbalanced braces',
+                    file=sys.stderr
+                )
+                # Resume after this header so later entries are still found.
+                pos = body_start
+                continue
+
+            pos = body_end + 1
+            if entry_type in self.NON_ENTRY_TYPES:
+                continue
+
+            entries.append({
+                'type': entry_type,
+                'key': citation_key,
+                'fields': self._parse_fields(content[body_start:body_end]),
+                'raw': content[header.start():body_end + 1]
+            })
+
+        return entries
+
+    def validate_entry(self, entry: Dict) -> Tuple[List[Dict], List[Dict]]:
+        """
+        Validate a single BibTeX entry.
+
+        Checks required and recommended fields for the entry type, the
+        year and DOI formats, the page-range separator and the author
+        separator.
+
+        Args:
+            entry: Entry dictionary as produced by parse_bibtex_file
+
+        Returns:
+            Tuple of (errors, warnings); each item is a dictionary with at
+            least 'type', 'field', 'severity' and 'message'
+        """
+        errors = []
+        warnings = []
+
+        entry_type = entry['type']
+        key = entry['key']
+        fields = entry['fields']
+
+        # Check required fields (missing or empty both count as missing)
+        for req_field in self.required_fields.get(entry_type, []):
+            if not fields.get(req_field):
+                errors.append({
+                    'type': 'missing_required_field',
+                    'field': req_field,
+                    'severity': 'high',
+                    'message': f'Entry {key}: Missing required field "{req_field}"'
+                })
+
+        # A book needs an author OR an editor. This cannot be expressed in
+        # the required-fields table, so it is checked explicitly.
+        if entry_type == 'book' and not (fields.get('author') or fields.get('editor')):
+            errors.append({
+                'type': 'missing_required_field',
+                'field': 'author or editor',
+                'severity': 'high',
+                'message': f'Entry {key}: Missing required field "author" or "editor"'
+            })
+
+        # Check recommended fields
+        for rec_field in self.recommended_fields.get(entry_type, []):
+            if not fields.get(rec_field):
+                warnings.append({
+                    'type': 'missing_recommended_field',
+                    'field': rec_field,
+                    'severity': 'medium',
+                    'message': f'Entry {key}: Missing recommended field "{rec_field}"'
+                })
+
+        # Validate year
+        if 'year' in fields:
+            year = fields['year']
+            if not re.match(r'^\d{4}$', year):
+                errors.append({
+                    'type': 'invalid_year',
+                    'field': 'year',
+                    'value': year,
+                    'severity': 'high',
+                    'message': f'Entry {key}: Invalid year format "{year}" (should be 4 digits)'
+                })
+            elif not self.MIN_YEAR <= int(year) <= self.MAX_YEAR:
+                warnings.append({
+                    'type': 'suspicious_year',
+                    'field': 'year',
+                    'value': year,
+                    'severity': 'medium',
+                    'message': f'Entry {key}: Suspicious year "{year}" (outside reasonable range)'
+                })
+
+        # Validate DOI format
+        if 'doi' in fields:
+            doi = fields['doi']
+            if not re.match(r'^10\.\d{4,}/[^\s]+$', doi):
+                warnings.append({
+                    'type': 'invalid_doi_format',
+                    'field': 'doi',
+                    'value': doi,
+                    'severity': 'medium',
+                    'message': f'Entry {key}: Invalid DOI format "{doi}"'
+                })
+
+        # Check for single hyphen in pages (should be --)
+        if 'pages' in fields:
+            pages = fields['pages']
+            if re.search(r'\d-\d', pages) and '--' not in pages:
+                warnings.append({
+                    'type': 'page_range_format',
+                    'field': 'pages',
+                    'value': pages,
+                    'severity': 'low',
+                    'message': f'Entry {key}: Page range uses single hyphen, should use -- (en-dash)'
+                })
+
+        # Check author format
+        if 'author' in fields:
+            author = fields['author']
+            if ';' in author or '&' in author:
+                errors.append({
+                    'type': 'invalid_author_format',
+                    'field': 'author',
+                    'severity': 'high',
+                    'message': f'Entry {key}: Authors should be separated by " and ", not ";" or "&"'
+                })
+
+        return errors, warnings
+
+    def verify_doi(self, doi: str) -> Tuple[bool, Optional[Dict]]:
+        """
+        Verify DOI resolves correctly and get metadata.
+
+        Args:
+            doi: Digital Object Identifier
+
+        Returns:
+            Tuple of (is_valid, metadata). is_valid is False when the
+            doi.org request fails or answers with a status >= 400.
+            metadata is None when the DOI resolves but CrossRef returns no
+            usable record; otherwise it holds 'title', 'year' and 'authors'.
+        """
+        try:
+            response = self.session.head(
+                f'https://doi.org/{doi}', timeout=10, allow_redirects=True
+            )
+        except requests.RequestException:
+            return False, None
+
+        if response.status_code >= 400:
+            return False, None
+
+        # The DOI resolves. From here on a failure only means "no
+        # metadata"; it must not mark the DOI itself as invalid.
+        try:
+            metadata_response = self.session.get(
+                f'https://api.crossref.org/works/{doi}', timeout=10
+            )
+            if metadata_response.status_code != 200:
+                return True, None
+            data = metadata_response.json()
+        except (requests.RequestException, ValueError):
+            return True, None
+
+        message = data.get('message') if isinstance(data, dict) else None
+        if not isinstance(message, dict):
+            return True, None
+
+        # CrossRef may return an empty title list; fall back to ''.
+        titles = message.get('title') or ['']
+        metadata = {
+            'title': titles[0],
+            'year': self._extract_year_crossref(message),
+            'authors': self._format_authors_crossref(message.get('author') or []),
+        }
+        return True, metadata
+
+    def detect_duplicates(self, entries: List[Dict]) -> List[Dict]:
+        """
+        Detect duplicate entries.
+
+        Reports DOIs shared by several entries (compared
+        case-insensitively), citation keys that occur more than once, and
+        entries whose titles are identical after lower-casing and removing
+        punctuation and extra whitespace.
+
+        Args:
+            entries: List of entry dictionaries
+
+        Returns:
+            List of duplicate groups
+        """
+        duplicates = []
+
+        # Check for duplicate DOIs. DOIs are case-insensitive, so group on
+        # the lower-cased form but report the first spelling seen.
+        doi_groups = {}
+        for entry in entries:
+            doi = entry['fields'].get('doi', '').strip()
+            if doi:
+                doi_groups.setdefault(doi.lower(), (doi, []))[1].append(entry['key'])
+
+        for doi, keys in doi_groups.values():
+            if len(keys) > 1:
+                duplicates.append({
+                    'type': 'duplicate_doi',
+                    'doi': doi,
+                    'entries': keys,
+                    'severity': 'high',
+                    'message': f'Duplicate DOI {doi} found in entries: {", ".join(keys)}'
+                })
+
+        # Check for duplicate citation keys
+        key_counts = defaultdict(int)
+        for entry in entries:
+            key_counts[entry['key']] += 1
+
+        for key, count in key_counts.items():
+            if count > 1:
+                duplicates.append({
+                    'type': 'duplicate_key',
+                    'key': key,
+                    'count': count,
+                    'severity': 'high',
+                    'message': f'Citation key "{key}" appears {count} times'
+                })
+
+        # Check for similar titles (possible duplicates). Each later entry
+        # is compared with the first entry that carried the same title.
+        first_key_by_title = {}
+        for entry in entries:
+            title = entry['fields'].get('title', '').lower()
+            title = re.sub(r'[^\w\s]', '', title)  # Remove punctuation
+            title = ' '.join(title.split())  # Normalize whitespace
+            if not title:
+                continue
+
+            first_key = first_key_by_title.setdefault(title, entry['key'])
+            if first_key_by_title[title] is not entry['key'] or first_key != entry['key']:
+                duplicates.append({
+                    'type': 'similar_title',
+                    'entries': [first_key, entry['key']],
+                    'severity': 'medium',
+                    'message': f'Possible duplicate: "{first_key}" and "{entry["key"]}" have identical titles'
+                })
+
+        return duplicates
+
+    def validate_file(self, filepath: str, check_dois: bool = False) -> Dict:
+        """
+        Validate entire BibTeX file.
+
+        Progress messages are written to stderr.
+
+        Args:
+            filepath: Path to BibTeX file
+            check_dois: Whether to verify DOIs (slow)
+
+        Returns:
+            Validation report dictionary with the keys 'filepath',
+            'total_entries', 'valid_entries' (entries without any
+            high-severity error), 'errors', 'warnings' and 'duplicates'.
+            The same keys are present when no entries were found.
+        """
+        print(f'Parsing {filepath}...', file=sys.stderr)
+        entries = self.parse_bibtex_file(filepath)
+
+        if not entries:
+            return {
+                'filepath': filepath,
+                'total_entries': 0,
+                'valid_entries': 0,
+                'errors': [],
+                'warnings': [],
+                'duplicates': []
+            }
+
+        print(f'Found {len(entries)} entries', file=sys.stderr)
+
+        all_errors = []
+        all_warnings = []
+        # Positions of entries with a high-severity error. Positions are
+        # used instead of keys because keys may be duplicated.
+        invalid_positions = set()
+
+        # Validate each entry
+        for i, entry in enumerate(entries):
+            print(f'Validating entry {i+1}/{len(entries)}: {entry["key"]}', file=sys.stderr)
+            errors, warnings = self.validate_entry(entry)
+
+            for error in errors:
+                error['entry'] = entry['key']
+                all_errors.append(error)
+                if error['severity'] == 'high':
+                    invalid_positions.add(i)
+
+            for warning in warnings:
+                warning['entry'] = entry['key']
+                all_warnings.append(warning)
+
+        # Check for duplicates
+        print('Checking for duplicates...', file=sys.stderr)
+        duplicates = self.detect_duplicates(entries)
+
+        # Verify DOIs if requested
+        if check_dois:
+            print('Verifying DOIs...', file=sys.stderr)
+            for i, entry in enumerate(entries):
+                doi = entry['fields'].get('doi', '')
+                if not doi:
+                    continue
+
+                print(f'Verifying DOI {i+1}: {doi}', file=sys.stderr)
+                is_valid, _ = self.verify_doi(doi)
+                if not is_valid:
+                    all_errors.append({
+                        'type': 'invalid_doi',
+                        'entry': entry['key'],
+                        'doi': doi,
+                        'severity': 'high',
+                        'message': f'Entry {entry["key"]}: DOI does not resolve: {doi}'
+                    })
+                    invalid_positions.add(i)
+
+        return {
+            'filepath': filepath,
+            'total_entries': len(entries),
+            # Count entries, not errors: one entry may have several errors.
+            'valid_entries': len(entries) - len(invalid_positions),
+            'errors': all_errors,
+            'warnings': all_warnings,
+            'duplicates': duplicates
+        }
+
+    def _extract_year_crossref(self, message: Dict) -> str:
+        """Extract year from CrossRef message.
+
+        Tries 'published-print', then 'published-online', then 'issued'.
+        Returns '' when none of them carries a year.
+        """
+        for date_field in ('published-print', 'published-online', 'issued'):
+            date_parts = (message.get(date_field) or {}).get('date-parts') or [[]]
+            # date-parts is a list of [year, month, day] lists; the year
+            # itself can be null when the date is unknown.
+            if date_parts[0] and date_parts[0][0] is not None:
+                return str(date_parts[0][0])
+        return ''
+
+    def _format_authors_crossref(self, authors: List[Dict]) -> str:
+        """Format author list from CrossRef.
+
+        Uses at most the first three authors as "Family, Given", skips
+        authors without a family name, and appends "et al." when the
+        input holds more than three authors.
+        """
+        if not authors:
+            return ''
+
+        formatted = []
+        for author in authors[:3]:  # First 3 authors
+            given = author.get('given', '')
+            family = author.get('family', '')
+            if family:
+                formatted.append(f'{family}, {given}' if given else family)
+
+        if len(authors) > 3:
+            formatted.append('et al.')
+
+        return ', '.join(formatted)
+
+
+def main():
+    """Command-line interface for validating a BibTeX file.
+
+    Parses the command-line arguments, runs CitationValidator.validate_file
+    on the given file and prints a summary to stdout: errors and
+    duplicates always, warnings only with --verbose. With --report the
+    full report is also written to that path as JSON.
+
+    Exits with status 1 when the report contains at least one error;
+    warnings and duplicates alone do not change the exit status.
+    """
+    parser = argparse.ArgumentParser(
+        description='Validate BibTeX files for errors and inconsistencies',
+        epilog='Example: python validate_citations.py references.bib'
+    )
+
+    parser.add_argument(
+        'file',
+        help='BibTeX file to validate'
+    )
+
+    parser.add_argument(
+        '--check-dois',
+        action='store_true',
+        help='Verify DOIs resolve correctly (slow)'
+    )
+
+    parser.add_argument(
+        '--auto-fix',
+        action='store_true',
+        help='Attempt to auto-fix common issues (not implemented yet)'
+    )
+
+    parser.add_argument(
+        '--report',
+        help='Output file for JSON validation report'
+    )
+
+    parser.add_argument(
+        '--verbose',
+        action='store_true',
+        help='Show detailed output'
+    )
+
+    args = parser.parse_args()
+
+    # The flag is accepted for forward compatibility; say so instead of
+    # silently ignoring it.
+    if args.auto_fix:
+        print(
+            'Warning: --auto-fix is not implemented yet; no changes will be made.',
+            file=sys.stderr
+        )
+
+    # Validate file
+    validator = CitationValidator()
+    report = validator.validate_file(args.file, check_dois=args.check_dois)
+
+    heavy_rule = '=' * 60
+    light_rule = '-' * 60
+
+    # Print summary. The report for a file without parseable entries may
+    # lack 'valid_entries', so read it with a default.
+    print('\n' + heavy_rule)
+    print('CITATION VALIDATION REPORT')
+    print(heavy_rule)
+    print(f'\nFile: {args.file}')
+    print(f'Total entries: {report["total_entries"]}')
+    print(f'Valid entries: {report.get("valid_entries", 0)}')
+    print(f'Errors: {len(report["errors"])}')
+    print(f'Warnings: {len(report["warnings"])}')
+    print(f'Duplicates: {len(report["duplicates"])}')
+
+    # Print errors
+    if report['errors']:
+        print('\n' + light_rule)
+        print('ERRORS (must fix):')
+        print(light_rule)
+        for error in report['errors']:
+            print(f'\n{error["message"]}')
+            if args.verbose:
+                print(f'  Type: {error["type"]}')
+                print(f'  Severity: {error["severity"]}')
+
+    # Print warnings (verbose mode only; the summary always shows the count)
+    if report['warnings'] and args.verbose:
+        print('\n' + light_rule)
+        print('WARNINGS (should fix):')
+        print(light_rule)
+        for warning in report['warnings']:
+            print(f'\n{warning["message"]}')
+
+    # Print duplicates
+    if report['duplicates']:
+        print('\n' + light_rule)
+        print('DUPLICATES:')
+        print(light_rule)
+        for dup in report['duplicates']:
+            print(f'\n{dup["message"]}')
+
+    # Save report
+    if args.report:
+        with open(args.report, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
+        print(f'\nDetailed report saved to: {args.report}')
+
+    # Exit with error code if there are errors
+    if report['errors']:
+        sys.exit(1)
+
+
+# Run the command-line interface only when this file is executed as a
+# script, not when it is imported as a module.
+if __name__ == '__main__':
+    main()
+