# --- Source listing residue (preserved as comments so the file stays valid Python) ---
# Files
# gh-brunoasm-my-claude-skill…/skills/extract_from_pdfs/scripts/01_organize_metadata.py
# 2025-11-29 18:02:40 +08:00
# 311 lines, 9.9 KiB, Python
#!/usr/bin/env python3
"""
Organize PDFs and metadata from various sources (BibTeX, RIS, directory, DOI list).
Standardizes file naming and creates a unified metadata JSON for downstream processing.
"""
import argparse
import json
import shutil
from pathlib import Path
from typing import Dict, List, Optional
import re
try:
from pybtex.database.input import bibtex
BIBTEX_AVAILABLE = True
except ImportError:
BIBTEX_AVAILABLE = False
print("Warning: pybtex not installed. BibTeX support disabled.")
try:
import rispy
RIS_AVAILABLE = True
except ImportError:
RIS_AVAILABLE = False
print("Warning: rispy not installed. RIS support disabled.")
def parse_args():
"""Parse command line arguments"""
parser = argparse.ArgumentParser(
description='Organize PDFs and metadata from various sources'
)
parser.add_argument(
'--source-type',
choices=['bibtex', 'ris', 'directory', 'doi_list'],
required=True,
help='Type of source data'
)
parser.add_argument(
'--source',
required=True,
help='Path to source file (BibTeX/RIS file, directory, or DOI list)'
)
parser.add_argument(
'--pdf-dir',
help='Directory containing PDFs (for bibtex/ris with relative paths)'
)
parser.add_argument(
'--output',
default='metadata.json',
help='Output metadata JSON file'
)
parser.add_argument(
'--organize-pdfs',
action='store_true',
help='Copy PDFs to standardized directory structure'
)
parser.add_argument(
'--pdf-output-dir',
default='organized_pdfs',
help='Directory for organized PDFs'
)
return parser.parse_args()
def load_bibtex_metadata(bib_path: Path, pdf_base_dir: Optional[Path] = None) -> List[Dict]:
"""Load metadata from BibTeX file"""
if not BIBTEX_AVAILABLE:
raise ImportError("pybtex is required for BibTeX support. Install with: pip install pybtex")
parser = bibtex.Parser()
bib_data = parser.parse_file(str(bib_path))
metadata = []
for key, entry in bib_data.entries.items():
record = {
'id': key,
'type': entry.type,
'title': entry.fields.get('title', ''),
'year': entry.fields.get('year', ''),
'doi': entry.fields.get('doi', ''),
'abstract': entry.fields.get('abstract', ''),
'journal': entry.fields.get('journal', ''),
'authors': ', '.join(
[' '.join([p for p in person.last_names + person.first_names])
for person in entry.persons.get('author', [])]
),
'keywords': entry.fields.get('keywords', ''),
'pdf_path': None
}
# Extract PDF path from file field
if 'file' in entry.fields:
file_field = entry.fields['file']
if file_field.startswith('{') and file_field.endswith('}'):
file_field = file_field[1:-1]
for file_entry in file_field.split(';'):
parts = file_entry.strip().split(':')
if len(parts) >= 3 and parts[2].lower() == 'application/pdf':
pdf_path = parts[1].strip()
if pdf_base_dir:
pdf_path = str(pdf_base_dir / pdf_path)
record['pdf_path'] = pdf_path
break
metadata.append(record)
print(f"Loaded {len(metadata)} entries from BibTeX file")
return metadata
def load_ris_metadata(ris_path: Path, pdf_base_dir: Optional[Path] = None) -> List[Dict]:
"""Load metadata from RIS file"""
if not RIS_AVAILABLE:
raise ImportError("rispy is required for RIS support. Install with: pip install rispy")
with open(ris_path, 'r', encoding='utf-8') as f:
entries = rispy.load(f)
metadata = []
for i, entry in enumerate(entries):
# Generate ID from first author and year or use index
first_author = entry.get('authors', [None])[0] or 'Unknown'
year = entry.get('year', 'NoYear')
entry_id = f"{first_author.split()[-1]}{year}_{i}"
record = {
'id': entry_id,
'type': entry.get('type_of_reference', 'article'),
'title': entry.get('title', ''),
'year': str(entry.get('year', '')),
'doi': entry.get('doi', ''),
'abstract': entry.get('abstract', ''),
'journal': entry.get('journal_name', ''),
'authors': '; '.join(entry.get('authors', [])),
'keywords': '; '.join(entry.get('keywords', [])),
'pdf_path': None
}
# Try to find PDF in standard locations
if pdf_base_dir:
# Common patterns: FirstAuthorYear.pdf, doi_cleaned.pdf, etc.
pdf_candidates = [
f"{entry_id}.pdf",
f"{first_author.split()[-1]}_{year}.pdf"
]
if record['doi']:
safe_doi = re.sub(r'[^\w\-_]', '_', record['doi'])
pdf_candidates.append(f"{safe_doi}.pdf")
for candidate in pdf_candidates:
pdf_path = pdf_base_dir / candidate
if pdf_path.exists():
record['pdf_path'] = str(pdf_path)
break
metadata.append(record)
print(f"Loaded {len(metadata)} entries from RIS file")
return metadata
def load_directory_metadata(dir_path: Path) -> List[Dict]:
"""Load metadata by scanning directory for PDFs"""
pdf_files = list(dir_path.glob('**/*.pdf'))
metadata = []
for pdf_path in pdf_files:
# Generate ID from filename
entry_id = pdf_path.stem
record = {
'id': entry_id,
'type': 'article',
'title': entry_id.replace('_', ' '),
'year': '',
'doi': '',
'abstract': '',
'journal': '',
'authors': '',
'keywords': '',
'pdf_path': str(pdf_path)
}
# Try to extract DOI from filename if present
doi_match = re.search(r'10\.\d{4,}/[^\s]+', entry_id)
if doi_match:
record['doi'] = doi_match.group(0)
metadata.append(record)
print(f"Found {len(metadata)} PDFs in directory")
return metadata
def load_doi_list_metadata(doi_list_path: Path) -> List[Dict]:
"""Load metadata from a list of DOIs (will need to fetch metadata separately)"""
with open(doi_list_path, 'r') as f:
dois = [line.strip() for line in f if line.strip()]
metadata = []
for doi in dois:
safe_doi = re.sub(r'[^\w\-_]', '_', doi)
record = {
'id': safe_doi,
'type': 'article',
'title': '',
'year': '',
'doi': doi,
'abstract': '',
'journal': '',
'authors': '',
'keywords': '',
'pdf_path': None
}
metadata.append(record)
print(f"Loaded {len(metadata)} DOIs")
print("Note: You'll need to fetch full metadata and PDFs separately")
return metadata
def organize_pdfs(metadata: List[Dict], output_dir: Path) -> List[Dict]:
"""Copy and rename PDFs to standardized directory structure"""
output_dir.mkdir(parents=True, exist_ok=True)
organized_metadata = []
stats = {'copied': 0, 'missing': 0, 'total': len(metadata)}
for record in metadata:
if record['pdf_path'] and Path(record['pdf_path']).exists():
source_path = Path(record['pdf_path'])
dest_path = output_dir / f"{record['id']}.pdf"
try:
shutil.copy2(source_path, dest_path)
record['pdf_path'] = str(dest_path)
stats['copied'] += 1
except Exception as e:
print(f"Error copying {source_path}: {e}")
stats['missing'] += 1
else:
if record['pdf_path']:
print(f"PDF not found: {record['pdf_path']}")
stats['missing'] += 1
organized_metadata.append(record)
print(f"\nPDF Organization Summary:")
print(f" Total entries: {stats['total']}")
print(f" PDFs copied: {stats['copied']}")
print(f" PDFs missing: {stats['missing']}")
return organized_metadata
def save_metadata(metadata: List[Dict], output_path: Path):
"""Save metadata to JSON file"""
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(metadata, f, indent=2, ensure_ascii=False)
print(f"\nMetadata saved to: {output_path}")
def main():
args = parse_args()
source_path = Path(args.source)
pdf_base_dir = Path(args.pdf_dir) if args.pdf_dir else None
output_path = Path(args.output)
# Load metadata based on source type
if args.source_type == 'bibtex':
metadata = load_bibtex_metadata(source_path, pdf_base_dir)
elif args.source_type == 'ris':
metadata = load_ris_metadata(source_path, pdf_base_dir)
elif args.source_type == 'directory':
metadata = load_directory_metadata(source_path)
elif args.source_type == 'doi_list':
metadata = load_doi_list_metadata(source_path)
else:
raise ValueError(f"Unknown source type: {args.source_type}")
# Organize PDFs if requested
if args.organize_pdfs:
pdf_output_dir = Path(args.pdf_output_dir)
metadata = organize_pdfs(metadata, pdf_output_dir)
# Save metadata
save_metadata(metadata, output_path)
# Print summary statistics
total = len(metadata)
with_pdfs = sum(1 for r in metadata if r['pdf_path'])
with_abstracts = sum(1 for r in metadata if r['abstract'])
with_dois = sum(1 for r in metadata if r['doi'])
print(f"\nMetadata Summary:")
print(f" Total entries: {total}")
print(f" With PDFs: {with_pdfs}")
print(f" With abstracts: {with_abstracts}")
print(f" With DOIs: {with_dois}")
if __name__ == '__main__':
main()