#!/usr/bin/env python3
"""
Organize PDFs and metadata from various sources (BibTeX, RIS, directory, DOI list).
Standardizes file naming and creates a unified metadata JSON for downstream processing.
"""

import argparse
import json
import re
import shutil
from pathlib import Path
from typing import Dict, List, Optional

try:
    from pybtex.database.input import bibtex
    BIBTEX_AVAILABLE = True
except ImportError:
    BIBTEX_AVAILABLE = False
    print("Warning: pybtex not installed. BibTeX support disabled.")

try:
    import rispy
    RIS_AVAILABLE = True
except ImportError:
    RIS_AVAILABLE = False
    print("Warning: rispy not installed. RIS support disabled.")


def parse_args():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description='Organize PDFs and metadata from various sources'
    )
    parser.add_argument(
        '--source-type',
        choices=['bibtex', 'ris', 'directory', 'doi_list'],
        required=True,
        help='Type of source data'
    )
    parser.add_argument(
        '--source',
        required=True,
        help='Path to source file (BibTeX/RIS file, directory, or DOI list)'
    )
    parser.add_argument(
        '--pdf-dir',
        help='Directory containing PDFs (for bibtex/ris with relative paths)'
    )
    parser.add_argument(
        '--output',
        default='metadata.json',
        help='Output metadata JSON file'
    )
    parser.add_argument(
        '--organize-pdfs',
        action='store_true',
        help='Copy PDFs to standardized directory structure'
    )
    parser.add_argument(
        '--pdf-output-dir',
        default='organized_pdfs',
        help='Directory for organized PDFs'
    )
    return parser.parse_args()


def load_bibtex_metadata(bib_path: Path, pdf_base_dir: Optional[Path] = None) -> List[Dict]:
    """Load metadata from a BibTeX file."""
    if not BIBTEX_AVAILABLE:
        raise ImportError("pybtex is required for BibTeX support. Install with: pip install pybtex")

    parser = bibtex.Parser()
    bib_data = parser.parse_file(str(bib_path))

    metadata = []
    for key, entry in bib_data.entries.items():
        record = {
            'id': key,
            'type': entry.type,
            'title': entry.fields.get('title', ''),
            'year': entry.fields.get('year', ''),
            'doi': entry.fields.get('doi', ''),
            'abstract': entry.fields.get('abstract', ''),
            'journal': entry.fields.get('journal', ''),
            'authors': ', '.join(
                ' '.join(person.last_names + person.first_names)
                for person in entry.persons.get('author', [])
            ),
            'keywords': entry.fields.get('keywords', ''),
            'pdf_path': None
        }

        # Extract the PDF path from the 'file' field
        # ('description:path:mimetype' entries separated by ';')
        if 'file' in entry.fields:
            file_field = entry.fields['file']
            if file_field.startswith('{') and file_field.endswith('}'):
                file_field = file_field[1:-1]
            for file_entry in file_field.split(';'):
                parts = file_entry.strip().split(':')
                if len(parts) >= 3 and parts[2].lower() == 'application/pdf':
                    pdf_path = parts[1].strip()
                    if pdf_base_dir:
                        pdf_path = str(pdf_base_dir / pdf_path)
                    record['pdf_path'] = pdf_path
                    break

        metadata.append(record)

    print(f"Loaded {len(metadata)} entries from BibTeX file")
    return metadata
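
# The 'file' field parsing above assumes the JabRef/Zotero-style
# "description:path:mimetype" convention; paths that themselves contain ':'
# (e.g. Windows drive letters) will not split cleanly. A minimal usage sketch
# (file and directory names are hypothetical):
#
#     records = load_bibtex_metadata(Path('library.bib'), Path('pdfs'))
#     print(records[0]['title'], records[0]['pdf_path'])
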
def load_ris_metadata(ris_path: Path, pdf_base_dir: Optional[Path] = None) -> List[Dict]:
    """Load metadata from an RIS file."""
    if not RIS_AVAILABLE:
        raise ImportError("rispy is required for RIS support. Install with: pip install rispy")

    with open(ris_path, 'r', encoding='utf-8') as f:
        entries = rispy.load(f)

    metadata = []
    for i, entry in enumerate(entries):
        # Generate an ID from the first author and year, plus the index;
        # guard against a missing or empty author list
        authors = entry.get('authors') or []
        first_author = authors[0] if authors else 'Unknown'
        year = entry.get('year', 'NoYear')
        entry_id = f"{first_author.split()[-1]}{year}_{i}"

        record = {
            'id': entry_id,
            'type': entry.get('type_of_reference', 'article'),
            'title': entry.get('title', ''),
            'year': str(entry.get('year', '')),
            'doi': entry.get('doi', ''),
            'abstract': entry.get('abstract', ''),
            'journal': entry.get('journal_name', ''),
            'authors': '; '.join(authors),
            'keywords': '; '.join(entry.get('keywords', [])),
            'pdf_path': None
        }

        # Try to find the PDF in standard locations
        if pdf_base_dir:
            # Common patterns: FirstAuthorYear.pdf, doi_cleaned.pdf, etc.
            pdf_candidates = [
                f"{entry_id}.pdf",
                f"{first_author.split()[-1]}_{year}.pdf"
            ]
            if record['doi']:
                safe_doi = re.sub(r'[^\w\-_]', '_', record['doi'])
                pdf_candidates.append(f"{safe_doi}.pdf")

            for candidate in pdf_candidates:
                pdf_path = pdf_base_dir / candidate
                if pdf_path.exists():
                    record['pdf_path'] = str(pdf_path)
                    break

        metadata.append(record)

    print(f"Loaded {len(metadata)} entries from RIS file")
    return metadata


def load_directory_metadata(dir_path: Path) -> List[Dict]:
    """Load metadata by scanning a directory for PDFs."""
    pdf_files = list(dir_path.glob('**/*.pdf'))

    metadata = []
    for pdf_path in pdf_files:
        # Generate an ID from the filename
        entry_id = pdf_path.stem
        record = {
            'id': entry_id,
            'type': 'article',
            'title': entry_id.replace('_', ' '),
            'year': '',
            'doi': '',
            'abstract': '',
            'journal': '',
            'authors': '',
            'keywords': '',
            'pdf_path': str(pdf_path)
        }

        # Try to extract a DOI from the filename if present (only matches when
        # a literal '/' survived in the name, which most filesystems disallow)
        doi_match = re.search(r'10\.\d{4,}/[^\s]+', entry_id)
        if doi_match:
            record['doi'] = doi_match.group(0)

        metadata.append(record)

    print(f"Found {len(metadata)} PDFs in directory")
    return metadata


def load_doi_list_metadata(doi_list_path: Path) -> List[Dict]:
    """Load metadata from a list of DOIs (full metadata must be fetched separately)."""
    with open(doi_list_path, 'r') as f:
        dois = [line.strip() for line in f if line.strip()]

    metadata = []
    for doi in dois:
        safe_doi = re.sub(r'[^\w\-_]', '_', doi)
        record = {
            'id': safe_doi,
            'type': 'article',
            'title': '',
            'year': '',
            'doi': doi,
            'abstract': '',
            'journal': '',
            'authors': '',
            'keywords': '',
            'pdf_path': None
        }
        metadata.append(record)

    print(f"Loaded {len(metadata)} DOIs")
    print("Note: You'll need to fetch full metadata and PDFs separately")
    return metadata


def organize_pdfs(metadata: List[Dict], output_dir: Path) -> List[Dict]:
    """Copy and rename PDFs into a standardized directory structure."""
    output_dir.mkdir(parents=True, exist_ok=True)

    organized_metadata = []
    stats = {'copied': 0, 'missing': 0, 'total': len(metadata)}

    for record in metadata:
        if record['pdf_path'] and Path(record['pdf_path']).exists():
            source_path = Path(record['pdf_path'])
            dest_path = output_dir / f"{record['id']}.pdf"
            try:
                shutil.copy2(source_path, dest_path)
                record['pdf_path'] = str(dest_path)
                stats['copied'] += 1
            except Exception as e:
                print(f"Error copying {source_path}: {e}")
                stats['missing'] += 1
        else:
            if record['pdf_path']:
                print(f"PDF not found: {record['pdf_path']}")
            stats['missing'] += 1

        organized_metadata.append(record)

    print("\nPDF Organization Summary:")
    print(f"  Total entries: {stats['total']}")
    print(f"  PDFs copied: {stats['copied']}")
    print(f"  PDFs missing: {stats['missing']}")

    return organized_metadata
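
# Every loader above emits records with the same flat schema, so downstream
# code can treat all source types uniformly. An illustrative entry from the
# output JSON (values invented for the example):
#
#     {
#       "id": "Smith2020",
#       "type": "article",
#       "title": "An Example Title",
#       "year": "2020",
#       "doi": "10.1234/example",
#       "abstract": "",
#       "journal": "Journal of Examples",
#       "authors": "Jane Smith; John Doe",
#       "keywords": "pdf; metadata",
#       "pdf_path": "organized_pdfs/Smith2020.pdf"
#     }
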
def save_metadata(metadata: List[Dict], output_path: Path):
    """Save metadata to a JSON file."""
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)
    print(f"\nMetadata saved to: {output_path}")


def main():
    args = parse_args()

    source_path = Path(args.source)
    pdf_base_dir = Path(args.pdf_dir) if args.pdf_dir else None
    output_path = Path(args.output)

    # Load metadata based on source type
    if args.source_type == 'bibtex':
        metadata = load_bibtex_metadata(source_path, pdf_base_dir)
    elif args.source_type == 'ris':
        metadata = load_ris_metadata(source_path, pdf_base_dir)
    elif args.source_type == 'directory':
        metadata = load_directory_metadata(source_path)
    elif args.source_type == 'doi_list':
        metadata = load_doi_list_metadata(source_path)
    else:
        raise ValueError(f"Unknown source type: {args.source_type}")

    # Organize PDFs if requested
    if args.organize_pdfs:
        pdf_output_dir = Path(args.pdf_output_dir)
        metadata = organize_pdfs(metadata, pdf_output_dir)

    # Save metadata
    save_metadata(metadata, output_path)

    # Print summary statistics
    total = len(metadata)
    with_pdfs = sum(1 for r in metadata if r['pdf_path'])
    with_abstracts = sum(1 for r in metadata if r['abstract'])
    with_dois = sum(1 for r in metadata if r['doi'])

    print("\nMetadata Summary:")
    print(f"  Total entries: {total}")
    print(f"  With PDFs: {with_pdfs}")
    print(f"  With abstracts: {with_abstracts}")
    print(f"  With DOIs: {with_dois}")


if __name__ == '__main__':
    main()
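
# Example invocations (script and file names are hypothetical):
#
#   python organize_sources.py --source-type bibtex --source library.bib \
#       --pdf-dir ./pdfs --organize-pdfs --output metadata.json
#
#   python organize_sources.py --source-type doi_list --source dois.txt
#
# For the doi_list source, full metadata still has to be fetched separately,
# e.g. from the public Crossref REST API (https://api.crossref.org/works/<doi>).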