284 lines
8.7 KiB
Python
Executable File
284 lines
8.7 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Convert scientific literature PDFs to Markdown for analysis and review.
|
|
|
|
This script is specifically designed for converting academic papers,
|
|
organizing them, and preparing them for literature review workflows.
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import List, Dict, Optional
|
|
from markitdown import MarkItDown
|
|
from datetime import datetime
|
|
|
|
|
|
def extract_metadata_from_filename(filename: str) -> Dict[str, str]:
|
|
"""
|
|
Try to extract metadata from filename.
|
|
Supports patterns like: Author_Year_Title.pdf
|
|
"""
|
|
metadata = {}
|
|
|
|
# Remove extension
|
|
name = Path(filename).stem
|
|
|
|
# Try to extract year
|
|
year_match = re.search(r'\b(19|20)\d{2}\b', name)
|
|
if year_match:
|
|
metadata['year'] = year_match.group()
|
|
|
|
# Split by underscores or dashes
|
|
parts = re.split(r'[_\-]', name)
|
|
if len(parts) >= 2:
|
|
metadata['author'] = parts[0].replace('_', ' ')
|
|
metadata['title'] = ' '.join(parts[1:]).replace('_', ' ')
|
|
else:
|
|
metadata['title'] = name.replace('_', ' ')
|
|
|
|
return metadata
|
|
|
|
|
|
def convert_paper(
|
|
md: MarkItDown,
|
|
input_file: Path,
|
|
output_dir: Path,
|
|
organize_by_year: bool = False
|
|
) -> tuple[bool, Dict]:
|
|
"""
|
|
Convert a single paper to Markdown with metadata extraction.
|
|
|
|
Args:
|
|
md: MarkItDown instance
|
|
input_file: Path to PDF file
|
|
output_dir: Output directory
|
|
organize_by_year: Organize into year subdirectories
|
|
|
|
Returns:
|
|
Tuple of (success, metadata_dict)
|
|
"""
|
|
try:
|
|
print(f"Converting: {input_file.name}")
|
|
|
|
# Convert to Markdown
|
|
result = md.convert(str(input_file))
|
|
|
|
# Extract metadata from filename
|
|
metadata = extract_metadata_from_filename(input_file.name)
|
|
metadata['source_file'] = input_file.name
|
|
metadata['converted_date'] = datetime.now().isoformat()
|
|
|
|
# Try to extract title from content if not in filename
|
|
if 'title' not in metadata and result.title:
|
|
metadata['title'] = result.title
|
|
|
|
# Create output path
|
|
if organize_by_year and 'year' in metadata:
|
|
output_subdir = output_dir / metadata['year']
|
|
output_subdir.mkdir(parents=True, exist_ok=True)
|
|
else:
|
|
output_subdir = output_dir
|
|
output_subdir.mkdir(parents=True, exist_ok=True)
|
|
|
|
output_file = output_subdir / f"{input_file.stem}.md"
|
|
|
|
# Create formatted Markdown with front matter
|
|
content = "---\n"
|
|
content += f"title: \"{metadata.get('title', input_file.stem)}\"\n"
|
|
if 'author' in metadata:
|
|
content += f"author: \"{metadata['author']}\"\n"
|
|
if 'year' in metadata:
|
|
content += f"year: {metadata['year']}\n"
|
|
content += f"source: \"{metadata['source_file']}\"\n"
|
|
content += f"converted: \"{metadata['converted_date']}\"\n"
|
|
content += "---\n\n"
|
|
|
|
# Add title
|
|
content += f"# {metadata.get('title', input_file.stem)}\n\n"
|
|
|
|
# Add metadata section
|
|
content += "## Document Information\n\n"
|
|
if 'author' in metadata:
|
|
content += f"**Author**: {metadata['author']}\n"
|
|
if 'year' in metadata:
|
|
content += f"**Year**: {metadata['year']}\n"
|
|
content += f"**Source File**: {metadata['source_file']}\n"
|
|
content += f"**Converted**: {metadata['converted_date']}\n\n"
|
|
content += "---\n\n"
|
|
|
|
# Add content
|
|
content += result.text_content
|
|
|
|
# Write to file
|
|
output_file.write_text(content, encoding='utf-8')
|
|
|
|
print(f"✓ Saved to: {output_file}")
|
|
|
|
return True, metadata
|
|
|
|
except Exception as e:
|
|
print(f"✗ Error converting {input_file.name}: {str(e)}")
|
|
return False, {'source_file': input_file.name, 'error': str(e)}
|
|
|
|
|
|
def create_index(papers: List[Dict], output_dir: Path):
|
|
"""Create an index/catalog of all converted papers."""
|
|
|
|
# Sort by year (if available) and title
|
|
papers_sorted = sorted(
|
|
papers,
|
|
key=lambda x: (x.get('year', '9999'), x.get('title', ''))
|
|
)
|
|
|
|
# Create Markdown index
|
|
index_content = "# Literature Review Index\n\n"
|
|
index_content += f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
|
|
index_content += f"**Total Papers**: {len(papers)}\n\n"
|
|
index_content += "---\n\n"
|
|
|
|
# Group by year
|
|
by_year = {}
|
|
for paper in papers_sorted:
|
|
year = paper.get('year', 'Unknown')
|
|
if year not in by_year:
|
|
by_year[year] = []
|
|
by_year[year].append(paper)
|
|
|
|
# Write by year
|
|
for year in sorted(by_year.keys()):
|
|
index_content += f"## {year}\n\n"
|
|
for paper in by_year[year]:
|
|
title = paper.get('title', paper.get('source_file', 'Unknown'))
|
|
author = paper.get('author', 'Unknown Author')
|
|
source = paper.get('source_file', '')
|
|
|
|
# Create link to markdown file
|
|
md_file = Path(source).stem + ".md"
|
|
if 'year' in paper and paper['year'] != 'Unknown':
|
|
md_file = f"{paper['year']}/{md_file}"
|
|
|
|
index_content += f"- **{title}**\n"
|
|
index_content += f" - Author: {author}\n"
|
|
index_content += f" - Source: {source}\n"
|
|
index_content += f" - [Read Markdown]({md_file})\n\n"
|
|
|
|
# Write index
|
|
index_file = output_dir / "INDEX.md"
|
|
index_file.write_text(index_content, encoding='utf-8')
|
|
print(f"\n✓ Created index: {index_file}")
|
|
|
|
# Also create JSON catalog
|
|
catalog_file = output_dir / "catalog.json"
|
|
with open(catalog_file, 'w', encoding='utf-8') as f:
|
|
json.dump(papers_sorted, f, indent=2, ensure_ascii=False)
|
|
print(f"✓ Created catalog: {catalog_file}")
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description="Convert scientific literature PDFs to Markdown",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog="""
|
|
Examples:
|
|
# Convert all PDFs in a directory
|
|
python convert_literature.py papers/ output/
|
|
|
|
# Organize by year
|
|
python convert_literature.py papers/ output/ --organize-by-year
|
|
|
|
# Create index of all papers
|
|
python convert_literature.py papers/ output/ --create-index
|
|
|
|
Filename Conventions:
|
|
For best results, name your PDFs using this pattern:
|
|
Author_Year_Title.pdf
|
|
|
|
Examples:
|
|
Smith_2023_Machine_Learning_Applications.pdf
|
|
Jones_2022_Climate_Change_Analysis.pdf
|
|
"""
|
|
)
|
|
|
|
parser.add_argument('input_dir', type=Path, help='Directory with PDF files')
|
|
parser.add_argument('output_dir', type=Path, help='Output directory for Markdown files')
|
|
parser.add_argument(
|
|
'--organize-by-year', '-y',
|
|
action='store_true',
|
|
help='Organize output into year subdirectories'
|
|
)
|
|
parser.add_argument(
|
|
'--create-index', '-i',
|
|
action='store_true',
|
|
help='Create an index/catalog of all papers'
|
|
)
|
|
parser.add_argument(
|
|
'--recursive', '-r',
|
|
action='store_true',
|
|
help='Search subdirectories recursively'
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
# Validate input
|
|
if not args.input_dir.exists():
|
|
print(f"Error: Input directory '{args.input_dir}' does not exist")
|
|
sys.exit(1)
|
|
|
|
if not args.input_dir.is_dir():
|
|
print(f"Error: '{args.input_dir}' is not a directory")
|
|
sys.exit(1)
|
|
|
|
# Find PDF files
|
|
if args.recursive:
|
|
pdf_files = list(args.input_dir.rglob("*.pdf"))
|
|
else:
|
|
pdf_files = list(args.input_dir.glob("*.pdf"))
|
|
|
|
if not pdf_files:
|
|
print("No PDF files found")
|
|
sys.exit(1)
|
|
|
|
print(f"Found {len(pdf_files)} PDF file(s)")
|
|
|
|
# Create MarkItDown instance
|
|
md = MarkItDown()
|
|
|
|
# Convert all papers
|
|
results = []
|
|
success_count = 0
|
|
|
|
for pdf_file in pdf_files:
|
|
success, metadata = convert_paper(
|
|
md,
|
|
pdf_file,
|
|
args.output_dir,
|
|
args.organize_by_year
|
|
)
|
|
|
|
if success:
|
|
success_count += 1
|
|
results.append(metadata)
|
|
|
|
# Create index if requested
|
|
if args.create_index and results:
|
|
create_index(results, args.output_dir)
|
|
|
|
# Print summary
|
|
print("\n" + "="*50)
|
|
print("CONVERSION SUMMARY")
|
|
print("="*50)
|
|
print(f"Total papers: {len(pdf_files)}")
|
|
print(f"Successful: {success_count}")
|
|
print(f"Failed: {len(pdf_files) - success_count}")
|
|
print(f"Success rate: {success_count/len(pdf_files)*100:.1f}%")
|
|
|
|
sys.exit(0 if success_count == len(pdf_files) else 1)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|
|
|