gh-k-dense-ai-claude-scient…/skills/markitdown/scripts/convert_literature.py

#!/usr/bin/env python3
"""
Convert scientific literature PDFs to Markdown for analysis and review.

This script is specifically designed for converting academic papers,
organizing them, and preparing them for literature review workflows.
"""

import argparse
import json
import re
import sys
from pathlib import Path
from typing import List, Dict, Optional
from markitdown import MarkItDown
from datetime import datetime


def extract_metadata_from_filename(filename: str) -> Dict[str, str]:
    """
    Try to extract metadata from filename.
    Supports patterns like: Author_Year_Title.pdf

    Args:
        filename: File name or path; only the stem (final component
            without its last extension) is inspected.

    Returns:
        Dict with any of the keys 'year', 'author' and 'title' that could be
        derived. 'year' is the first standalone 4-digit number starting with
        19 or 20. When the stem contains '_' or '-' separators, 'author' is
        the first segment and 'title' is the remaining segments joined by
        spaces, with the year segment left out; 'title' is omitted if nothing
        remains. Without separators the whole stem becomes the 'title'.
    """
    metadata = {}

    # Remove extension
    name = Path(filename).stem

    # Digit lookarounds instead of \b: '_' is a word character, so \b never
    # matches between '_' and a digit and "Smith_2023_Title" would yield no
    # year at all.
    year_match = re.search(r'(?<!\d)(?:19|20)\d{2}(?!\d)', name)
    if year_match:
        metadata['year'] = year_match.group()

    # Split by underscores or dashes
    parts = re.split(r'[_\-]', name)
    if len(parts) >= 2:
        metadata['author'] = parts[0]

        # Drop empty segments produced by doubled separators ("a__b").
        title_parts = [part for part in parts[1:] if part]

        # The year is reported under its own key; keep it out of the title.
        # Only the first matching segment is removed, so a title that
        # legitimately repeats the year keeps the later occurrence.
        if metadata.get('year') in title_parts:
            title_parts.remove(metadata['year'])

        if title_parts:
            metadata['title'] = ' '.join(title_parts)
    else:
        metadata['title'] = name

    return metadata


def _yaml_quoted(value) -> str:
    """Return value as a double-quoted YAML scalar with quotes/backslashes escaped."""
    # A JSON string literal is also a valid YAML double-quoted scalar, so
    # json.dumps provides the escaping. For plain text the result is just
    # the text wrapped in double quotes.
    return json.dumps(str(value), ensure_ascii=False)


def convert_paper(
    md: MarkItDown,
    input_file: Path,
    output_dir: Path,
    organize_by_year: bool = False
) -> tuple[bool, Dict]:
    """
    Convert a single paper to Markdown with metadata extraction.

    The output file is named after the input file's stem and starts with a
    YAML front-matter block, a title heading and a "Document Information"
    section, followed by the converted text.

    Args:
        md: MarkItDown instance
        input_file: Path to PDF file
        output_dir: Output directory (created if missing)
        organize_by_year: Organize into year subdirectories; only takes
            effect when a year could be extracted from the filename

    Returns:
        Tuple of (success, metadata_dict). On failure the dict contains only
        'source_file' and 'error'; the exception is reported on stdout and
        not re-raised, so one bad file does not abort a batch.
    """
    try:
        print(f"Converting: {input_file.name}")

        # Convert to Markdown
        result = md.convert(str(input_file))

        # Extract metadata from filename
        metadata = extract_metadata_from_filename(input_file.name)
        metadata['source_file'] = input_file.name
        metadata['converted_date'] = datetime.now().isoformat()

        # Try to extract title from content if not in filename
        if 'title' not in metadata and result.title:
            metadata['title'] = result.title

        # Create output path
        output_subdir = output_dir
        if organize_by_year and 'year' in metadata:
            output_subdir = output_dir / metadata['year']
        output_subdir.mkdir(parents=True, exist_ok=True)

        output_file = output_subdir / f"{input_file.stem}.md"
        title = metadata.get('title', input_file.stem)

        # YAML front matter. String values are escaped so that a '"' or '\'
        # in a title, author or filename cannot break the block.
        front_matter = ["---", f"title: {_yaml_quoted(title)}"]
        if 'author' in metadata:
            front_matter.append(f"author: {_yaml_quoted(metadata['author'])}")
        if 'year' in metadata:
            front_matter.append(f"year: {metadata['year']}")
        front_matter.append(f"source: {_yaml_quoted(metadata['source_file'])}")
        front_matter.append(f"converted: {_yaml_quoted(metadata['converted_date'])}")
        front_matter.append("---")

        # Human-readable metadata section
        info = []
        if 'author' in metadata:
            info.append(f"**Author**: {metadata['author']}")
        if 'year' in metadata:
            info.append(f"**Year**: {metadata['year']}")
        info.append(f"**Source File**: {metadata['source_file']}")
        info.append(f"**Converted**: {metadata['converted_date']}")

        content = "".join([
            "\n".join(front_matter) + "\n\n",
            f"# {title}\n\n",
            "## Document Information\n\n",
            "\n".join(info) + "\n\n",
            "---\n\n",
            result.text_content,
        ])

        # Write to file
        output_file.write_text(content, encoding='utf-8')

        print(f"✓ Saved to: {output_file}")

        return True, metadata

    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a batch run.
        print(f"✗ Error converting {input_file.name}: {str(e)}")
        return False, {'source_file': input_file.name, 'error': str(e)}


def create_index(papers: List[Dict], output_dir: Path):
    """
    Create an index/catalog of all converted papers.

    Writes two files into ``output_dir`` (which is not created here):
    ``INDEX.md``, listing the papers under one heading per year, and
    ``catalog.json``, containing the metadata dicts sorted by year and title.

    Args:
        papers: Metadata dicts. The keys read are 'title', 'author', 'year'
            and 'source_file'; each is optional and has a fallback.
        output_dir: Directory the index files are written to and against
            which the links to the converted Markdown files are resolved.
    """
    # Local import keeps this function self-contained; quote() is only
    # needed for building link targets.
    from urllib.parse import quote

    # Sort by year (if available) and title; papers without a year go last.
    papers_sorted = sorted(
        papers,
        key=lambda paper: (paper.get('year', '9999'), paper.get('title', ''))
    )

    # Group by year
    by_year = {}
    for paper in papers_sorted:
        by_year.setdefault(paper.get('year', 'Unknown'), []).append(paper)

    # Create Markdown index
    chunks = [
        "# Literature Review Index\n\n",
        f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
        f"**Total Papers**: {len(papers)}\n\n",
        "---\n\n",
    ]

    # Write by year
    for year in sorted(by_year):
        chunks.append(f"## {year}\n\n")
        for paper in by_year[year]:
            title = paper.get('title', paper.get('source_file', 'Unknown'))
            author = paper.get('author', 'Unknown Author')
            source = paper.get('source_file', '')

            # The Markdown file sits in a year subdirectory only when the
            # conversion was organised by year, so look for where it really
            # is. If it is found in neither place, fall back to the first
            # candidate (the year path whenever a year is known).
            md_name = Path(source).stem + ".md"
            candidates = [md_name]
            if 'year' in paper and paper['year'] != 'Unknown':
                candidates.insert(0, f"{paper['year']}/{md_name}")
            md_file = next(
                (c for c in candidates if (output_dir / c).is_file()),
                candidates[0]
            )

            chunks.append(f"- **{title}**\n")
            chunks.append(f"  - Author: {author}\n")
            chunks.append(f"  - Source: {source}\n")
            # Percent-encode the target: a raw space (common in PDF names)
            # would end the link destination early.
            chunks.append(f"  - [Read Markdown]({quote(md_file)})\n\n")

    # Write index
    index_file = output_dir / "INDEX.md"
    index_file.write_text("".join(chunks), encoding='utf-8')
    print(f"\n✓ Created index: {index_file}")

    # Also create JSON catalog
    catalog_file = output_dir / "catalog.json"
    with open(catalog_file, 'w', encoding='utf-8') as f:
        json.dump(papers_sorted, f, indent=2, ensure_ascii=False)
    print(f"✓ Created catalog: {catalog_file}")


def main():
    """
    Command-line entry point: convert every PDF in a directory to Markdown.

    Parses the arguments, converts each PDF found in the input directory
    (optionally recursing into subdirectories), optionally writes an index
    of the successfully converted papers, and prints a summary.

    Exits with status 0 when every file converted successfully, and with
    status 1 when the input directory is invalid, no PDFs were found, or at
    least one conversion failed.
    """
    parser = argparse.ArgumentParser(
        description="Convert scientific literature PDFs to Markdown",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Convert all PDFs in a directory
  python convert_literature.py papers/ output/

  # Organize by year
  python convert_literature.py papers/ output/ --organize-by-year

  # Create index of all papers
  python convert_literature.py papers/ output/ --create-index

Filename Conventions:
  For best results, name your PDFs using this pattern:
    Author_Year_Title.pdf

  Examples:
    Smith_2023_Machine_Learning_Applications.pdf
    Jones_2022_Climate_Change_Analysis.pdf
        """
    )

    parser.add_argument('input_dir', type=Path, help='Directory with PDF files')
    parser.add_argument('output_dir', type=Path, help='Output directory for Markdown files')
    parser.add_argument(
        '--organize-by-year', '-y',
        action='store_true',
        help='Organize output into year subdirectories'
    )
    parser.add_argument(
        '--create-index', '-i',
        action='store_true',
        help='Create an index/catalog of all papers'
    )
    parser.add_argument(
        '--recursive', '-r',
        action='store_true',
        help='Search subdirectories recursively'
    )

    args = parser.parse_args()

    # Validate input
    if not args.input_dir.exists():
        print(f"Error: Input directory '{args.input_dir}' does not exist")
        sys.exit(1)

    if not args.input_dir.is_dir():
        print(f"Error: '{args.input_dir}' is not a directory")
        sys.exit(1)

    # Find PDF files. The extension is compared case-insensitively so that
    # "paper.PDF" is picked up on case-sensitive filesystems too, directories
    # that merely end in ".pdf" are skipped, and the list is sorted so runs
    # are reproducible regardless of directory listing order.
    walk = args.input_dir.rglob if args.recursive else args.input_dir.glob
    pdf_files = sorted(
        path for path in walk("*")
        if path.name.lower().endswith(".pdf") and path.is_file()
    )

    if not pdf_files:
        print("No PDF files found")
        sys.exit(1)

    print(f"Found {len(pdf_files)} PDF file(s)")

    # Create MarkItDown instance
    md = MarkItDown()

    # Convert all papers; only successful conversions go into the index.
    results = []
    for pdf_file in pdf_files:
        success, metadata = convert_paper(
            md,
            pdf_file,
            args.output_dir,
            args.organize_by_year
        )
        if success:
            results.append(metadata)

    success_count = len(results)

    # Create index if requested
    if args.create_index and results:
        create_index(results, args.output_dir)

    # Print summary
    print("\n" + "="*50)
    print("CONVERSION SUMMARY")
    print("="*50)
    print(f"Total papers:    {len(pdf_files)}")
    print(f"Successful:      {success_count}")
    print(f"Failed:          {len(pdf_files) - success_count}")
    print(f"Success rate:    {success_count/len(pdf_files)*100:.1f}%")

    sys.exit(0 if success_count == len(pdf_files) else 1)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()