Initial commit

Zhongwei Li
2025-11-30 08:30:18 +08:00
commit 74bee324ab
335 changed files with 147377 additions and 0 deletions

batch_convert.py

@@ -0,0 +1,228 @@
#!/usr/bin/env python3
"""
Batch convert multiple files to Markdown using MarkItDown.

This script demonstrates how to efficiently convert multiple files
in a directory to Markdown format.
"""

import argparse
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import List, Optional

from markitdown import MarkItDown


def convert_file(md: MarkItDown, file_path: Path, output_dir: Path, verbose: bool = False) -> tuple[bool, str, str]:
    """
    Convert a single file to Markdown.

    Args:
        md: MarkItDown instance
        file_path: Path to input file
        output_dir: Directory for output files
        verbose: Print detailed messages

    Returns:
        Tuple of (success, input_path, message)
    """
    try:
        if verbose:
            print(f"Converting: {file_path}")

        result = md.convert(str(file_path))

        # Create output path
        output_file = output_dir / f"{file_path.stem}.md"

        # Write content with metadata header
        content = f"# {result.title or file_path.stem}\n\n"
        content += f"**Source**: {file_path.name}\n"
        content += f"**Format**: {file_path.suffix}\n\n"
        content += "---\n\n"
        content += result.text_content

        output_file.write_text(content, encoding='utf-8')
        return True, str(file_path), f"✓ Converted to {output_file.name}"
    except Exception as e:
        return False, str(file_path), f"✗ Error: {str(e)}"


def batch_convert(
    input_dir: Path,
    output_dir: Path,
    extensions: Optional[List[str]] = None,
    recursive: bool = False,
    workers: int = 4,
    verbose: bool = False,
    enable_plugins: bool = False
) -> dict:
    """
    Batch convert files in a directory.

    Args:
        input_dir: Input directory
        output_dir: Output directory
        extensions: List of file extensions to convert (e.g., ['.pdf', '.docx'])
        recursive: Search subdirectories
        workers: Number of parallel workers
        verbose: Print detailed messages
        enable_plugins: Enable MarkItDown plugins

    Returns:
        Dictionary with conversion statistics
    """
    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)

    # Default extensions if not specified
    if extensions is None:
        extensions = ['.pdf', '.docx', '.pptx', '.xlsx', '.html', '.jpg', '.png']

    # Find files
    files = []
    if recursive:
        for ext in extensions:
            files.extend(input_dir.rglob(f"*{ext}"))
    else:
        for ext in extensions:
            files.extend(input_dir.glob(f"*{ext}"))

    if not files:
        print(f"No files found with extensions: {', '.join(extensions)}")
        return {'total': 0, 'success': 0, 'failed': 0}

    print(f"Found {len(files)} file(s) to convert")

    # Create MarkItDown instance
    md = MarkItDown(enable_plugins=enable_plugins)

    # Convert files in parallel
    results = {
        'total': len(files),
        'success': 0,
        'failed': 0,
        'details': []
    }

    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = {
            executor.submit(convert_file, md, file_path, output_dir, verbose): file_path
            for file_path in files
        }

        for future in as_completed(futures):
            success, path, message = future.result()

            if success:
                results['success'] += 1
            else:
                results['failed'] += 1

            results['details'].append({
                'file': path,
                'success': success,
                'message': message
            })

            print(message)

    return results


def main():
    parser = argparse.ArgumentParser(
        description="Batch convert files to Markdown using MarkItDown",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Convert all PDFs in a directory
  python batch_convert.py papers/ output/ --extensions .pdf

  # Convert multiple formats recursively
  python batch_convert.py documents/ markdown/ --extensions .pdf .docx .pptx -r

  # Use 8 parallel workers
  python batch_convert.py input/ output/ --workers 8

  # Enable plugins
  python batch_convert.py input/ output/ --plugins
        """
    )

    parser.add_argument('input_dir', type=Path, help='Input directory')
    parser.add_argument('output_dir', type=Path, help='Output directory')
    parser.add_argument(
        '--extensions', '-e',
        nargs='+',
        help='File extensions to convert (e.g., .pdf .docx)'
    )
    parser.add_argument(
        '--recursive', '-r',
        action='store_true',
        help='Search subdirectories recursively'
    )
    parser.add_argument(
        '--workers', '-w',
        type=int,
        default=4,
        help='Number of parallel workers (default: 4)'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Verbose output'
    )
    parser.add_argument(
        '--plugins', '-p',
        action='store_true',
        help='Enable MarkItDown plugins'
    )

    args = parser.parse_args()

    # Validate input directory
    if not args.input_dir.exists():
        print(f"Error: Input directory '{args.input_dir}' does not exist")
        sys.exit(1)

    if not args.input_dir.is_dir():
        print(f"Error: '{args.input_dir}' is not a directory")
        sys.exit(1)

    # Run batch conversion
    results = batch_convert(
        input_dir=args.input_dir,
        output_dir=args.output_dir,
        extensions=args.extensions,
        recursive=args.recursive,
        workers=args.workers,
        verbose=args.verbose,
        enable_plugins=args.plugins
    )

    # Print summary
    print("\n" + "="*50)
    print("CONVERSION SUMMARY")
    print("="*50)
    print(f"Total files: {results['total']}")
    print(f"Successful: {results['success']}")
    print(f"Failed: {results['failed']}")
    print(f"Success rate: {results['success']/results['total']*100:.1f}%" if results['total'] > 0 else "N/A")

    # Show failed files if any
    if results['failed'] > 0:
        print("\nFailed conversions:")
        for detail in results['details']:
            if not detail['success']:
                print(f" - {detail['file']}: {detail['message']}")

    sys.exit(0 if results['failed'] == 0 else 1)


if __name__ == '__main__':
    main()
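
The core of the script is the single-file pattern inside convert_file(): one MarkItDown conversion plus one file write; the ThreadPoolExecutor only fans that pattern out across many files. A minimal standalone sketch of that pattern, where sample.docx is a placeholder input rather than a file from this commit:

from pathlib import Path

from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("sample.docx")  # placeholder input; any format MarkItDown supports works here
output = Path("sample.md")
# Same shape convert_file() writes: a title heading followed by the converted body
output.write_text(f"# {result.title or output.stem}\n\n{result.text_content}", encoding="utf-8")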

convert_literature.py

@@ -0,0 +1,283 @@
#!/usr/bin/env python3
"""
Convert scientific literature PDFs to Markdown for analysis and review.

This script is specifically designed for converting academic papers,
organizing them, and preparing them for literature review workflows.
"""

import argparse
import json
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional

from markitdown import MarkItDown


def extract_metadata_from_filename(filename: str) -> Dict[str, str]:
    """
    Try to extract metadata from filename.

    Supports patterns like: Author_Year_Title.pdf
    """
    metadata = {}

    # Remove extension
    name = Path(filename).stem

    # Try to extract year
    year_match = re.search(r'\b(19|20)\d{2}\b', name)
    if year_match:
        metadata['year'] = year_match.group()

    # Split by underscores or dashes
    parts = re.split(r'[_\-]', name)
    if len(parts) >= 2:
        metadata['author'] = parts[0].replace('_', ' ')
        metadata['title'] = ' '.join(parts[1:]).replace('_', ' ')
    else:
        metadata['title'] = name.replace('_', ' ')

    return metadata


def convert_paper(
    md: MarkItDown,
    input_file: Path,
    output_dir: Path,
    organize_by_year: bool = False
) -> tuple[bool, Dict]:
    """
    Convert a single paper to Markdown with metadata extraction.

    Args:
        md: MarkItDown instance
        input_file: Path to PDF file
        output_dir: Output directory
        organize_by_year: Organize into year subdirectories

    Returns:
        Tuple of (success, metadata_dict)
    """
    try:
        print(f"Converting: {input_file.name}")

        # Convert to Markdown
        result = md.convert(str(input_file))

        # Extract metadata from filename
        metadata = extract_metadata_from_filename(input_file.name)
        metadata['source_file'] = input_file.name
        metadata['converted_date'] = datetime.now().isoformat()

        # Try to extract title from content if not in filename
        if 'title' not in metadata and result.title:
            metadata['title'] = result.title

        # Create output path
        if organize_by_year and 'year' in metadata:
            output_subdir = output_dir / metadata['year']
            output_subdir.mkdir(parents=True, exist_ok=True)
        else:
            output_subdir = output_dir
            output_subdir.mkdir(parents=True, exist_ok=True)

        output_file = output_subdir / f"{input_file.stem}.md"

        # Create formatted Markdown with front matter
        content = "---\n"
        content += f"title: \"{metadata.get('title', input_file.stem)}\"\n"
        if 'author' in metadata:
            content += f"author: \"{metadata['author']}\"\n"
        if 'year' in metadata:
            content += f"year: {metadata['year']}\n"
        content += f"source: \"{metadata['source_file']}\"\n"
        content += f"converted: \"{metadata['converted_date']}\"\n"
        content += "---\n\n"

        # Add title
        content += f"# {metadata.get('title', input_file.stem)}\n\n"

        # Add metadata section
        content += "## Document Information\n\n"
        if 'author' in metadata:
            content += f"**Author**: {metadata['author']}\n"
        if 'year' in metadata:
            content += f"**Year**: {metadata['year']}\n"
        content += f"**Source File**: {metadata['source_file']}\n"
        content += f"**Converted**: {metadata['converted_date']}\n\n"
        content += "---\n\n"

        # Add content
        content += result.text_content

        # Write to file
        output_file.write_text(content, encoding='utf-8')

        print(f"✓ Saved to: {output_file}")
        return True, metadata
    except Exception as e:
        print(f"✗ Error converting {input_file.name}: {str(e)}")
        return False, {'source_file': input_file.name, 'error': str(e)}


def create_index(papers: List[Dict], output_dir: Path):
    """Create an index/catalog of all converted papers."""
    # Sort by year (if available) and title
    papers_sorted = sorted(
        papers,
        key=lambda x: (x.get('year', '9999'), x.get('title', ''))
    )

    # Create Markdown index
    index_content = "# Literature Review Index\n\n"
    index_content += f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
    index_content += f"**Total Papers**: {len(papers)}\n\n"
    index_content += "---\n\n"

    # Group by year
    by_year = {}
    for paper in papers_sorted:
        year = paper.get('year', 'Unknown')
        if year not in by_year:
            by_year[year] = []
        by_year[year].append(paper)

    # Write by year
    for year in sorted(by_year.keys()):
        index_content += f"## {year}\n\n"
        for paper in by_year[year]:
            title = paper.get('title', paper.get('source_file', 'Unknown'))
            author = paper.get('author', 'Unknown Author')
            source = paper.get('source_file', '')

            # Create link to markdown file
            md_file = Path(source).stem + ".md"
            if 'year' in paper and paper['year'] != 'Unknown':
                md_file = f"{paper['year']}/{md_file}"

            index_content += f"- **{title}**\n"
            index_content += f" - Author: {author}\n"
            index_content += f" - Source: {source}\n"
            index_content += f" - [Read Markdown]({md_file})\n\n"

    # Write index
    index_file = output_dir / "INDEX.md"
    index_file.write_text(index_content, encoding='utf-8')
    print(f"\n✓ Created index: {index_file}")

    # Also create JSON catalog
    catalog_file = output_dir / "catalog.json"
    with open(catalog_file, 'w', encoding='utf-8') as f:
        json.dump(papers_sorted, f, indent=2, ensure_ascii=False)
    print(f"✓ Created catalog: {catalog_file}")


def main():
    parser = argparse.ArgumentParser(
        description="Convert scientific literature PDFs to Markdown",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Convert all PDFs in a directory
  python convert_literature.py papers/ output/

  # Organize by year
  python convert_literature.py papers/ output/ --organize-by-year

  # Create index of all papers
  python convert_literature.py papers/ output/ --create-index

Filename Conventions:
  For best results, name your PDFs using this pattern:
  Author_Year_Title.pdf

  Examples:
    Smith_2023_Machine_Learning_Applications.pdf
    Jones_2022_Climate_Change_Analysis.pdf
        """
    )

    parser.add_argument('input_dir', type=Path, help='Directory with PDF files')
    parser.add_argument('output_dir', type=Path, help='Output directory for Markdown files')
    parser.add_argument(
        '--organize-by-year', '-y',
        action='store_true',
        help='Organize output into year subdirectories'
    )
    parser.add_argument(
        '--create-index', '-i',
        action='store_true',
        help='Create an index/catalog of all papers'
    )
    parser.add_argument(
        '--recursive', '-r',
        action='store_true',
        help='Search subdirectories recursively'
    )

    args = parser.parse_args()

    # Validate input
    if not args.input_dir.exists():
        print(f"Error: Input directory '{args.input_dir}' does not exist")
        sys.exit(1)

    if not args.input_dir.is_dir():
        print(f"Error: '{args.input_dir}' is not a directory")
        sys.exit(1)

    # Find PDF files
    if args.recursive:
        pdf_files = list(args.input_dir.rglob("*.pdf"))
    else:
        pdf_files = list(args.input_dir.glob("*.pdf"))

    if not pdf_files:
        print("No PDF files found")
        sys.exit(1)

    print(f"Found {len(pdf_files)} PDF file(s)")

    # Create MarkItDown instance
    md = MarkItDown()

    # Convert all papers
    results = []
    success_count = 0

    for pdf_file in pdf_files:
        success, metadata = convert_paper(
            md,
            pdf_file,
            args.output_dir,
            args.organize_by_year
        )
        if success:
            success_count += 1
            results.append(metadata)

    # Create index if requested
    if args.create_index and results:
        create_index(results, args.output_dir)

    # Print summary
    print("\n" + "="*50)
    print("CONVERSION SUMMARY")
    print("="*50)
    print(f"Total papers: {len(pdf_files)}")
    print(f"Successful: {success_count}")
    print(f"Failed: {len(pdf_files) - success_count}")
    print(f"Success rate: {success_count/len(pdf_files)*100:.1f}%")

    sys.exit(0 if success_count == len(pdf_files) else 1)


if __name__ == '__main__':
    main()
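
The filename parser is the piece most worth checking against your own naming scheme before a large run. A quick check of extract_metadata_from_filename(), assuming the script above is saved as convert_literature.py and is importable from the working directory:

from convert_literature import extract_metadata_from_filename

meta = extract_metadata_from_filename("Smith_2023_Machine_Learning_Applications.pdf")
print(meta)
# {'year': '2023', 'author': 'Smith', 'title': '2023 Machine Learning Applications'}
# The year remains in the title because everything after the author is joined into the title.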

convert_with_ai.py

@@ -0,0 +1,243 @@
#!/usr/bin/env python3
"""
Convert documents to Markdown with AI-enhanced image descriptions.

This script demonstrates how to use MarkItDown with OpenRouter to generate
detailed descriptions of images in documents (PowerPoint, PDFs with images, etc.)
"""

import argparse
import os
import sys
from pathlib import Path

from markitdown import MarkItDown
from openai import OpenAI

# Predefined prompts for different use cases
PROMPTS = {
    'scientific': """
Analyze this scientific image or diagram. Provide:
1. Type of visualization (graph, chart, microscopy, diagram, etc.)
2. Key data points, trends, or patterns
3. Axes labels, legends, and scales
4. Notable features or findings
5. Scientific context and significance
Be precise, technical, and detailed.
""".strip(),

    'presentation': """
Describe this presentation slide image. Include:
1. Main visual elements and their arrangement
2. Key points or messages conveyed
3. Data or information presented
4. Visual hierarchy and emphasis
Keep the description clear and informative.
""".strip(),

    'general': """
Describe this image in detail. Include:
1. Main subjects and objects
2. Visual composition and layout
3. Text content (if any)
4. Notable details
5. Overall context and purpose
Be comprehensive and accurate.
""".strip(),

    'data_viz': """
Analyze this data visualization. Provide:
1. Type of chart/graph (bar, line, scatter, pie, etc.)
2. Variables and axes
3. Data ranges and scales
4. Key patterns, trends, or outliers
5. Statistical insights
Focus on quantitative accuracy.
""".strip(),

    'medical': """
Describe this medical image. Include:
1. Type of medical imaging (X-ray, MRI, CT, microscopy, etc.)
2. Anatomical structures visible
3. Notable findings or abnormalities
4. Image quality and contrast
5. Clinical relevance
Be professional and precise.
""".strip()
}


def convert_with_ai(
    input_file: Path,
    output_file: Path,
    api_key: str,
    model: str = "anthropic/claude-sonnet-4.5",
    prompt_type: str = "general",
    custom_prompt: str = None
) -> bool:
    """
    Convert a file to Markdown with AI image descriptions.

    Args:
        input_file: Path to input file
        output_file: Path to output Markdown file
        api_key: OpenRouter API key
        model: Model name (default: anthropic/claude-sonnet-4.5)
        prompt_type: Type of prompt to use
        custom_prompt: Custom prompt (overrides prompt_type)

    Returns:
        True if successful, False otherwise
    """
    try:
        # Initialize OpenRouter client (OpenAI-compatible)
        client = OpenAI(
            api_key=api_key,
            base_url="https://openrouter.ai/api/v1"
        )

        # Select prompt
        if custom_prompt:
            prompt = custom_prompt
        else:
            prompt = PROMPTS.get(prompt_type, PROMPTS['general'])

        print(f"Using model: {model}")
        print(f"Prompt type: {prompt_type if not custom_prompt else 'custom'}")
        print(f"Converting: {input_file}")

        # Create MarkItDown with AI support
        md = MarkItDown(
            llm_client=client,
            llm_model=model,
            llm_prompt=prompt
        )

        # Convert file
        result = md.convert(str(input_file))

        # Create output with metadata
        content = f"# {result.title or input_file.stem}\n\n"
        content += f"**Source**: {input_file.name}\n"
        content += f"**Format**: {input_file.suffix}\n"
        content += f"**AI Model**: {model}\n"
        content += f"**Prompt Type**: {prompt_type if not custom_prompt else 'custom'}\n\n"
        content += "---\n\n"
        content += result.text_content

        # Write output
        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(content, encoding='utf-8')

        print(f"✓ Successfully converted to: {output_file}")
        return True
    except Exception as e:
        print(f"✗ Error: {str(e)}", file=sys.stderr)
        return False


def main():
    parser = argparse.ArgumentParser(
        description="Convert documents to Markdown with AI-enhanced image descriptions",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=f"""
Available prompt types:
  scientific   - For scientific diagrams, graphs, and charts
  presentation - For presentation slides
  general      - General-purpose image description
  data_viz     - For data visualizations and charts
  medical      - For medical imaging

Examples:
  # Convert a scientific paper
  python convert_with_ai.py paper.pdf output.md --prompt-type scientific

  # Convert a presentation with custom model
  python convert_with_ai.py slides.pptx slides.md --model anthropic/claude-sonnet-4.5 --prompt-type presentation

  # Use custom prompt with Claude Sonnet 4.5
  python convert_with_ai.py diagram.png diagram.md --model anthropic/claude-sonnet-4.5 --custom-prompt "Describe this technical diagram"

  # Set API key via environment variable
  export OPENROUTER_API_KEY="sk-or-v1-..."
  python convert_with_ai.py image.jpg image.md

Environment Variables:
  OPENROUTER_API_KEY    OpenRouter API key (required if not passed via --api-key)

Popular Models (use with --model):
  anthropic/claude-sonnet-4.5 - Claude Sonnet 4.5 (recommended, vision support)
  anthropic/claude-3.5-sonnet - Claude 3.5 Sonnet (vision support)
  openai/gpt-4o               - GPT-4 Omni (vision support)
  openai/gpt-4-vision         - GPT-4 Vision
  google/gemini-pro-vision    - Gemini Pro Vision
        """
    )

    parser.add_argument('input', type=Path, help='Input file')
    parser.add_argument('output', type=Path, help='Output Markdown file')
    parser.add_argument(
        '--api-key', '-k',
        help='OpenRouter API key (or set OPENROUTER_API_KEY env var)'
    )
    parser.add_argument(
        '--model', '-m',
        default='anthropic/claude-sonnet-4.5',
        help='Model to use via OpenRouter (default: anthropic/claude-sonnet-4.5)'
    )
    parser.add_argument(
        '--prompt-type', '-t',
        choices=list(PROMPTS.keys()),
        default='general',
        help='Type of prompt to use (default: general)'
    )
    parser.add_argument(
        '--custom-prompt', '-p',
        help='Custom prompt (overrides --prompt-type)'
    )
    parser.add_argument(
        '--list-prompts', '-l',
        action='store_true',
        help='List available prompt types and exit'
    )

    args = parser.parse_args()

    # List prompts and exit
    if args.list_prompts:
        print("Available prompt types:\n")
        for name, prompt in PROMPTS.items():
            print(f"[{name}]")
            print(prompt)
            print("\n" + "="*60 + "\n")
        sys.exit(0)

    # Get API key
    api_key = args.api_key or os.environ.get('OPENROUTER_API_KEY')
    if not api_key:
        print("Error: OpenRouter API key required. Set OPENROUTER_API_KEY environment variable or use --api-key")
        print("Get your API key at: https://openrouter.ai/keys")
        sys.exit(1)

    # Validate input file
    if not args.input.exists():
        print(f"Error: Input file '{args.input}' does not exist")
        sys.exit(1)

    # Convert file
    success = convert_with_ai(
        input_file=args.input,
        output_file=args.output,
        api_key=api_key,
        model=args.model,
        prompt_type=args.prompt_type,
        custom_prompt=args.custom_prompt
    )

    sys.exit(0 if success else 1)


if __name__ == '__main__':
    main()
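
The OpenRouter integration above is just an OpenAI-compatible client pointed at a different base_url and handed to MarkItDown. A minimal sketch without the CLI plumbing, assuming OPENROUTER_API_KEY is set and slides.pptx stands in for a real input file:

import os

from markitdown import MarkItDown
from openai import OpenAI

client = OpenAI(
    api_key=os.environ["OPENROUTER_API_KEY"],
    base_url="https://openrouter.ai/api/v1",
)
md = MarkItDown(llm_client=client, llm_model="anthropic/claude-sonnet-4.5")
result = md.convert("slides.pptx")  # placeholder input; embedded images get LLM-generated descriptions
print(result.text_content[:500])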