commit 3182431ecf4a9a892bf2a4c80833ea14598d19d2 Author: Zhongwei Li Date: Sat Nov 29 18:23:31 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..9de8422 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,15 @@ +{ + "name": "ollama-deepseek-ocr-tool", + "description": "A CLI tool that loads images, sends a prompt to ollama to invoke deepseek-ocr and instructs it to return the image as markdown", + "version": "0.1.0", + "author": { + "name": "Dennis Vriend", + "email": "dvriend@ilionx.com" + }, + "skills": [ + "./skills" + ], + "commands": [ + "./commands" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..e75958a --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# ollama-deepseek-ocr-tool + +A CLI tool that loads images, sends a prompt to ollama to invoke deepseek-ocr and instructs it to return the image as markdown diff --git a/commands/help.md b/commands/help.md new file mode 100644 index 0000000..f1b2852 --- /dev/null +++ b/commands/help.md @@ -0,0 +1,61 @@ +--- +description: Show help information for ollama-deepseek-ocr-tool +argument-hint: none +--- + +Display help information for ollama-deepseek-ocr-tool, a CLI tool for batch OCR processing +using DeepSeek-OCR via Ollama. Converts image sequences to markdown documents. 
+ +## Usage + +```bash +# Show full help with examples and troubleshooting +ollama-deepseek-ocr-tool --help + +# Show version +ollama-deepseek-ocr-tool --version +``` + +## Options + +- `--help` / `-h`: Show help with examples, prerequisites, and troubleshooting +- `--version`: Show version number +- `-v`, `-vv`, `-vvv`: Verbosity levels (INFO, DEBUG, TRACE) + +## Examples + +```bash +# Get help (shows progressive examples) +ollama-deepseek-ocr-tool --help + +# Check version +ollama-deepseek-ocr-tool --version + +# Basic usage (from help) +ollama-deepseek-ocr-tool "*.png" output.md + +# With verbose logging +ollama-deepseek-ocr-tool "*.png" output.md -vv +``` + +## What Help Shows + +The `--help` output includes: +- **Description**: What the tool does +- **Arguments**: GLOB_PATTERN and OUTPUT_FILE +- **Examples**: Progressive from simple to complex +- **Output Format**: How the markdown is structured +- **Prerequisites**: Ollama setup steps +- **Troubleshooting**: Common errors and solutions + +## Quick Start + +```bash +# 1. Install prerequisites +brew install ollama +ollama serve +ollama pull deepseek-ocr + +# 2. 
Process images +ollama-deepseek-ocr-tool "IMG_*.png" chapter.md +``` diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..6654b75 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,49 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:dnvriend/ollama-deepseek-ocr-tool:plugins/ollama-deepseek-ocr-tool", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "ac93b6127e632488400c0081e5614a45ce98d40f", + "treeHash": "ad6930ae20fed449775ceca7701a19abba47996c443b36f8c5559f2c18ba12f7", + "generatedAt": "2025-11-28T10:16:36.987596Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "ollama-deepseek-ocr-tool", + "description": "A CLI tool that loads images, sends a prompt to ollama to invoke deepseek-ocr and instructs it to return the image as markdown", + "version": "0.1.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "2b3554b3c7e485b93d6a2a94e0a8ae17fd2aaa37bb714fc6e0effc0b64bc0398" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "7842278bcd56e5dc45cd0b49c2f14e1cbf2f508bcb5b897c893160d0ff04a185" + }, + { + "path": "commands/help.md", + "sha256": "09ac30b7e0bab611f436f02b8bdeb075c8069da8ba7b8d2caa37646d0f4da39c" + }, + { + "path": "skills/ollama-deepseek-ocr-tool/SKILL.md", + "sha256": "0145f404eb17895139f9af37889e87e77b4ee7fce5c5ef312c457aabe9d51127" + } + ], + "dirSha256": "ad6930ae20fed449775ceca7701a19abba47996c443b36f8c5559f2c18ba12f7" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/ollama-deepseek-ocr-tool/SKILL.md b/skills/ollama-deepseek-ocr-tool/SKILL.md new file mode 100644 index 0000000..108ebd7 --- /dev/null 
+++ b/skills/ollama-deepseek-ocr-tool/SKILL.md @@ -0,0 +1,258 @@ +--- +name: skill-ollama-deepseek-ocr-tool +description: Batch OCR processing with DeepSeek-OCR via Ollama +--- + +# When to use + +- Convert textbook/lecture images to markdown notes +- Batch OCR processing of scanned documents +- Extract text from image sequences (iPhone photos, screenshots) +- Create searchable markdown from visual content +- Process documents privately without cloud services + +# ollama-deepseek-ocr-tool Skill + +## Purpose + +This skill provides access to `ollama-deepseek-ocr-tool`, a CLI tool for fast, private batch OCR processing using DeepSeek-OCR via Ollama. Converts sequences of images (textbook pages, slides, scans) into a single coherent markdown document. + +**Key capabilities:** +- ⚡ Fast processing (~3s per image on M4) +- 🔒 Private - runs entirely locally +- 📝 Clean markdown output (tables, headings, lists) +- 🔄 Natural sorting (IMG_1 < IMG_2 < IMG_10) +- 💰 Free - no API costs or rate limits + +## When to Use This Skill + +**Use this skill when:** +- Converting textbook chapters to Obsidian notes +- Processing lecture slides or handouts to markdown +- Extracting text from scanned documents +- Creating searchable study materials from images +- Need comprehensive examples and troubleshooting + +**Do NOT use this skill for:** +- Cloud-based OCR (this is local-only) +- Describing image content (extracts text only) +- Handwritten text recognition (printed text only) +- Real-time streaming OCR (batch processing only) + +## CLI Tool: ollama-deepseek-ocr-tool + +The `ollama-deepseek-ocr-tool` processes multiple images in sequence and creates a single markdown document with extracted text. Images are sorted naturally and text is appended sequentially for coherent reading. + +### Installation + +```bash +# Clone and install +git clone https://github.com/dnvriend/ollama-deepseek-ocr-tool.git +cd ollama-deepseek-ocr-tool +uv tool install . +``` + +### Prerequisites + +1. 
**Ollama** - Local LLM runtime + ```bash + brew install ollama + ollama serve + ``` + +2. **DeepSeek-OCR model** (~6GB download) + ```bash + ollama pull deepseek-ocr + ``` + +3. **Python 3.14+** and **uv package manager** + +### Quick Start + +```bash +# Example 1: Process textbook chapter from iPhone photos +ollama-deepseek-ocr-tool "IMG_*.png" chapter-3-notes.md + +# Example 2: Convert lecture slides to markdown +ollama-deepseek-ocr-tool "lecture-week5/*.jpg" week5-summary.md + +# Example 3: With verbose logging to debug issues +ollama-deepseek-ocr-tool "*.png" output.md -vv +``` + +### Main Command - Batch OCR Processing + +Process images matching a glob pattern and create a markdown document. + +**Usage:** +```bash +ollama-deepseek-ocr-tool GLOB_PATTERN OUTPUT_FILE [OPTIONS] +``` + +**Arguments:** +- `GLOB_PATTERN`: Pattern to match images (e.g., "*.png", "dir/*.jpg") +- `OUTPUT_FILE`: Path to output markdown file (will be overwritten) +- `-v/-vv/-vvv`: Verbosity (INFO/DEBUG/TRACE) +- `--help`: Show comprehensive help with examples +- `--version`: Show version + +**Examples:** +```bash +# Basic: Process all PNGs in current directory +ollama-deepseek-ocr-tool "*.png" output.md + +# Process specific directory +ollama-deepseek-ocr-tool "textbook-ch3/*.jpg" chapter-3.md + +# With verbose logging +ollama-deepseek-ocr-tool "*.png" output.md -vv + +# Preview help (shows all examples) +ollama-deepseek-ocr-tool --help +``` + +**Output Format:** +```markdown + + +[extracted text from image 1] + +--- + + + +[extracted text from image 2] +``` + + + +
+<details><summary>⚙️ Advanced Features (Click to expand)</summary> + + + +### Multi-Level Verbosity Logging + +Control logging detail with progressive verbosity levels. All logs are written to stderr. + +**Logging Levels:** + +| Flag | Level | Output | Use Case | +|------|-------|--------|----------| +| (none) | WARNING | Errors and warnings only | Production, quiet mode | +| `-v` | INFO | + High-level operations | Normal debugging | +| `-vv` | DEBUG | + Detailed info, full tracebacks | Development, troubleshooting | +| `-vvv` | TRACE | + Library internals | Deep debugging | + +**Examples:** +```bash +# INFO level - see operations +ollama-deepseek-ocr-tool "*.png" output.md -v + +# DEBUG level - see detailed info +ollama-deepseek-ocr-tool "*.png" output.md -vv + +# TRACE level - see all internals +ollama-deepseek-ocr-tool "*.png" output.md -vvv +``` + +--- + +### What Can Be Extracted + +**Text & Formatting:** +- ✅ Headings (H1, H2, H3) +- ✅ Body text with bold/italic +- ✅ Bulleted and numbered lists +- ✅ Multi-column layouts + +**Tables:** +- ✅ Clean markdown table format +- ✅ Headers and structure preserved +- ✅ Merged cells handled + +**Diagrams & Figures:** +- ✅ Text labels extracted +- ✅ Figure captions captured +- ❌ Visual content not described +- ❌ Flowchart arrows not preserved + +### Performance Characteristics + +- **Speed**: ~3 seconds per image (M4 MacBook) +- **Memory**: ~6GB (DeepSeek-OCR model) +- **Throughput**: ~20 images per minute +- **Scalability**: Sequential processing (no parallel batching) + +</details>
+ +
+<details><summary>🔧 Troubleshooting (Click to expand)</summary> + +### Common Issues + +**Issue: "No files match pattern"** +```bash +# Check your glob pattern and current directory +ls *.png # Verify files exist + +# Use absolute or relative paths correctly +ollama-deepseek-ocr-tool "./images/*.png" output.md +``` + +**Issue: "Connection refused" / "OCR extraction failed"** +```bash +# Ensure Ollama is running +ollama serve + +# Verify model is installed +ollama list | grep deepseek-ocr + +# Pull model if missing +ollama pull deepseek-ocr +``` + +**Issue: Poor quality extraction** +- Use `-vv` flag to see word counts and verify extraction +- Check image quality (resolution, clarity) +- For complex layouts, results may vary +- Tables and diagrams work best with clear text + +**Issue: Slow processing** +- Expected: ~3 seconds per image on M4 +- Check if Ollama is using GPU acceleration +- Sequential processing is by design (6GB model) + +### Getting Help + +```bash +# Show comprehensive help with examples +ollama-deepseek-ocr-tool --help + +# Use verbose logging to debug +ollama-deepseek-ocr-tool "*.png" output.md -vv +``` + +</details>
+ +## Exit Codes + +- `0`: Success - all images processed +- `1`: Validation error - no files match pattern or invalid arguments +- `2`: Runtime error - Ollama connection failed or model not found + +## Best Practices + +1. **Organize images before processing**: Name files sequentially (IMG_001, IMG_002) for natural sorting +2. **Use descriptive output names**: `chapter-3-entrepreneurship.md` not `output.md` +3. **Start with small batches**: Test with 2-3 images first to verify quality +4. **Enable verbose logging for debugging**: Use `-vv` to see extraction progress and word counts +5. **Review output after processing**: OCR may miss formatting or misread complex layouts +6. **Keep images at good resolution**: Higher quality = better extraction +7. **Process similar content together**: Keep textbook pages separate from diagrams + +## Resources + +- **GitHub**: https://github.com/dnvriend/ollama-deepseek-ocr-tool +- **Python Package Index**: https://pypi.org/project/ollama-deepseek-ocr-tool/ +- **Documentation**: