commit 07c3046c05556de17df34729e9665a3bbf1eca5e Author: Zhongwei Li Date: Sat Nov 29 18:20:43 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..5f06ad7 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "docx-smart-extractor", + "description": "Extract and analyze Word documents (1MB-50MB+) with minimal token usage. Lossless extraction of all text, tables, formatting, and document structure while achieving 10-50x token reduction through local extraction, semantic chunking by headings, and intelligent caching.", + "version": "2.2.0", + "author": { + "name": "Diego Consolini", + "email": "diego@diegocon.nl" + }, + "agents": [ + "./agents/docx-smart-extractor.md" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..d5961b9 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# docx-smart-extractor + +Extract and analyze Word documents (1MB-50MB+) with minimal token usage. Lossless extraction of all text, tables, formatting, and document structure while achieving 10-50x token reduction through local extraction, semantic chunking by headings, and intelligent caching. diff --git a/agents/docx-smart-extractor.md b/agents/docx-smart-extractor.md new file mode 100644 index 0000000..21c6b89 --- /dev/null +++ b/agents/docx-smart-extractor.md @@ -0,0 +1,214 @@ +--- +name: docx-smart-extractor +description: Extract and analyze Word documents (1MB-50MB+) with minimal token usage through local extraction, semantic chunking by headings, and intelligent caching. 
+capabilities: ["word-extraction", "table-extraction", "heading-structure", "token-optimization", "document-analysis", "policy-documents", "contract-analysis", "technical-reports"] +tools: Read, Bash +model: inherit +--- + +# DOCX Smart Extractor Agent + +## Overview + +The DOCX Smart Extractor enables efficient analysis of Word documents through local extraction, semantic chunking, and intelligent caching. Extract once, query forever. + +## Capabilities + +### Document Extraction +- **Complete text extraction** - All paragraphs with hierarchy preservation +- **Table extraction** - Full table structure, cells, and content +- **Formatting preservation** - Bold, italic, fonts, colors, styles +- **Document metadata** - Author, title, created date, modified date +- **Heading structure** - H1, H2, H3 hierarchy for navigation +- **Comments and tracked changes** - Full change history +- **Headers and footers** - Page-level content +- **Hyperlinks** - URL extraction and context + +### Semantic Chunking +- **By heading hierarchy** - Chunk at H1, H2, H3 boundaries +- **By paragraph groups** - 10-20 paragraphs per chunk +- **By tables** - Each table as separate chunk +- **Target chunk size** - 500-2000 tokens +- **No BS metrics** - Honest, verifiable features only + +### Query Capabilities +- **Keyword search** - Fast text search across all chunks +- **Heading lookup** - Get specific sections by heading +- **Table access** - Direct table extraction +- **Document summary** - Metadata and statistics + +## When to Use + +Use this plugin for: +- Policy documents (security, privacy, compliance) +- Technical reports and documentation +- Large Word documents (>1MB, >50 pages) +- Documents with clear heading structure +- Documents with tables and structured data +- Contract review and analysis +- Meeting notes and specifications + +## Workflow + +1. 
**Extract document** + ```bash + # Extract to cache (default) + python scripts/extract_docx.py /path/to/document.docx + + # Extract and copy to working directory (interactive prompt) + python scripts/extract_docx.py /path/to/document.docx + # Will prompt: "Copy files? (y/n)" + # Will ask: "Keep cache? (y/n)" + + # Extract and copy to specific directory (no prompts) + python scripts/extract_docx.py /path/to/document.docx --output-dir ./extracted + ``` + Output: Cache key (e.g., `policy_document_a8f9e2c1`) + +2. **Chunk content** + ```bash + python scripts/semantic_chunker.py policy_document_a8f9e2c1 + ``` + +3. **Query content** + ```bash + # Search for keyword + python scripts/query_docx.py search policy_document_a8f9e2c1 "data retention" + + # Get specific heading + python scripts/query_docx.py heading policy_document_a8f9e2c1 "Security Controls" + + # Get summary + python scripts/query_docx.py summary policy_document_a8f9e2c1 + ``` + +## Token Reduction + +Typical reductions: +- Small documents (< 50 paragraphs): 5-10x +- Medium documents (50-200 paragraphs): 10-30x +- Large documents (200+ paragraphs): 30-50x + +## Persistent Caching (v2.0.0 Unified System) + +**⚠️ IMPORTANT: Cache Location** + +Extracted content is stored in a **user cache directory**, NOT the working directory: + +**Cache locations by platform:** +- **Linux/Mac:** `~/.claude-cache/docx/{document_name}_{hash}/` +- **Windows:** `C:\Users\{username}\.claude-cache\docx\{document_name}_{hash}\` + +**Why cache directory?** +1. **Persistent caching:** Extract once, query forever - even across different projects +2. **Cross-project reuse:** Same document analyzed from different projects uses the same cache +3. **Performance:** Subsequent queries are instant (no re-extraction needed) +4. 
**Token optimization:** 10-50x reduction by loading only relevant sections + +**Cache contents:** +- `full_document.json` - Complete document text with formatting +- `headings.json` - Document heading structure +- `tables.json` - Extracted tables +- `metadata.json` - Document metadata +- `manifest.json` - Cache manifest + +**Accessing cached content:** +```bash +# List all cached documents +python scripts/query_docx.py list + +# Query cached content +python scripts/query_docx.py search {cache_key} "your query" + +# Find cache location (shown in extraction output) +# Example: ~/.claude-cache/docx/policy_document_a1b2c3d4/ +``` + +**If you need files in the working directory:** +```bash +# Option 1: Use --output-dir flag during extraction +python scripts/extract_docx.py document.docx --output-dir ./extracted + +# Option 2: Copy from cache manually (create the target directory first) +mkdir -p ./extracted && cp -r ~/.claude-cache/docx/{cache_key}/* ./extracted/ +``` + +**Note:** The cache is local and not meant for version control. Keep the original Word files in your repo and let each developer extract locally (a one-time operation). 
+ +## Supported Formats + +- ✅ .docx (Word 2007+ XML format) +- ✅ .docm (Macro-enabled Word documents) +- ❌ .doc (Legacy Word 97-2003 format - convert to .docx first) + +## Limitations + +- VBA macros not extracted (design choice) +- Images extracted as metadata only (position, size, alt text) +- Charts not extracted (recommend screenshot approach) +- Password-protected files cannot be opened +- Embedded objects may not be fully extracted + +## Dependencies + +- Python >= 3.8 +- python-docx >= 0.8.11 + +## Example Use Cases + +### Policy Document Analysis +```bash +# Extract (prints the cache key used in the commands below) +python scripts/extract_docx.py InfoSecPolicy.docx + +# Chunk +python scripts/semantic_chunker.py InfoSecPolicy_a8f9e2c1 + +# Find password policy section +python scripts/query_docx.py search InfoSecPolicy_a8f9e2c1 "password requirements" +``` + +### Contract Review +```bash +# Extract (prints the cache key used in the command below) +python scripts/extract_docx.py Vendor_Contract.docx + +# Get specific section +python scripts/query_docx.py heading Vendor_Contract_f3a8c1d7 "Termination Clause" +``` + +### Technical Documentation +```bash +# Extract large spec document (prints the cache key used in the command below) +python scripts/extract_docx.py API_Specification.docx + +# Search for endpoint details +python scripts/query_docx.py search API_Specification_b9d2e1f4 "authentication endpoint" +``` + +## Performance + +- **Extraction speed**: ~1-5 seconds for typical documents (1-10MB) +- **Chunking speed**: <1 second +- **Query speed**: <1 second +- **Cache reuse**: Instant (no re-extraction needed) + +## Output Format + +All output is UTF-8 encoded JSON, structured for easy parsing and analysis. 
+ +## No Marketing BS + +This plugin does NOT: +- Claim "100% content preservation" (meaningless metric) +- Use AI during extraction (all local python-docx) +- Require an internet connection +- Modify original documents +- Extract content you don't need + +What it DOES: +- Extract all text, tables, and formatting +- Chunk by semantic boundaries (headings) +- Enable fast keyword search +- Cache for instant reuse +- Achieve 10-50x token reduction on medium and large documents (5-10x on small ones) diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..0bb1f73 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,45 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:diegocconsolini/ClaudeSkillCollection:docx-smart-extractor", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "565f6627613c4011d67057879156284e5517cf14", + "treeHash": "1e87a4583d1a017840c40cc18a4f05427b24f4579ec6696ac70d31d79467ba76", + "generatedAt": "2025-11-28T10:16:26.661806Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "docx-smart-extractor", + "description": "Extract and analyze Word documents (1MB-50MB+) with minimal token usage. 
Lossless extraction of all text, tables, formatting, and document structure while achieving 10-50x token reduction through local extraction, semantic chunking by headings, and intelligent caching.", + "version": "2.2.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "9d02a5d38bc58f03f710a8b70d8ccf3f8c5ef209d07aa8be4a8e113827ab52cd" + }, + { + "path": "agents/docx-smart-extractor.md", + "sha256": "aa025bbed600ab07a415030c60db4ee0c358d619b3d76c5bad41b38031639467" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "ca28c7ceb9dc8e7b72e924678351d37d7b5a6b179662ffe4a51a5810f5017e4d" + } + ], + "dirSha256": "1e87a4583d1a017840c40cc18a4f05427b24f4579ec6696ac70d31d79467ba76" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file