# PDF Table Extraction Guide

Advanced table extraction strategies for production environments.

## Table of contents

- Basic table extraction
- Table extraction strategies
- Multi-page tables
- Complex table structures
- Export formats
- Table detection algorithms
- Custom extraction rules
- Performance optimization
- Production examples
- Best practices
- Troubleshooting

## Basic table extraction
### Using pdfplumber (recommended)

```python
import pdfplumber

with pdfplumber.open("report.pdf") as pdf:
    page = pdf.pages[0]
    tables = page.extract_tables()

    for i, table in enumerate(tables):
        print(f"\nTable {i + 1}:")
        for row in table:
            print(row)
```

### Using included script

```bash
python scripts/extract_tables.py report.pdf --output tables.csv
```

Output:

```csv
Name,Age,City
John Doe,30,New York
Jane Smith,25,Los Angeles
Bob Johnson,35,Chicago
```
## Table extraction strategies

### Strategy 1: Automatic detection

Let pdfplumber auto-detect tables:

```python
import pdfplumber

with pdfplumber.open("document.pdf") as pdf:
    for page_num, page in enumerate(pdf.pages, 1):
        tables = page.extract_tables()

        if tables:
            print(f"Found {len(tables)} table(s) on page {page_num}")

            for table_num, table in enumerate(tables, 1):
                print(f"\nTable {table_num}:")
                # First row is usually headers
                headers = table[0]
                print(f"Columns: {headers}")

                # Data rows
                for row in table[1:]:
                    print(row)
```
### Strategy 2: Custom table settings

Fine-tune detection with custom settings:

```python
import pdfplumber

table_settings = {
    "vertical_strategy": "lines",  # or "text", "lines_strict"
    "horizontal_strategy": "lines",
    "explicit_vertical_lines": [],
    "explicit_horizontal_lines": [],
    "snap_tolerance": 3,
    "join_tolerance": 3,
    "edge_min_length": 3,
    "min_words_vertical": 3,
    "min_words_horizontal": 1,
    "keep_blank_chars": False,
    "text_tolerance": 3,
    "text_x_tolerance": 3,
    "text_y_tolerance": 3,
    "intersection_tolerance": 3
}

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]
    tables = page.extract_tables(table_settings=table_settings)
```
### Strategy 3: Explicit boundaries

Define table boundaries manually:

```python
import pdfplumber

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Define bounding box (x0, top, x1, bottom)
    bbox = (50, 100, 550, 700)

    # Extract tables within the bounding box
    cropped = page.within_bbox(bbox)
    tables = cropped.extract_tables()
```
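Coordinates are easy to get wrong, so it helps to draw the box on a page image before extracting. A minimal sketch using pdfplumber's drawing helpers (the file name is a placeholder):

```python
import pdfplumber

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]
    bbox = (50, 100, 550, 700)

    # Render the page and outline the intended crop region
    im = page.to_image(resolution=150)
    im.draw_rect(bbox)
    im.save("bbox_check.png")
```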
## Multi-page tables

### Detect and merge multi-page tables

```python
import pdfplumber

def extract_multipage_table(pdf_path, start_page=0, end_page=None):
    """Extract a table that spans multiple pages (end_page is exclusive)."""

    all_rows = []
    headers = None

    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages[start_page:end_page]

        for page_num, page in enumerate(pages):
            tables = page.extract_tables()

            if not tables:
                continue

            # Assume the first table on the page continues the spanning table
            table = tables[0]

            if page_num == 0:
                # First page: capture headers and data
                headers = table[0]
                all_rows.extend(table[1:])
            else:
                # Subsequent pages: skip headers if they repeat
                if table[0] == headers:
                    all_rows.extend(table[1:])
                else:
                    all_rows.extend(table)

    return ([headers] + all_rows) if headers else all_rows

# Usage
table = extract_multipage_table("report.pdf", start_page=2, end_page=5)

print(f"Extracted {len(table) - 1} rows")
print(f"Columns: {table[0]}")
```
## Complex table structures

### Handling merged cells

```python
import pdfplumber

def handle_merged_cells(table):
    """Process a table with merged cells."""

    processed = []

    for row in table:
        new_row = []
        last_value = None

        for cell in row:
            if cell is None or cell == "":
                # Merged cell: reuse the value from the cell to its left
                new_row.append(last_value)
            else:
                new_row.append(cell)
                last_value = cell

        processed.append(new_row)

    return processed

# Usage
with pdfplumber.open("document.pdf") as pdf:
    table = pdf.pages[0].extract_tables()[0]
    clean_table = handle_merged_cells(table)
```
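`handle_merged_cells` fills horizontally merged cells from the left. Vertically merged cells (one value spanning several rows) can be filled the same way, top-down; a sketch of that variant:

```python
def fill_down_merged_cells(table):
    """Fill vertically merged cells with the value from the row above."""
    filled = []
    for row_num, row in enumerate(table):
        new_row = list(row)
        for i, cell in enumerate(new_row):
            if (cell is None or cell == "") and row_num > 0:
                # Empty cell below a filled one: assume a vertical merge
                new_row[i] = filled[row_num - 1][i]
        filled.append(new_row)
    return filled
```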
### Nested tables

```python
def extract_nested_tables(page, bbox):
    """Extract nested tables from a region."""

    cropped = page.within_bbox(bbox)

    # Try to detect sub-regions with tables
    tables = cropped.extract_tables()

    result = []
    for table in tables:
        # Process each nested table
        if table:
            result.append({
                "type": "nested",
                "data": table
            })

    return result
```
### Tables with varying column counts

```python
def normalize_table_columns(table):
    """Normalize a table with inconsistent column counts."""

    if not table:
        return table

    # Find the max column count
    max_cols = max(len(row) for row in table)

    # Pad short rows
    normalized = []
    for row in table:
        if len(row) < max_cols:
            # Pad with empty strings
            row = row + [""] * (max_cols - len(row))
        normalized.append(row)

    return normalized
```
## Export formats

### Export to CSV

```python
import csv

def export_to_csv(table, output_path):
    """Export table to CSV."""

    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(table)

# Usage ("extract_table" stands in for any extractor from this guide)
table = extract_table("report.pdf")
export_to_csv(table, "output.csv")
```
### Export to Excel

```python
import pandas as pd

def export_to_excel(tables, output_path):
    """Export multiple tables to Excel, one sheet per table."""

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        for i, table in enumerate(tables):
            if not table:
                continue

            # Convert to DataFrame
            headers = table[0]
            data = table[1:]
            df = pd.DataFrame(data, columns=headers)

            # Write to sheet
            sheet_name = f"Table_{i + 1}"
            df.to_excel(writer, sheet_name=sheet_name, index=False)

            # Auto-adjust column widths (skip empty cells)
            worksheet = writer.sheets[sheet_name]
            for column in worksheet.columns:
                max_length = 0
                column_letter = column[0].column_letter
                for cell in column:
                    if cell.value is not None:
                        max_length = max(max_length, len(str(cell.value)))
                worksheet.column_dimensions[column_letter].width = max_length + 2

# Usage
tables = extract_all_tables("report.pdf")
export_to_excel(tables, "output.xlsx")
```
### Export to JSON

```python
import json

def export_to_json(table, output_path):
    """Export table to JSON."""

    if not table:
        return

    headers = table[0]
    data = table[1:]

    # Convert to a list of dictionaries
    records = []
    for row in data:
        record = {}
        for i, header in enumerate(headers):
            value = row[i] if i < len(row) else None
            record[header] = value
        records.append(record)

    # Save to JSON
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)

# Usage
table = extract_table("report.pdf")
export_to_json(table, "output.json")
```
## Table detection algorithms

### Visual debugging

```python
import pdfplumber

def visualize_table_detection(pdf_path, page_num=0, output_path="debug.png"):
    """Visualize the detected table structure."""

    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_num]

        # Draw the detected table lines
        im = page.to_image(resolution=150)
        im = im.debug_tablefinder()
        im.save(output_path)

    print(f"Saved debug image to {output_path}")

# Usage
visualize_table_detection("document.pdf", page_num=0)
```
### Algorithm: Line-based detection

Best for tables with visible borders:

```python
table_settings = {
    "vertical_strategy": "lines",
    "horizontal_strategy": "lines"
}

tables = page.extract_tables(table_settings=table_settings)
```
### Algorithm: Text-based detection

Best for tables without borders:

```python
table_settings = {
    "vertical_strategy": "text",
    "horizontal_strategy": "text"
}

tables = page.extract_tables(table_settings=table_settings)
```
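When the text strategy splits one column into two (or merges neighbors), the text tolerances are usually the knob to turn. A sketch, assuming the `page` object from the examples above:

```python
table_settings = {
    "vertical_strategy": "text",
    "horizontal_strategy": "text",
    # A larger x-tolerance groups characters into words more aggressively,
    # which can keep loosely spaced columns from being split apart
    "text_x_tolerance": 5
}

tables = page.extract_tables(table_settings=table_settings)
```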
### Algorithm: Explicit lines

For complex layouts, define lines manually:

```python
# Define vertical lines at x-coordinates
vertical_lines = [50, 150, 250, 350, 450, 550]

# Define horizontal lines at y-coordinates
horizontal_lines = [100, 130, 160, 190, 220, 250]

table_settings = {
    # The "explicit" strategy tells pdfplumber to use only the lines below
    "vertical_strategy": "explicit",
    "horizontal_strategy": "explicit",
    "explicit_vertical_lines": vertical_lines,
    "explicit_horizontal_lines": horizontal_lines
}

tables = page.extract_tables(table_settings=table_settings)
```
## Custom extraction rules

### Rule-based extraction

```python
def extract_with_rules(page, rules):
    """Extract tables using custom rules."""

    # Rule: "Headers are bold"
    if rules.get("bold_headers"):
        chars = page.chars
        bold_chars = [c for c in chars if "Bold" in c.get("fontname", "")]
        # Use bold chars to identify the header row (see the sketch below)

    # Rule: "First column is always left-aligned"
    if rules.get("left_align_first_col"):
        # Adjust extraction to respect alignment
        pass

    # Rule: "Currency values in last column"
    if rules.get("currency_last_col"):
        # Parse currency format
        pass

    # Extract with adjusted settings
    return page.extract_tables()
```
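As one concrete way to act on the "headers are bold" rule: pdfplumber exposes per-character font names, so the vertical extent of bold text can locate the header row. A minimal sketch; `find_bold_header_bottom` is a hypothetical helper, not a pdfplumber API:

```python
def find_bold_header_bottom(page):
    """Return the y-coordinate just below the bold header text, or None."""
    bold_chars = [c for c in page.chars if "Bold" in c.get("fontname", "")]
    if not bold_chars:
        return None
    # Assumes bold text appears only in the header row
    return max(c["bottom"] for c in bold_chars)

# Rows of a detected table whose top edge lies above this cutoff
# can then be treated as header rows.
```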
### Post-processing rules

```python
from datetime import datetime

def apply_post_processing(table, rules):
    """Apply post-processing rules to an extracted table."""

    processed = []

    for row in table:
        new_row = []

        for i, cell in enumerate(row):
            value = cell

            # Rule: Strip whitespace
            if rules.get("strip_whitespace"):
                value = value.strip() if value else value

            # Rule: Convert currency to float
            if rules.get("parse_currency") and i == len(row) - 1:
                if value and "$" in value:
                    value = float(value.replace("$", "").replace(",", ""))

            # Rule: Parse dates in the first column (assumes ISO format)
            if rules.get("parse_dates") and i == 0 and value:
                try:
                    value = datetime.strptime(value, "%Y-%m-%d")
                except ValueError:
                    pass  # leave non-date values untouched

            new_row.append(value)

        processed.append(new_row)

    return processed
```
## Performance optimization

### Process large PDFs efficiently

```python
import gc

import pdfplumber

def extract_tables_optimized(pdf_path):
    """Extract tables with memory optimization."""

    results = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            print(f"Processing page {page_num + 1}/{len(pdf.pages)}")

            # Extract tables from the current page
            tables = page.extract_tables()
            results.extend(tables)

            # Release per-page caches, then force garbage collection
            page.flush_cache()
            gc.collect()

    return results
```
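For pipelines that only need one table at a time, a generator avoids accumulating every table in memory at once. A sketch:

```python
import pdfplumber

def iter_tables(pdf_path):
    """Yield (page_number, table) pairs one at a time."""
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            for table in page.extract_tables():
                yield page_num, table
            page.flush_cache()  # release per-page caches as we go

# Usage
for page_num, table in iter_tables("large_report.pdf"):
    print(f"Page {page_num}: {len(table)} rows")
```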
### Parallel processing

```python
from concurrent.futures import ProcessPoolExecutor

import pdfplumber

def extract_page_tables(args):
    """Extract tables from a single page."""
    pdf_path, page_num = args

    # Each worker reopens the PDF: pdfplumber page objects
    # cannot be pickled and sent between processes.
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_num]
        return page.extract_tables()

def extract_tables_parallel(pdf_path, max_workers=4):
    """Extract tables using multiple processes."""

    with pdfplumber.open(pdf_path) as pdf:
        page_count = len(pdf.pages)

    # Create one task per page
    tasks = [(pdf_path, i) for i in range(page_count)]

    # Process in parallel
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(extract_page_tables, tasks))

    # Flatten results
    all_tables = []
    for page_tables in results:
        all_tables.extend(page_tables)

    return all_tables
```
## Production examples

### Example 1: Financial report extraction

```python
import pdfplumber
import pandas as pd

def extract_financial_tables(pdf_path):
    """Extract financial data with proper number formatting."""

    tables = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_tables = page.extract_tables()

            for table in page_tables:
                # Convert to DataFrame
                df = pd.DataFrame(table[1:], columns=table[0])

                # Parse currency columns ("$" is a regex metacharacter,
                # so match it literally)
                for col in df.columns:
                    if df[col].str.contains("$", regex=False, na=False).any():
                        df[col] = df[col].str.replace(r"[$,()]", "", regex=True)
                        df[col] = pd.to_numeric(df[col], errors="coerce")

                tables.append(df)

    return tables
```
### Example 2: Batch table extraction

```python
from pathlib import Path

def batch_extract_tables(input_dir, output_dir):
    """Extract tables from all PDFs in a directory."""

    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)

    for pdf_file in input_path.glob("*.pdf"):
        print(f"Processing: {pdf_file.name}")

        try:
            # Extract tables ("extract_all_tables" is any extractor
            # returning a list of tables)
            tables = extract_all_tables(str(pdf_file))

            # Export to Excel
            output_file = output_path / f"{pdf_file.stem}_tables.xlsx"
            export_to_excel(tables, str(output_file))

            print(f"  ✓ Extracted {len(tables)} table(s)")

        except Exception as e:
            print(f"  ✗ Error: {e}")

# Usage
batch_extract_tables("invoices/", "extracted/")
```
## Best practices

1. **Visualize first**: Use debug mode to understand the table structure
2. **Test settings**: Try different strategies for the best results
3. **Handle errors**: PDFs vary widely in quality
4. **Validate output**: Check that the extracted data makes sense
5. **Post-process**: Clean and normalize extracted data
6. **Use pandas**: Leverage DataFrame operations for analysis
7. **Cache results**: Avoid re-processing large files (see the sketch after this list)
8. **Monitor performance**: Profile for bottlenecks
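For practice 7, a minimal caching sketch: key the cache on a hash of the file's bytes so that edits invalidate it. `extract_all_tables` stands in for any extractor from this guide, and the cache directory name is arbitrary:

```python
import hashlib
import json
from pathlib import Path

def extract_tables_cached(pdf_path, cache_dir=".table_cache"):
    """Reuse previously extracted tables for an unchanged file."""
    digest = hashlib.sha256(Path(pdf_path).read_bytes()).hexdigest()
    cache_file = Path(cache_dir) / f"{digest}.json"

    if cache_file.exists():
        return json.loads(cache_file.read_text())

    tables = extract_all_tables(pdf_path)
    cache_file.parent.mkdir(exist_ok=True)
    cache_file.write_text(json.dumps(tables))
    return tables
```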
## Troubleshooting

### Tables not detected

1. Try different detection strategies
2. Use visual debugging to see the structure
3. Define explicit lines manually
4. Check whether the table is actually an image (see the sketch after this list)
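For step 4, a quick heuristic: a page that contains images but almost no text characters is likely a scan, which needs OCR before any table extraction. A sketch:

```python
import pdfplumber

def looks_like_scanned_page(pdf_path, page_num=0):
    """Heuristic: images present but (almost) no extractable text."""
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_num]
        return len(page.images) > 0 and len(page.chars) < 10
```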
### Incorrect cell values

1. Adjust the snap/join tolerances
2. Check the text extraction quality
3. Use post-processing to clean the data
4. Verify that the PDF is not a scanned image

### Performance issues

1. Process pages individually
2. Use parallel processing
3. Reduce image resolution
4. Extract only the pages you need (see the sketch after this list)
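For item 4, slicing `pdf.pages` means only the pages in the slice are processed; a sketch:

```python
import pdfplumber

with pdfplumber.open("report.pdf") as pdf:
    # Only pages 11-15 (zero-indexed slice) are searched for tables
    for page in pdf.pages[10:15]:
        tables = page.extract_tables()
        page.flush_cache()  # free per-page caches as you go
```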