# PDF Table Extraction Guide
Advanced table extraction strategies for production environments.

## Table of contents

- Basic table extraction
- Table extraction strategies
- Multi-page tables
- Complex table structures
- Export formats
- Table detection algorithms
- Custom extraction rules
- Performance optimization
- Production examples
- Best practices
- Troubleshooting

## Basic table extraction
### Using pdfplumber (recommended)
```python
import pdfplumber

with pdfplumber.open("report.pdf") as pdf:
    page = pdf.pages[0]
    tables = page.extract_tables()

    for i, table in enumerate(tables):
        print(f"\nTable {i + 1}:")
        for row in table:
            print(row)
```

### Using included script
```bash
python scripts/extract_tables.py report.pdf --output tables.csv
```

Output:
```csv
Name,Age,City
John Doe,30,New York
Jane Smith,25,Los Angeles
Bob Johnson,35,Chicago
```

## Table extraction strategies
### Strategy 1: Automatic detection
Let pdfplumber auto-detect tables:
```python
import pdfplumber

with pdfplumber.open("document.pdf") as pdf:
    for page_num, page in enumerate(pdf.pages, 1):
        tables = page.extract_tables()

        if tables:
            print(f"Found {len(tables)} table(s) on page {page_num}")

            for table_num, table in enumerate(tables, 1):
                print(f"\nTable {table_num}:")

                # First row is usually headers
                headers = table[0]
                print(f"Columns: {headers}")

                # Data rows
                for row in table[1:]:
                    print(row)
```

### Strategy 2: Custom table settings
Fine-tune detection with custom settings:
```python
import pdfplumber

table_settings = {
    "vertical_strategy": "lines",    # or "text", "lines_strict"
    "horizontal_strategy": "lines",
    "explicit_vertical_lines": [],
    "explicit_horizontal_lines": [],
    "snap_tolerance": 3,
    "join_tolerance": 3,
    "edge_min_length": 3,
    "min_words_vertical": 3,
    "min_words_horizontal": 1,
    "keep_blank_chars": False,
    "text_tolerance": 3,
    "text_x_tolerance": 3,
    "text_y_tolerance": 3,
    "intersection_tolerance": 3,
}

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]
    tables = page.extract_tables(table_settings=table_settings)
```

### Strategy 3: Explicit boundaries
Define table boundaries manually:
```python
import pdfplumber

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Define bounding box (x0, top, x1, bottom)
    bbox = (50, 100, 550, 700)

    # Extract tables within the bounding box
    cropped = page.within_bbox(bbox)
    tables = cropped.extract_tables()
```

## Multi-page tables
### Detect and merge multi-page tables
```python
import pdfplumber

def extract_multipage_table(pdf_path, start_page=0, end_page=None):
    """Extract a table that spans multiple pages."""

    all_rows = []
    headers = None

    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages[start_page:end_page]

        for page_num, page in enumerate(pages):
            tables = page.extract_tables()

            if not tables:
                continue

            # Assume the first table on the page is the continuation
            table = tables[0]

            if page_num == 0:
                # First page: capture headers and data
                headers = table[0]
                all_rows.extend(table[1:])
            else:
                # Subsequent pages: skip headers if they repeat
                if table[0] == headers:
                    all_rows.extend(table[1:])
                else:
                    all_rows.extend(table)

    return ([headers] + all_rows) if headers else all_rows

# Usage
table = extract_multipage_table("report.pdf", start_page=2, end_page=5)

print(f"Extracted {len(table) - 1} rows")
print(f"Columns: {table[0]}")
```

## Complex table structures
### Handling merged cells
```python
import pdfplumber

def handle_merged_cells(table):
    """Process a table with merged cells."""

    processed = []

    for row in table:
        new_row = []
        last_value = None

        for cell in row:
            if cell is None or cell == "":
                # Merged cell - use value from left
                new_row.append(last_value)
            else:
                new_row.append(cell)
                last_value = cell

        processed.append(new_row)

    return processed

# Usage
with pdfplumber.open("document.pdf") as pdf:
    table = pdf.pages[0].extract_tables()[0]
    clean_table = handle_merged_cells(table)
```

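`handle_merged_cells` only fills gaps across a row. For cells merged vertically, where one value spans several rows of the same column, a complementary fill-down pass is a common approach. A minimal sketch (the `fill_down` name is illustrative, not part of pdfplumber):

```python
def fill_down(table):
    """Carry the last non-empty value down each column.

    Sketch for vertically merged cells: empty cells inherit the value
    from the nearest non-empty cell above them in the same column.
    """
    last_seen = {}
    filled = []
    for row in table:
        new_row = []
        for i, cell in enumerate(row):
            if cell is None or cell == "":
                new_row.append(last_seen.get(i))
            else:
                new_row.append(cell)
                last_seen[i] = cell
        filled.append(new_row)
    return filled
```
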
### Nested tables
```python
def extract_nested_tables(page, bbox):
    """Extract nested tables from a region."""

    cropped = page.within_bbox(bbox)

    # Try to detect sub-regions with tables
    tables = cropped.extract_tables()

    result = []
    for table in tables:
        # Process each nested table
        if table:
            result.append({
                "type": "nested",
                "data": table
            })

    return result
```

### Tables with varying column counts
```python
def normalize_table_columns(table):
    """Normalize a table with inconsistent column counts."""

    if not table:
        return table

    # Find max column count
    max_cols = max(len(row) for row in table)

    # Pad short rows
    normalized = []
    for row in table:
        if len(row) < max_cols:
            # Pad with empty strings
            row = row + [""] * (max_cols - len(row))
        normalized.append(row)

    return normalized
```

## Export formats
### Export to CSV
```python
import csv

def export_to_csv(table, output_path):
    """Export a table to CSV."""

    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(table)

# Usage
table = extract_table("report.pdf")
export_to_csv(table, "output.csv")
```

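The usage snippets in this and the following sections call `extract_table` and `extract_all_tables`, helpers this guide references but does not define. A minimal sketch of what they might look like, built from the basic pdfplumber calls shown earlier (both names are illustrative):

```python
import pdfplumber

def extract_all_tables(pdf_path):
    """Collect every table pdfplumber detects, across all pages (illustrative helper)."""
    all_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            all_tables.extend(page.extract_tables())
    return all_tables

def extract_table(pdf_path):
    """Return the first detected table, or an empty list if none (illustrative helper)."""
    tables = extract_all_tables(pdf_path)
    return tables[0] if tables else []
```
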
### Export to Excel
```python
import pandas as pd

def export_to_excel(tables, output_path):
    """Export multiple tables to Excel with one sheet per table."""

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        for i, table in enumerate(tables):
            if not table:
                continue

            # Convert to DataFrame
            headers = table[0]
            data = table[1:]
            df = pd.DataFrame(data, columns=headers)

            # Write to sheet
            sheet_name = f"Table_{i + 1}"
            df.to_excel(writer, sheet_name=sheet_name, index=False)

            # Auto-adjust column widths
            worksheet = writer.sheets[sheet_name]
            for column in worksheet.columns:
                max_length = 0
                column_letter = column[0].column_letter
                for cell in column:
                    if len(str(cell.value)) > max_length:
                        max_length = len(str(cell.value))
                worksheet.column_dimensions[column_letter].width = max_length + 2

# Usage
tables = extract_all_tables("report.pdf")
export_to_excel(tables, "output.xlsx")
```

### Export to JSON
```python
import json

def export_to_json(table, output_path):
    """Export a table to JSON."""

    if not table:
        return

    headers = table[0]
    data = table[1:]

    # Convert to list of dictionaries
    records = []
    for row in data:
        record = {}
        for i, header in enumerate(headers):
            value = row[i] if i < len(row) else None
            record[header] = value
        records.append(record)

    # Save to JSON
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)

# Usage
table = extract_table("report.pdf")
export_to_json(table, "output.json")
```

## Table detection algorithms
### Visual debugging
```python
import pdfplumber

def visualize_table_detection(pdf_path, page_num=0, output_path="debug.png"):
    """Visualize the detected table structure."""

    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_num]

        # Draw detected table lines
        im = page.to_image(resolution=150)
        im = im.debug_tablefinder()
        im.save(output_path)

    print(f"Saved debug image to {output_path}")

# Usage
visualize_table_detection("document.pdf", page_num=0)
```

### Algorithm: Line-based detection
Best for tables with visible borders:
```python
table_settings = {
    "vertical_strategy": "lines",
    "horizontal_strategy": "lines"
}

tables = page.extract_tables(table_settings=table_settings)
```

### Algorithm: Text-based detection
Best for tables without borders:
```python
table_settings = {
    "vertical_strategy": "text",
    "horizontal_strategy": "text"
}

tables = page.extract_tables(table_settings=table_settings)
```

### Algorithm: Explicit lines
For complex layouts, define lines manually:
```python
# Define vertical lines at x-coordinates
vertical_lines = [50, 150, 250, 350, 450, 550]

# Define horizontal lines at y-coordinates
horizontal_lines = [100, 130, 160, 190, 220, 250]

table_settings = {
    # "explicit" tells pdfplumber to rely on the supplied lines
    "vertical_strategy": "explicit",
    "horizontal_strategy": "explicit",
    "explicit_vertical_lines": vertical_lines,
    "explicit_horizontal_lines": horizontal_lines
}

tables = page.extract_tables(table_settings=table_settings)
```

## Custom extraction rules
### Rule-based extraction
```python
def extract_with_rules(page, rules):
    """Extract tables using custom rules."""

    # Rule: "Headers are bold"
    if rules.get("bold_headers"):
        chars = page.chars
        bold_chars = [c for c in chars if "Bold" in c.get("fontname", "")]
        # Use bold chars to identify the header row
        pass

    # Rule: "First column is always left-aligned"
    if rules.get("left_align_first_col"):
        # Adjust extraction to respect alignment
        pass

    # Rule: "Currency values in last column"
    if rules.get("currency_last_col"):
        # Parse currency format
        pass

    # Extract with adjusted settings
    return page.extract_tables()
```

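As a concrete example of the bold-header rule above, one option is to locate the vertical band occupied by bold characters near the top of the page and treat it as the header row. A sketch, assuming the header row is the topmost run of bold text (`find_bold_header_band` and the `band_height` cutoff are illustrative):

```python
def find_bold_header_band(page, band_height=15):
    """Return (top, bottom) of the topmost band of bold text, or None.

    Sketch only: assumes headers are the highest run of bold characters
    and that the header row is at most `band_height` points tall.
    """
    bold_chars = [c for c in page.chars if "Bold" in c.get("fontname", "")]
    if not bold_chars:
        return None

    top = min(c["top"] for c in bold_chars)
    band = [c for c in bold_chars if c["top"] < top + band_height]
    bottom = max(c["bottom"] for c in band)
    return (top, bottom)
```
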
### Post-processing rules
```python
def apply_post_processing(table, rules):
    """Apply post-processing rules to an extracted table."""

    processed = []

    for row in table:
        new_row = []

        for i, cell in enumerate(row):
            value = cell

            # Rule: Strip whitespace
            if rules.get("strip_whitespace"):
                value = value.strip() if value else value

            # Rule: Convert currency to float
            if rules.get("parse_currency") and i == len(row) - 1:
                if value and "$" in value:
                    value = float(value.replace("$", "").replace(",", ""))

            # Rule: Parse dates
            if rules.get("parse_dates") and i == 0:
                # Convert to datetime
                pass

            new_row.append(value)

        processed.append(new_row)

    return processed
```

## Performance optimization
### Process large PDFs efficiently
```python
import gc

import pdfplumber

def extract_tables_optimized(pdf_path):
    """Extract tables with memory optimization."""

    results = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            print(f"Processing page {page_num + 1}/{len(pdf.pages)}")

            # Extract tables from the current page
            tables = page.extract_tables()
            results.extend(tables)

            # Force garbage collection
            gc.collect()

    return results
```

### Parallel processing
```python
from concurrent.futures import ProcessPoolExecutor

import pdfplumber

def extract_page_tables(args):
    """Extract tables from a single page."""
    pdf_path, page_num = args

    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_num]
        return page.extract_tables()

def extract_tables_parallel(pdf_path, max_workers=4):
    """Extract tables using multiple processes."""

    with pdfplumber.open(pdf_path) as pdf:
        page_count = len(pdf.pages)

        # Create tasks
        tasks = [(pdf_path, i) for i in range(page_count)]

        # Process in parallel
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            results = list(executor.map(extract_page_tables, tasks))

        # Flatten results
        all_tables = []
        for page_tables in results:
            all_tables.extend(page_tables)

        return all_tables
```

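Note that `ProcessPoolExecutor` starts new interpreter processes. On platforms that use the spawn start method (Windows, and macOS on recent Python versions), the calling code must be guarded so worker processes can import the module safely; the file name below is illustrative:

```python
if __name__ == "__main__":
    # Required under the spawn start method so workers can re-import this module
    tables = extract_tables_parallel("large_report.pdf", max_workers=4)
    print(f"Extracted {len(tables)} table(s)")
```
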
## Production examples
### Example 1: Financial report extraction
```python
import pdfplumber
import pandas as pd

def extract_financial_tables(pdf_path):
    """Extract financial data with proper number formatting."""

    tables = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_tables = page.extract_tables()

            for table in page_tables:
                # Convert to DataFrame
                df = pd.DataFrame(table[1:], columns=table[0])

                # Parse currency columns ("$" matched literally, not as a regex anchor)
                for col in df.columns:
                    if df[col].str.contains("$", regex=False, na=False).any():
                        df[col] = df[col].str.replace(r"[$,()]", "", regex=True)
                        df[col] = pd.to_numeric(df[col], errors="coerce")

                tables.append(df)

    return tables
```

### Example 2: Batch table extraction
```python
from pathlib import Path

def batch_extract_tables(input_dir, output_dir):
    """Extract tables from all PDFs in a directory."""

    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    for pdf_file in input_path.glob("*.pdf"):
        print(f"Processing: {pdf_file.name}")

        try:
            # Extract tables
            tables = extract_all_tables(str(pdf_file))

            # Export to Excel
            output_file = output_path / f"{pdf_file.stem}_tables.xlsx"
            export_to_excel(tables, str(output_file))

            print(f"  ✓ Extracted {len(tables)} table(s)")

        except Exception as e:
            print(f"  ✗ Error: {e}")

# Usage
batch_extract_tables("invoices/", "extracted/")
```

## Best practices
1. **Visualize first**: Use debug mode to understand the table structure
2. **Test settings**: Try different strategies for best results
3. **Handle errors**: PDFs vary widely in quality
4. **Validate output**: Check that extracted data makes sense
5. **Post-process**: Clean and normalize extracted data
6. **Use pandas**: Leverage DataFrame operations for analysis
7. **Cache results**: Avoid re-processing large files (see the caching sketch after this list)
8. **Monitor performance**: Profile for bottlenecks

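For point 7, a simple file-based cache keyed on the PDF's content hash avoids re-running extraction on unchanged files. A minimal sketch (the cache directory layout and helper name are illustrative):

```python
import hashlib
import json
from pathlib import Path

import pdfplumber

CACHE_DIR = Path(".table_cache")

def extract_tables_cached(pdf_path):
    """Extract tables, reusing a cached result if the file has not changed."""
    CACHE_DIR.mkdir(exist_ok=True)
    digest = hashlib.sha256(Path(pdf_path).read_bytes()).hexdigest()
    cache_file = CACHE_DIR / f"{digest}.json"

    # Cache hit: the file content is identical to a previous run
    if cache_file.exists():
        return json.loads(cache_file.read_text(encoding="utf-8"))

    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            tables.extend(page.extract_tables())

    cache_file.write_text(json.dumps(tables), encoding="utf-8")
    return tables
```
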
## Troubleshooting
### Tables not detected
1. Try different detection strategies (a sketch combining this check with #4 follows the list)
2. Use visual debugging to see the structure
3. Define explicit lines manually
4. Check whether the table is actually an image

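A quick diagnostic for points 1 and 4: if a page has no extractable text it is most likely a scanned image (and needs OCR first); otherwise, sweep the built-in strategies and see which combination finds anything. The `diagnose_page` helper below is an illustrative sketch:

```python
import pdfplumber

def diagnose_page(pdf_path, page_num=0):
    """Report whether a page looks scanned and which strategy combinations find tables."""
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_num]

        # No text layer usually means a scanned page
        if not (page.extract_text() or "").strip():
            print("No text layer found - the page is likely a scanned image (OCR needed).")
            return

        # Try each strategy combination and report how many tables it detects
        for v in ("lines", "text"):
            for h in ("lines", "text"):
                settings = {"vertical_strategy": v, "horizontal_strategy": h}
                tables = page.extract_tables(table_settings=settings)
                print(f"vertical={v}, horizontal={h}: {len(tables)} table(s)")

# Usage
diagnose_page("document.pdf")
```
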
### Incorrect cell values
1. Adjust snap/join tolerance (see the sweep sketch below)
2. Check text extraction quality
3. Use post-processing to clean the data
4. Verify the PDF is not a scanned image

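For point 1, sweeping a few tolerance values and comparing the resulting table shapes is a quick way to find settings that stop cells from being split or merged incorrectly. A sketch (the `sweep_tolerances` helper and the value list are illustrative):

```python
import pdfplumber

def sweep_tolerances(pdf_path, page_num=0, values=(1, 3, 5, 8)):
    """Try several snap/join tolerances and report the shape of the first detected table."""
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_num]

        for tol in values:
            settings = {"snap_tolerance": tol, "join_tolerance": tol}
            tables = page.extract_tables(table_settings=settings)
            if tables:
                rows, cols = len(tables[0]), len(tables[0][0])
                print(f"tolerance={tol}: first table is {rows} rows x {cols} columns")
            else:
                print(f"tolerance={tol}: no tables detected")

# Usage
sweep_tolerances("document.pdf")
```
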
### Performance issues
1. Process pages individually
2. Use parallel processing
3. Reduce image resolution
4. Extract only the needed pages (see the sketch below)

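For point 4, slicing `pdf.pages` keeps the work (and memory use) limited to the pages you actually need. A sketch, with an illustrative helper name and page range:

```python
import pdfplumber

def extract_tables_from_pages(pdf_path, first, last):
    """Extract tables only from pages [first, last), using 0-based indices."""
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages[first:last]:
            tables.extend(page.extract_tables())
    return tables

# Usage: only pages 10-14
tables = extract_tables_from_pages("large_report.pdf", 10, 15)
```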