Files
gh-brunoasm-my-claude-skill…/skills/extract_from_pdfs/scripts/04_repair_json.py
2025-11-29 18:02:40 +08:00

228 lines
6.8 KiB
Python

#!/usr/bin/env python3
"""
Repair and validate JSON extractions using json_repair library.
Handles common JSON parsing issues and validates against schema.
"""
import argparse
import json
from pathlib import Path
from typing import Dict, Any, Optional
import jsonschema
try:
from json_repair import repair_json
JSON_REPAIR_AVAILABLE = True
except ImportError:
JSON_REPAIR_AVAILABLE = False
print("Warning: json_repair not installed. Install with: pip install json-repair")
def parse_args():
"""Parse command line arguments"""
parser = argparse.ArgumentParser(
description='Repair and validate JSON extractions'
)
parser.add_argument(
'--input',
required=True,
help='Input JSON file with extraction results from step 03'
)
parser.add_argument(
'--output',
default='cleaned_extractions.json',
help='Output JSON file with cleaned results'
)
parser.add_argument(
'--schema',
help='Optional: JSON schema file for validation'
)
parser.add_argument(
'--strict',
action='store_true',
help='Strict mode: reject records that fail validation'
)
return parser.parse_args()
def load_results(input_path: Path) -> Dict:
"""Load extraction results from JSON file"""
with open(input_path, 'r', encoding='utf-8') as f:
return json.load(f)
def load_schema(schema_path: Path) -> Dict:
"""Load JSON schema for validation"""
with open(schema_path, 'r', encoding='utf-8') as f:
schema_data = json.load(f)
return schema_data.get('output_schema', schema_data)
def save_results(results: Dict, output_path: Path):
"""Save cleaned results to JSON file"""
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(results, f, indent=2, ensure_ascii=False)
def repair_json_data(data: Any) -> tuple[Any, bool]:
"""
Attempt to repair JSON data using json_repair library.
Returns (repaired_data, success)
"""
if not JSON_REPAIR_AVAILABLE:
return data, True # Skip repair if library not available
try:
# Convert to JSON string and back to repair
json_str = json.dumps(data)
repaired_str = repair_json(json_str, return_objects=False)
repaired_data = json.loads(repaired_str)
return repaired_data, True
except Exception as e:
print(f"Failed to repair JSON: {e}")
return data, False
def validate_against_schema(data: Any, schema: Dict) -> tuple[bool, Optional[str]]:
"""
Validate data against JSON schema.
Returns (is_valid, error_message)
"""
try:
jsonschema.validate(instance=data, schema=schema)
return True, None
except jsonschema.exceptions.ValidationError as e:
return False, str(e)
except Exception as e:
return False, f"Validation error: {str(e)}"
def clean_extraction_result(
result: Dict,
schema: Optional[Dict] = None,
strict: bool = False
) -> Dict:
"""
Clean and validate a single extraction result.
Returns updated result with:
- repaired_data: Repaired JSON if repair was needed
- validation_status: 'valid', 'invalid', or 'repaired'
- validation_errors: List of validation errors if any
"""
if result.get('status') != 'success':
return result # Skip non-successful results
extracted_data = result.get('extracted_data')
if not extracted_data:
result['validation_status'] = 'invalid'
result['validation_errors'] = ['No extracted data found']
if strict:
result['status'] = 'failed_validation'
return result
# Try to repair JSON
repaired_data, repair_success = repair_json_data(extracted_data)
# Validate against schema if provided
validation_errors = []
if schema:
is_valid, error_msg = validate_against_schema(repaired_data, schema)
if not is_valid:
validation_errors.append(error_msg)
if strict:
result['status'] = 'failed_validation'
# Update result
if repaired_data != extracted_data and repair_success:
result['extracted_data'] = repaired_data
result['validation_status'] = 'repaired'
elif validation_errors:
result['validation_status'] = 'invalid'
else:
result['validation_status'] = 'valid'
if validation_errors:
result['validation_errors'] = validation_errors
return result
def main():
args = parse_args()
# Load inputs
results = load_results(Path(args.input))
print(f"Loaded {len(results)} extraction results")
schema = None
if args.schema:
schema = load_schema(Path(args.schema))
print(f"Loaded validation schema from {args.schema}")
# Clean each result
cleaned_results = {}
stats = {
'total': len(results),
'valid': 0,
'repaired': 0,
'invalid': 0,
'failed': 0
}
for record_id, result in results.items():
cleaned_result = clean_extraction_result(result, schema, args.strict)
cleaned_results[record_id] = cleaned_result
# Update statistics
if cleaned_result.get('status') == 'success':
status = cleaned_result.get('validation_status', 'unknown')
if status == 'valid':
stats['valid'] += 1
elif status == 'repaired':
stats['repaired'] += 1
elif status == 'invalid':
stats['invalid'] += 1
else:
stats['failed'] += 1
# Save cleaned results
output_path = Path(args.output)
save_results(cleaned_results, output_path)
# Print summary
print(f"\n{'='*60}")
print("JSON Repair and Validation Summary")
print(f"{'='*60}")
print(f"Total records: {stats['total']}")
print(f"Valid JSON: {stats['valid']}")
print(f"Repaired JSON: {stats['repaired']}")
print(f"Invalid JSON: {stats['invalid']}")
print(f"Failed extractions: {stats['failed']}")
if schema:
validation_rate = (stats['valid'] + stats['repaired']) / stats['total'] * 100
print(f"\nValidation rate: {validation_rate:.1f}%")
print(f"\nCleaned results saved to: {output_path}")
# Print examples of validation errors
if stats['invalid'] > 0:
print(f"\nShowing first 3 validation errors:")
error_count = 0
for record_id, result in cleaned_results.items():
if result.get('validation_errors'):
print(f"\n{record_id}:")
for error in result['validation_errors'][:2]:
print(f" - {error[:200]}")
error_count += 1
if error_count >= 3:
break
print(f"\nNext step: Validate and enrich data with external APIs")
if __name__ == '__main__':
main()