Initial commit
skills/extract_from_pdfs/scripts/06_export_database.py (new file, +345 lines)
@@ -0,0 +1,345 @@
#!/usr/bin/env python3
"""
Export validated data to various analysis formats.
Supports Python (pandas pickle), R (RDS), CSV, JSON, Excel, and SQLite.
"""

import argparse
import json
import csv
from pathlib import Path
from typing import Dict, List
import sys


def parse_args():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(
        description='Export validated data to analysis format'
    )
    parser.add_argument(
        '--input',
        required=True,
        help='Input JSON file with validated data from step 05'
    )
    parser.add_argument(
        '--format',
        choices=['python', 'r', 'csv', 'json', 'excel', 'sqlite'],
        required=True,
        help='Output format'
    )
    parser.add_argument(
        '--output',
        required=True,
        help='Output file path (without extension for some formats)'
    )
    parser.add_argument(
        '--flatten',
        action='store_true',
        help='Flatten nested JSON structures for tabular formats'
    )
    parser.add_argument(
        '--include-metadata',
        action='store_true',
        help='Include original paper metadata in output'
    )
    return parser.parse_args()
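
# A couple of example invocations, for orientation; the file names here are
# illustrative, the flags are the ones defined in parse_args() above:
#   python 06_export_database.py --input validated.json --format csv --output results.csv
#   python 06_export_database.py --input validated.json --format sqlite --output results --flatten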


def load_results(input_path: Path) -> Dict:
    """Load validated results from JSON file"""
    with open(input_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def flatten_dict(d: Dict, parent_key: str = '', sep: str = '_') -> Dict:
    """
    Flatten nested dictionary structure.
    Useful for converting JSON to tabular format.
    """
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        elif isinstance(v, list):
            # Convert lists to comma-separated strings
            if v and isinstance(v[0], dict):
                # List of dicts - create numbered columns
                for i, item in enumerate(v):
                    items.extend(flatten_dict(item, f"{new_key}_{i}", sep=sep).items())
            else:
                # Simple list
                items.append((new_key, ', '.join(str(x) for x in v)))
        else:
            items.append((new_key, v))
    return dict(items)
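
# A worked example of the flattening above (illustrative input, not pipeline
# data): nested dicts become joined keys, simple lists become comma-separated
# strings, and lists of dicts become numbered columns:
#   flatten_dict({'a': {'b': 1}, 'c': [2, 3]})  ->  {'a_b': 1, 'c': '2, 3'}
#   flatten_dict({'c': [{'x': 1}, {'x': 2}]})   ->  {'c_0_x': 1, 'c_1_x': 2}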


def extract_records(results: Dict, flatten: bool = False, include_metadata: bool = False) -> List[Dict]:
    """
    Extract records from results structure.
    Returns a list of dictionaries suitable for tabular export.
    """
    records = []

    for paper_id, result in results.items():
        if result.get('status') != 'success':
            continue

        # Get the validated data (or fall back to extracted data)
        data = result.get('validated_data', result.get('extracted_data', {}))

        if not data:
            continue

        # Check if data contains nested records or is a single record
        if 'records' in data and isinstance(data['records'], list):
            # Multiple records per paper
            for record in data['records']:
                record_dict = record.copy() if isinstance(record, dict) else {'value': record}

                # Add paper-level fields
                if include_metadata:
                    record_dict['paper_id'] = paper_id
                    for key in data:
                        if key != 'records':
                            record_dict[f'paper_{key}'] = data[key]

                if flatten:
                    record_dict = flatten_dict(record_dict)

                records.append(record_dict)
        else:
            # Single record per paper
            record_dict = data.copy()
            if include_metadata:
                record_dict['paper_id'] = paper_id

            if flatten:
                record_dict = flatten_dict(record_dict)

            records.append(record_dict)

    return records
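
# Illustrative input shape for extract_records; the key names follow the
# lookups above ('status', 'validated_data', 'records'), the values are made up:
#   {'paper_001': {'status': 'success',
#                  'validated_data': {'records': [{'n': 42}, {'n': 17}]}}}
# yields two rows; with include_metadata=True each row also carries 'paper_id',
# and with flatten=True nested dicts go through flatten_dict.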


def export_to_csv(records: List[Dict], output_path: Path):
    """Export to CSV format"""
    if not records:
        print("No records to export")
        return

    # Get all possible field names
    fieldnames = set()
    for record in records:
        fieldnames.update(record.keys())
    fieldnames = sorted(fieldnames)

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(records)

    print(f"Exported {len(records)} records to CSV: {output_path}")


def export_to_json(records: List[Dict], output_path: Path):
    """Export to JSON format"""
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    print(f"Exported {len(records)} records to JSON: {output_path}")


def export_to_python(records: List[Dict], output_path: Path):
    """Export to Python format (pandas DataFrame pickle)"""
    try:
        import pandas as pd
    except ImportError:
        print("Error: pandas is required for Python export. Install with: pip install pandas")
        sys.exit(1)

    df = pd.DataFrame(records)

    # Save as pickle
    pickle_path = output_path.with_suffix('.pkl')
    df.to_pickle(pickle_path)
    print(f"Exported {len(records)} records to pandas pickle: {pickle_path}")

    # Also create a Python script to load it
    script_path = output_path.with_suffix('.py')
    script_content = f'''#!/usr/bin/env python3
"""
Data loading script
Generated by extract_from_pdfs skill
"""

import pandas as pd

# Load the data
df = pd.read_pickle('{pickle_path.name}')

print(f"Loaded {{len(df)}} records")
print(f"Columns: {{list(df.columns)}}")
print("\\nFirst few rows:")
print(df.head())

# Example analyses:
# df.describe()
# df.groupby('some_column').size()
# df.to_csv('output.csv', index=False)
'''

    with open(script_path, 'w') as f:
        f.write(script_content)

    print(f"Created loading script: {script_path}")


def export_to_r(records: List[Dict], output_path: Path):
    """Export to R format (RDS file)"""
    try:
        import pandas as pd
        import pyreadr
    except ImportError:
        print("Error: pandas and pyreadr are required for R export.")
        print("Install with: pip install pandas pyreadr")
        sys.exit(1)

    df = pd.DataFrame(records)

    # Save as RDS (pyreadr expects the path as a string)
    rds_path = output_path.with_suffix('.rds')
    pyreadr.write_rds(str(rds_path), df)
    print(f"Exported {len(records)} records to RDS: {rds_path}")

    # Also create an R script to load it
    script_path = output_path.with_suffix('.R')
    script_content = f'''# Data loading script
# Generated by extract_from_pdfs skill

# Load the data
data <- readRDS('{rds_path.name}')

cat(sprintf("Loaded %d records\\n", nrow(data)))
cat(sprintf("Columns: %s\\n", paste(colnames(data), collapse=", ")))
cat("\\nFirst few rows:\\n")
print(head(data))

# Example analyses:
# summary(data)
# table(data$some_column)
# write.csv(data, 'output.csv', row.names=FALSE)
'''

    with open(script_path, 'w') as f:
        f.write(script_content)

    print(f"Created loading script: {script_path}")


def export_to_excel(records: List[Dict], output_path: Path):
    """Export to Excel format"""
    try:
        import pandas as pd
        import openpyxl  # noqa: F401 -- used as the to_excel engine below
    except ImportError:
        print("Error: pandas and openpyxl are required for Excel export. Install with: pip install pandas openpyxl")
        sys.exit(1)

    df = pd.DataFrame(records)

    # Save as Excel
    excel_path = output_path.with_suffix('.xlsx')
    df.to_excel(excel_path, index=False, engine='openpyxl')
    print(f"Exported {len(records)} records to Excel: {excel_path}")


def export_to_sqlite(records: List[Dict], output_path: Path):
    """Export to SQLite database"""
    try:
        import pandas as pd
        import sqlite3
    except ImportError:
        print("Error: pandas is required for SQLite export. Install with: pip install pandas")
        sys.exit(1)

    df = pd.DataFrame(records)

    # Create database
    db_path = output_path.with_suffix('.db')
    conn = sqlite3.connect(db_path)

    # Write to database
    table_name = 'extracted_data'
    df.to_sql(table_name, conn, if_exists='replace', index=False)

    conn.close()
    print(f"Exported {len(records)} records to SQLite database: {db_path}")
    print(f"Table name: {table_name}")

    # Create SQL script with example queries
    sql_script_path = output_path.with_suffix('.sql')
    sql_content = f'''-- Example SQL queries for {db_path.name}
-- Generated by extract_from_pdfs skill

-- View all records
SELECT * FROM {table_name} LIMIT 10;

-- Count total records
SELECT COUNT(*) as total_records FROM {table_name};

-- Example: Group by a column (adjust column name as needed)
-- SELECT column_name, COUNT(*) as count
-- FROM {table_name}
-- GROUP BY column_name
-- ORDER BY count DESC;
'''

    with open(sql_script_path, 'w') as f:
        f.write(sql_content)

    print(f"Created SQL example script: {sql_script_path}")


def main():
    args = parse_args()

    # Load validated results
    results = load_results(Path(args.input))
    print(f"Loaded {len(results)} results")

    # Extract records
    records = extract_records(
        results,
        flatten=args.flatten,
        include_metadata=args.include_metadata
    )
    print(f"Extracted {len(records)} records")

    if not records:
        print("No records to export. Check your data.")
        return

    # Export based on format
    output_path = Path(args.output)

    if args.format == 'csv':
        export_to_csv(records, output_path)
    elif args.format == 'json':
        export_to_json(records, output_path)
    elif args.format == 'python':
        export_to_python(records, output_path)
    elif args.format == 'r':
        export_to_r(records, output_path)
    elif args.format == 'excel':
        export_to_excel(records, output_path)
    elif args.format == 'sqlite':
        export_to_sqlite(records, output_path)

    print("\nExport complete!")
    print(f"Your data is ready for analysis in {args.format.upper()} format.")


if __name__ == '__main__':
    main()