Initial commit

Zhongwei Li
2025-11-29 18:02:40 +08:00
commit 69617b598e
25 changed files with 5790 additions and 0 deletions

#!/usr/bin/env python3
"""
Export validated data to various analysis formats.
Supports Python (pandas pickle), R (RDS), CSV, JSON, Excel, and SQLite.
"""
import argparse
import csv
import json
import sys
from pathlib import Path
from typing import Dict, List

def parse_args():
"""Parse command line arguments"""
parser = argparse.ArgumentParser(
description='Export validated data to analysis format'
)
parser.add_argument(
'--input',
required=True,
help='Input JSON file with validated data from step 05'
)
parser.add_argument(
'--format',
choices=['python', 'r', 'csv', 'json', 'excel', 'sqlite'],
required=True,
help='Output format'
)
parser.add_argument(
'--output',
required=True,
help='Output file path (without extension for some formats)'
)
parser.add_argument(
'--flatten',
action='store_true',
help='Flatten nested JSON structures for tabular formats'
)
parser.add_argument(
'--include-metadata',
action='store_true',
help='Include original paper metadata in output'
)
return parser.parse_args()

def load_results(input_path: Path) -> Dict:
"""Load validated results from JSON file"""
with open(input_path, 'r', encoding='utf-8') as f:
return json.load(f)

def flatten_dict(d: Dict, parent_key: str = '', sep: str = '_') -> Dict:
"""
Flatten nested dictionary structure.
Useful for converting JSON to tabular format.
"""
items = []
for k, v in d.items():
new_key = f"{parent_key}{sep}{k}" if parent_key else k
if isinstance(v, dict):
items.extend(flatten_dict(v, new_key, sep=sep).items())
        elif isinstance(v, list):
            if v and isinstance(v[0], dict):
                # List of dicts: flatten each element under a numbered key
                for i, item in enumerate(v):
                    items.extend(flatten_dict(item, f"{new_key}_{i}", sep=sep).items())
            else:
                # Simple list: join values into a comma-separated string
                items.append((new_key, ', '.join(str(x) for x in v)))
else:
items.append((new_key, v))
return dict(items)
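
# A quick illustration of flatten_dict on a hypothetical input:
#   flatten_dict({'a': {'b': 1}, 'tags': ['x', 'y']})
#   -> {'a_b': 1, 'tags': 'x, y'}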

def extract_records(results: Dict, flatten: bool = False, include_metadata: bool = False) -> List[Dict]:
"""
Extract records from results structure.
Returns a list of dictionaries suitable for tabular export.
"""
records = []
for paper_id, result in results.items():
if result.get('status') != 'success':
continue
# Get the validated data (or fall back to extracted data)
data = result.get('validated_data', result.get('extracted_data', {}))
if not data:
continue
# Check if data contains nested records or is a single record
if 'records' in data and isinstance(data['records'], list):
# Multiple records per paper
for record in data['records']:
record_dict = record.copy() if isinstance(record, dict) else {'value': record}
# Add paper-level fields
if include_metadata:
record_dict['paper_id'] = paper_id
for key in data:
if key != 'records':
record_dict[f'paper_{key}'] = data[key]
if flatten:
record_dict = flatten_dict(record_dict)
records.append(record_dict)
else:
# Single record per paper
record_dict = data.copy()
if include_metadata:
record_dict['paper_id'] = paper_id
if flatten:
record_dict = flatten_dict(record_dict)
records.append(record_dict)
return records
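
# Input shape this function expects, inferred from the checks above
# (paper ids and field names are illustrative):
#   {"paper_001": {"status": "success",
#                  "validated_data": {"records": [{...}, ...], "title": "..."}},
#    "paper_002": {"status": "error", ...}}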

def export_to_csv(records: List[Dict], output_path: Path):
"""Export to CSV format"""
if not records:
print("No records to export")
return
# Get all possible field names
fieldnames = set()
for record in records:
fieldnames.update(record.keys())
fieldnames = sorted(fieldnames)
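    # DictWriter fills fields missing from a row with '' (its default restval),
    # so writing the union of all keys is safe even for sparse records.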
with open(output_path, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(records)
print(f"Exported {len(records)} records to CSV: {output_path}")

def export_to_json(records: List[Dict], output_path: Path):
"""Export to JSON format"""
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(records, f, indent=2, ensure_ascii=False)
print(f"Exported {len(records)} records to JSON: {output_path}")

def export_to_python(records: List[Dict], output_path: Path):
"""Export to Python format (pandas DataFrame pickle)"""
try:
import pandas as pd
except ImportError:
print("Error: pandas is required for Python export. Install with: pip install pandas")
sys.exit(1)
df = pd.DataFrame(records)
# Save as pickle
pickle_path = output_path.with_suffix('.pkl')
df.to_pickle(pickle_path)
print(f"Exported {len(records)} records to pandas pickle: {pickle_path}")
# Also create a Python script to load it
script_path = output_path.with_suffix('.py')
script_content = f'''#!/usr/bin/env python3
"""
Data loading script
Generated by extract_from_pdfs skill
"""
import pandas as pd
# Load the data
df = pd.read_pickle('{pickle_path.name}')
print(f"Loaded {{len(df)}} records")
print(f"Columns: {{list(df.columns)}}")
print("\\nFirst few rows:")
print(df.head())
# Example analyses:
# df.describe()
# df.groupby('some_column').size()
# df.to_csv('output.csv', index=False)
'''
with open(script_path, 'w') as f:
f.write(script_content)
print(f"Created loading script: {script_path}")

def export_to_r(records: List[Dict], output_path: Path):
"""Export to R format (RDS file)"""
try:
import pandas as pd
import pyreadr
except ImportError:
print("Error: pandas and pyreadr are required for R export.")
print("Install with: pip install pandas pyreadr")
sys.exit(1)
df = pd.DataFrame(records)
# Save as RDS
rds_path = output_path.with_suffix('.rds')
pyreadr.write_rds(rds_path, df)
print(f"Exported {len(records)} records to RDS: {rds_path}")
# Also create an R script to load it
script_path = output_path.with_suffix('.R')
script_content = f'''# Data loading script
# Generated by extract_from_pdfs skill
# Load the data
data <- readRDS('{rds_path.name}')
cat(sprintf("Loaded %d records\\n", nrow(data)))
cat(sprintf("Columns: %s\\n", paste(colnames(data), collapse=", ")))
cat("\\nFirst few rows:\\n")
print(head(data))
# Example analyses:
# summary(data)
# table(data$some_column)
# write.csv(data, 'output.csv', row.names=FALSE)
'''
with open(script_path, 'w') as f:
f.write(script_content)
print(f"Created loading script: {script_path}")

def export_to_excel(records: List[Dict], output_path: Path):
"""Export to Excel format"""
    try:
        import pandas as pd
        import openpyxl  # noqa: F401 -- required as the writer engine below
    except ImportError:
        print("Error: pandas and openpyxl are required for Excel export. Install with: pip install pandas openpyxl")
        sys.exit(1)
df = pd.DataFrame(records)
# Save as Excel
excel_path = output_path.with_suffix('.xlsx')
df.to_excel(excel_path, index=False, engine='openpyxl')
print(f"Exported {len(records)} records to Excel: {excel_path}")

def export_to_sqlite(records: List[Dict], output_path: Path):
"""Export to SQLite database"""
try:
import pandas as pd
import sqlite3
except ImportError:
print("Error: pandas is required for SQLite export. Install with: pip install pandas")
sys.exit(1)
df = pd.DataFrame(records)
# Create database
db_path = output_path.with_suffix('.db')
conn = sqlite3.connect(db_path)
# Write to database
table_name = 'extracted_data'
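    # if_exists='replace' drops any existing table of the same name, so
    # re-running the export overwrites rather than appends.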
df.to_sql(table_name, conn, if_exists='replace', index=False)
conn.close()
print(f"Exported {len(records)} records to SQLite database: {db_path}")
print(f"Table name: {table_name}")
# Create SQL script with example queries
sql_script_path = output_path.with_suffix('.sql')
sql_content = f'''-- Example SQL queries for {db_path.name}
-- Generated by extract_from_pdfs skill
-- View all records
SELECT * FROM {table_name} LIMIT 10;
-- Count total records
SELECT COUNT(*) as total_records FROM {table_name};
-- Example: Group by a column (adjust column name as needed)
-- SELECT column_name, COUNT(*) as count
-- FROM {table_name}
-- GROUP BY column_name
-- ORDER BY count DESC;
'''
with open(sql_script_path, 'w') as f:
f.write(sql_content)
print(f"Created SQL example script: {sql_script_path}")

def main():
args = parse_args()
# Load validated results
results = load_results(Path(args.input))
print(f"Loaded {len(results)} results")
# Extract records
records = extract_records(
results,
flatten=args.flatten,
include_metadata=args.include_metadata
)
print(f"Extracted {len(records)} records")
if not records:
print("No records to export. Check your data.")
return
# Export based on format
output_path = Path(args.output)
if args.format == 'csv':
export_to_csv(records, output_path)
elif args.format == 'json':
export_to_json(records, output_path)
elif args.format == 'python':
export_to_python(records, output_path)
elif args.format == 'r':
export_to_r(records, output_path)
elif args.format == 'excel':
export_to_excel(records, output_path)
elif args.format == 'sqlite':
export_to_sqlite(records, output_path)
print(f"\nExport complete!")
print(f"Your data is ready for analysis in {args.format.upper()} format.")

if __name__ == '__main__':
main()