#!/usr/bin/env python3
"""PocketBase data import helper with admin auth, batching, optional upsert, and dry-run."""

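# Example invocation (illustrative sketch: the script filename and sample values are
# assumptions; the flags themselves are defined in parse_args below):
#
#   python pocketbase_import.py http://127.0.0.1:8090 ./export \
#       --email admin@example.com --upsert "users=email" --upsert "*=slug" \
#       --batch-size 50 --dry-run
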
import argparse
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from getpass import getpass
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Tuple

import requests

REQUEST_TIMEOUT = 30
DEFAULT_BATCH_SIZE = 100
DROP_KEYS = {"id", "created", "updated", "@collectionId", "@collectionName", "@expand"}


def authenticate(base_url: str, email: Optional[str], password: Optional[str]) -> Dict[str, str]:
    if not email:
        return {}
    if not password:
        password = getpass(prompt="Admin password: ")
    response = requests.post(
        f"{base_url}/api/admins/auth-with-password",
        json={"identity": email, "password": password},
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    token = response.json().get("token")
    if not token:
        raise RuntimeError("Authentication response missing token")
    return {"Authorization": f"Bearer {token}"}


def list_collections(base_url: str, headers: Dict[str, str]) -> Dict[str, Dict]:
    collections: Dict[str, Dict] = {}
    page = 1
    while True:
        response = requests.get(
            f"{base_url}/api/collections",
            params={"page": page, "perPage": 200},
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        payload = response.json()
        items = payload.get("items", [])
        for item in items:
            if item.get("name"):
                collections[item["name"]] = item
        total = payload.get("totalItems", len(collections))
        if page * 200 >= total or not items:
            break
        page += 1
    return collections


def chunked(iterable: Iterable[Dict], size: int) -> Iterator[List[Dict]]:
    chunk: List[Dict] = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) >= size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk


def iter_ndjson(file_path: Path) -> Iterator[Dict]:
    with file_path.open("r", encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            yield json.loads(line)


def load_json_records(file_path: Path) -> Tuple[List[Dict], Optional[str]]:
    with file_path.open("r", encoding="utf-8") as handle:
        payload = json.load(handle)
    if isinstance(payload, dict):
        return payload.get("items", []), payload.get("collection")
    if isinstance(payload, list):
        return payload, None
    raise ValueError(f"Unsupported JSON structure in {file_path}")


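# Expected input shapes, as handled by iter_ndjson/load_json_records above (field names
# in the sample records are purely illustrative):
#
#   records.ndjson    one JSON object per line, e.g. {"email": "a@example.com"}
#   posts.json        either {"collection": "posts", "items": [{...}, ...]}
#                     or a bare list [{...}, ...] (collection then inferred from the
#                     first record's @collectionName or the filename)
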
def clean_record(record: Dict) -> Dict:
    return {k: v for k, v in record.items() if k not in DROP_KEYS}


def prepend_items(items: Iterable[Dict], iterator: Iterator[Dict]) -> Iterator[Dict]:
    for item in items:
        yield item
    for item in iterator:
        yield item


def build_filter(field: str, value) -> str:
    if value is None:
        return f"{field} = null"
    if isinstance(value, bool):
        return f"{field} = {str(value).lower()}"
    if isinstance(value, (int, float)):
        return f"{field} = {value}"
    escaped = str(value).replace("\"", r"\"")
    return f'{field} = "{escaped}"'


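# Illustrative outputs from build_filter (values here are made up):
#
#   build_filter("email", "a@example.com")  ->  'email = "a@example.com"'
#   build_filter("active", True)            ->  'active = true'
#   build_filter("age", 42)                 ->  'age = 42'
#   build_filter("deleted", None)           ->  'deleted = null'
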
def request_with_retry(session: requests.Session, method: str, url: str, *, retries: int = 3, backoff: float = 1.0, **kwargs) -> requests.Response:
    last_response: Optional[requests.Response] = None
    for attempt in range(retries):
        response = session.request(method, url, timeout=REQUEST_TIMEOUT, **kwargs)
        status = response.status_code
        if status in {429, 503} and attempt < retries - 1:
            time.sleep(backoff)
            backoff = min(backoff * 2, 8)
            last_response = response
            continue
        if status >= 400:
            response.raise_for_status()
        return response
    assert last_response is not None
    last_response.raise_for_status()


def find_existing(
    base_url: str,
    collection: str,
    field: str,
    value,
    headers: Dict[str, str],
) -> Optional[Dict]:
    session = requests.Session()
    try:
        response = request_with_retry(
            session,
            "get",
            f"{base_url}/api/collections/{collection}/records",
            headers=headers,
            params={
                "page": 1,
                "perPage": 1,
                "filter": build_filter(field, value),
                "skipTotal": 1,
            },
        )
        items = response.json().get("items", [])
        if items:
            return items[0]
        return None
    finally:
        session.close()


def process_record(
    base_url: str,
    collection: str,
    record: Dict,
    headers: Dict[str, str],
    upsert_field: Optional[str],
    dry_run: bool,
) -> Tuple[bool, Optional[str]]:
    data = clean_record(record)
    if dry_run:
        return True, None
    session = requests.Session()
    try:
        url = f"{base_url}/api/collections/{collection}/records"
        if upsert_field and upsert_field in record:
            existing = find_existing(base_url, collection, upsert_field, record.get(upsert_field), headers)
            if existing:
                record_id = existing.get("id")
                if record_id:
                    response = request_with_retry(
                        session,
                        "patch",
                        f"{url}/{record_id}",
                        headers=headers,
                        json=data,
                    )
                    return response.ok, None
        response = request_with_retry(
            session,
            "post",
            url,
            headers=headers,
            json=data,
        )
        return response.status_code in {200, 201}, None
    except requests.HTTPError as exc:
        return False, f"HTTP {exc.response.status_code}: {exc.response.text[:200]}"
    except Exception as exc:  # noqa: BLE001
        return False, str(exc)
    finally:
        session.close()


def parse_upsert(args: argparse.Namespace) -> Dict[str, str]:
    mapping: Dict[str, str] = {}
    for item in args.upsert or []:
        if "=" not in item:
            raise ValueError(f"Invalid upsert mapping '{item}'. Use collection=field or *=field")
        collection, field = item.split("=", 1)
        mapping[collection.strip()] = field.strip()
    return mapping


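# Illustrative mapping produced by parse_upsert (collection and field names are made up):
#
#   --upsert "users=email" --upsert "*=slug"  ->  {"users": "email", "*": "slug"}
#
# "*" acts as the default match field for collections without an explicit mapping.
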
def infer_collection(file_path: Path, first_record: Optional[Dict]) -> str:
    if first_record and first_record.get("@collectionName"):
        return first_record["@collectionName"]
    return file_path.stem


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Import PocketBase data dumps")
    parser.add_argument("base_url", help="PocketBase base URL, e.g. http://127.0.0.1:8090")
    parser.add_argument("input_path", help="Directory or file with export data")
    parser.add_argument("--email", help="Admin email for authentication")
    parser.add_argument("--password", help="Admin password (omit to prompt)")
    parser.add_argument("--collections", help="Comma-separated collections to include")
    parser.add_argument("--exclude", help="Comma-separated collections to skip")
    parser.add_argument("--upsert", action="append", help="collection=field mapping (use *=field for default)")
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE, help="Records per batch")
    parser.add_argument("--concurrency", type=int, default=4, help="Concurrent workers per batch")
    parser.add_argument("--throttle", type=float, default=0.0, help="Seconds to sleep between batches")
    parser.add_argument("--dry-run", action="store_true", help="Parse files without writing to PocketBase")
    parser.add_argument("--skip-missing", action="store_true", help="Skip files whose collections do not exist")
    return parser.parse_args()


def main():
    args = parse_args()
    base_url = args.base_url.rstrip("/")
    input_path = Path(args.input_path)
    if not input_path.exists():
        raise SystemExit(f"Input path {input_path} does not exist")

    headers = authenticate(base_url, args.email, args.password)
    collections = list_collections(base_url, headers)

    include = {c.strip() for c in args.collections.split(",")} if args.collections else None
    exclude = {c.strip() for c in args.exclude.split(",")} if args.exclude else set()
    upsert_map = parse_upsert(args)

    if input_path.is_file():
        files = [input_path]
    else:
        files = sorted(
            p for p in input_path.iterdir() if p.is_file() and p.suffix.lower() in {".json", ".ndjson"}
        )

    if not files:
        raise SystemExit("No data files found")

    for file_path in files:
        if file_path.stem == "manifest":
            continue

        if file_path.suffix.lower() == ".ndjson":
            iterator = iter_ndjson(file_path)
            peeked: List[Dict] = []
            try:
                first_record = next(iterator)
                peeked.append(first_record)
            except StopIteration:
                print(f"Skipping {file_path.name}: no records")
                continue
            source_iter = prepend_items(peeked, iterator)
            meta_collection = None
        else:
            records, meta_collection = load_json_records(file_path)
            if not records:
                print(f"Skipping {file_path.name}: no records")
                continue
            first_record = records[0]
            source_iter = iter(records)

        collection = meta_collection or infer_collection(file_path, first_record)
        if include and collection not in include:
            continue
        if collection in exclude:
            continue
        if collection not in collections:
            if args.skip_missing:
                print(f"Skipping {file_path.name}: collection '{collection}' not found")
                continue
            raise SystemExit(f"Collection '{collection}' not found in PocketBase")

        print(f"Importing {file_path.name} -> {collection}")
        total = success = 0
        failures: List[str] = []
        field = upsert_map.get(collection, upsert_map.get("*"))

        for batch in chunked(source_iter, max(args.batch_size, 1)):
            workers = max(args.concurrency, 1)
            if workers == 1:
                for record in batch:
                    ok, error = process_record(base_url, collection, record, headers, field, args.dry_run)
                    total += 1
                    success += int(ok)
                    if not ok and error:
                        failures.append(error)
            else:
                with ThreadPoolExecutor(max_workers=workers) as executor:
                    futures = {
                        executor.submit(
                            process_record,
                            base_url,
                            collection,
                            record,
                            headers,
                            field,
                            args.dry_run,
                        ): record
                        for record in batch
                    }
                    for future in as_completed(futures):
                        ok, error = future.result()
                        total += 1
                        success += int(ok)
                        if not ok and error:
                            failures.append(error)
            if args.throttle > 0:
                time.sleep(args.throttle)

        print(f"  {success}/{total} records processed")
        if failures:
            print(f"  {len(failures)} failures (showing up to 3):")
            for message in failures[:3]:
                print(f"    - {message}")


if __name__ == "__main__":
    main()