Initial commit
339
skills/pocketbase/scripts/import_data.py
Normal file
@@ -0,0 +1,339 @@
#!/usr/bin/env python3
"""PocketBase data import helper with admin auth, batching, optional upsert, and dry-run."""

import argparse
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from getpass import getpass
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Tuple

import requests


REQUEST_TIMEOUT = 30
DEFAULT_BATCH_SIZE = 100
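# Export metadata and server-managed keys stripped from each record before it is re-imported.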
DROP_KEYS = {"id", "created", "updated", "@collectionId", "@collectionName", "@expand"}


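# Exchange admin credentials for a bearer token used on every subsequent request;
# returns an empty header dict when no email is supplied (unauthenticated import).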
def authenticate(base_url: str, email: Optional[str], password: Optional[str]) -> Dict[str, str]:
    if not email:
        return {}
    if not password:
        password = getpass(prompt="Admin password: ")
    response = requests.post(
        f"{base_url}/api/admins/auth-with-password",
        json={"identity": email, "password": password},
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    token = response.json().get("token")
    if not token:
        raise RuntimeError("Authentication response missing token")
    return {"Authorization": f"Bearer {token}"}


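# Page through /api/collections (200 per page) and index the definitions by collection name.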
def list_collections(base_url: str, headers: Dict[str, str]) -> Dict[str, Dict]:
    collections: Dict[str, Dict] = {}
    page = 1
    while True:
        response = requests.get(
            f"{base_url}/api/collections",
            params={"page": page, "perPage": 200},
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        payload = response.json()
        items = payload.get("items", [])
        for item in items:
            if item.get("name"):
                collections[item["name"]] = item
        total = payload.get("totalItems", len(collections))
        if page * 200 >= total or not items:
            break
        page += 1
    return collections


def chunked(iterable: Iterable[Dict], size: int) -> Iterator[List[Dict]]:
    chunk: List[Dict] = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) >= size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk


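# Stream records from an .ndjson export, one JSON object per non-blank line.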
def iter_ndjson(file_path: Path) -> Iterator[Dict]:
    with file_path.open("r", encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            yield json.loads(line)


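# Load a .json export: either a dump object with "collection" and "items" keys or a bare
# list of records. Returns (records, collection_name_or_None).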
def load_json_records(file_path: Path) -> Tuple[List[Dict], Optional[str]]:
    with file_path.open("r", encoding="utf-8") as handle:
        payload = json.load(handle)
    if isinstance(payload, dict):
        return payload.get("items", []), payload.get("collection")
    if isinstance(payload, list):
        return payload, None
    raise ValueError(f"Unsupported JSON structure in {file_path}")


def clean_record(record: Dict) -> Dict:
    return {k: v for k, v in record.items() if k not in DROP_KEYS}


def prepend_items(items: Iterable[Dict], iterator: Iterator[Dict]) -> Iterator[Dict]:
    for item in items:
        yield item
    for item in iterator:
        yield item


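# Build the PocketBase filter expression used for upsert lookups; strings are double-quoted
# with embedded quotes escaped, while numbers, booleans, and null are emitted bare.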
def build_filter(field: str, value) -> str:
    if value is None:
        return f"{field} = null"
    if isinstance(value, bool):
        return f"{field} = {str(value).lower()}"
    if isinstance(value, (int, float)):
        return f"{field} = {value}"
    escaped = str(value).replace("\"", r"\"")
    return f'{field} = "{escaped}"'


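# Issue a request, retrying 429/503 responses with exponential backoff (doubling, capped
# at 8 seconds); any other error status raises immediately via raise_for_status().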
def request_with_retry(session: requests.Session, method: str, url: str, *, retries: int = 3, backoff: float = 1.0, **kwargs) -> requests.Response:
    last_response: Optional[requests.Response] = None
    for attempt in range(retries):
        response = session.request(method, url, timeout=REQUEST_TIMEOUT, **kwargs)
        status = response.status_code
        if status in {429, 503} and attempt < retries - 1:
            time.sleep(backoff)
            backoff = min(backoff * 2, 8)
            last_response = response
            continue
        if status >= 400:
            response.raise_for_status()
        return response
    assert last_response is not None
    last_response.raise_for_status()


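# Fetch at most one existing record matching field == value, to decide between
# creating a new record and updating the match in place.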
def find_existing(
    base_url: str,
    collection: str,
    field: str,
    value,
    headers: Dict[str, str],
) -> Optional[Dict]:
    session = requests.Session()
    try:
        response = request_with_retry(
            session,
            "get",
            f"{base_url}/api/collections/{collection}/records",
            headers=headers,
            params={
                "page": 1,
                "perPage": 1,
                "filter": build_filter(field, value),
                "skipTotal": 1,
            },
        )
        items = response.json().get("items", [])
        if items:
            return items[0]
        return None
    finally:
        session.close()


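# Import a single record: with an upsert field present, PATCH the existing match if one is
# found, otherwise POST a new record. Returns (success, error_message_or_None).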
def process_record(
    base_url: str,
    collection: str,
    record: Dict,
    headers: Dict[str, str],
    upsert_field: Optional[str],
    dry_run: bool,
) -> Tuple[bool, Optional[str]]:
    data = clean_record(record)
    if dry_run:
        return True, None
    session = requests.Session()
    try:
        url = f"{base_url}/api/collections/{collection}/records"
        if upsert_field and upsert_field in record:
            existing = find_existing(base_url, collection, upsert_field, record.get(upsert_field), headers)
            if existing:
                record_id = existing.get("id")
                if record_id:
                    response = request_with_retry(
                        session,
                        "patch",
                        f"{url}/{record_id}",
                        headers=headers,
                        json=data,
                    )
                    return response.ok, None
        response = request_with_retry(
            session,
            "post",
            url,
            headers=headers,
            json=data,
        )
        return response.status_code in {200, 201}, None
    except requests.HTTPError as exc:
        return False, f"HTTP {exc.response.status_code}: {exc.response.text[:200]}"
    except Exception as exc:  # noqa: BLE001
        return False, str(exc)
    finally:
        session.close()


def parse_upsert(args: argparse.Namespace) -> Dict[str, str]:
    mapping: Dict[str, str] = {}
    for item in args.upsert or []:
        if "=" not in item:
            raise ValueError(f"Invalid upsert mapping '{item}'. Use collection=field or *=field")
        collection, field = item.split("=", 1)
        mapping[collection.strip()] = field.strip()
    return mapping


def infer_collection(file_path: Path, first_record: Optional[Dict]) -> str:
    if first_record and first_record.get("@collectionName"):
        return first_record["@collectionName"]
    return file_path.stem


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Import PocketBase data dumps")
    parser.add_argument("base_url", help="PocketBase base URL, e.g. http://127.0.0.1:8090")
    parser.add_argument("input_path", help="Directory or file with export data")
    parser.add_argument("--email", help="Admin email for authentication")
    parser.add_argument("--password", help="Admin password (omit to prompt)")
    parser.add_argument("--collections", help="Comma-separated collections to include")
    parser.add_argument("--exclude", help="Comma-separated collections to skip")
    parser.add_argument("--upsert", action="append", help="collection=field mapping (use *=field for default)")
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE, help="Records per batch")
    parser.add_argument("--concurrency", type=int, default=4, help="Concurrent workers per batch")
    parser.add_argument("--throttle", type=float, default=0.0, help="Seconds to sleep between batches")
    parser.add_argument("--dry-run", action="store_true", help="Parse files without writing to PocketBase")
    parser.add_argument("--skip-missing", action="store_true", help="Skip files whose collections do not exist")
    return parser.parse_args()


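# Drive the import: authenticate, discover collections, then walk the input files and send
# each file's records in batches with optional concurrency and throttling.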
def main():
    args = parse_args()
    base_url = args.base_url.rstrip("/")
    input_path = Path(args.input_path)
    if not input_path.exists():
        raise SystemExit(f"Input path {input_path} does not exist")

    headers = authenticate(base_url, args.email, args.password)
    collections = list_collections(base_url, headers)

    include = {c.strip() for c in args.collections.split(",")} if args.collections else None
    exclude = {c.strip() for c in args.exclude.split(",")} if args.exclude else set()
    upsert_map = parse_upsert(args)

    if input_path.is_file():
        files = [input_path]
    else:
        files = sorted(
            p for p in input_path.iterdir() if p.is_file() and p.suffix.lower() in {".json", ".ndjson"}
        )

    if not files:
        raise SystemExit("No data files found")

    for file_path in files:
        if file_path.stem == "manifest":
            continue

        if file_path.suffix.lower() == ".ndjson":
            iterator = iter_ndjson(file_path)
            peeked: List[Dict] = []
            try:
                first_record = next(iterator)
                peeked.append(first_record)
            except StopIteration:
                print(f"Skipping {file_path.name}: no records")
                continue
            source_iter = prepend_items(peeked, iterator)
            meta_collection = None
        else:
            records, meta_collection = load_json_records(file_path)
            if not records:
                print(f"Skipping {file_path.name}: no records")
                continue
            first_record = records[0]
            source_iter = iter(records)

        collection = meta_collection or infer_collection(file_path, first_record)
        if include and collection not in include:
            continue
        if collection in exclude:
            continue
        if collection not in collections:
            if args.skip_missing:
                print(f"Skipping {file_path.name}: collection '{collection}' not found")
                continue
            raise SystemExit(f"Collection '{collection}' not found in PocketBase")

        print(f"Importing {file_path.name} -> {collection}")
        total = success = 0
        failures: List[str] = []
        field = upsert_map.get(collection, upsert_map.get("*"))

        for batch in chunked(source_iter, max(args.batch_size, 1)):
            workers = max(args.concurrency, 1)
            if workers == 1:
                for record in batch:
                    ok, error = process_record(base_url, collection, record, headers, field, args.dry_run)
                    total += 1
                    success += int(ok)
                    if not ok and error:
                        failures.append(error)
            else:
                with ThreadPoolExecutor(max_workers=workers) as executor:
                    futures = {
                        executor.submit(
                            process_record,
                            base_url,
                            collection,
                            record,
                            headers,
                            field,
                            args.dry_run,
                        ): record
                        for record in batch
                    }
                    for future in as_completed(futures):
                        ok, error = future.result()
                        total += 1
                        success += int(ok)
                        if not ok and error:
                            failures.append(error)
            if args.throttle > 0:
                time.sleep(args.throttle)

        print(f" {success}/{total} records processed")
        if failures:
            print(f" {len(failures)} failures (showing up to 3):")
            for message in failures[:3]:
                print(f" - {message}")


if __name__ == "__main__":
    main()