Initial commit

Zhongwei Li
2025-11-29 18:18:03 +08:00
commit 31ff8e1c29
18 changed files with 5925 additions and 0 deletions

download_workspace.py

@@ -0,0 +1,436 @@
#!/usr/bin/env python3
"""
Download complete Fabric workspace including items and lakehouse files.
Uses the same path syntax as fab CLI commands.
Usage:
python3 download_workspace.py "Workspace.Workspace" [output_dir]
python3 download_workspace.py "Sales.Workspace" ./backup
python3 download_workspace.py "Production.Workspace" --no-lakehouse-files
Requirements:
- fab CLI installed and authenticated
- azure-storage-file-datalake (for lakehouse files)
- azure-identity
"""
import subprocess
import json
import sys
import argparse
from pathlib import Path
from collections import defaultdict
try:
    from azure.storage.filedatalake import DataLakeServiceClient
    from azure.identity import DefaultAzureCredential
    AZURE_AVAILABLE = True
except ImportError:
    AZURE_AVAILABLE = False
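# The Azure imports are optional: without them the script still exports item
# definitions, and download_lakehouse_files() below degrades to a no-op.
# To enable lakehouse file download (package names as listed in the module
# docstring):
#   pip install azure-storage-file-datalake azure-identity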
#region Helper Functions
def run_fab_command(args: list) -> str:
    """
    Execute fab CLI command and return output.

    Args:
        args: Command arguments as list

    Returns:
        Command stdout

    Raises:
        subprocess.CalledProcessError if command fails
    """
    result = subprocess.run(
        ["fab"] + args,
        capture_output=True,
        text=True,
        check=True
    )
    return result.stdout.strip()
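# Example (hypothetical workspace name): list a workspace's items, mirroring
# the call made by get_workspace_items() below.
#   listing = run_fab_command(["ls", "Sales.Workspace", "-l"])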
def parse_workspace_path(path: str) -> str:
    """
    Parse and normalize workspace path.

    Args:
        path: Workspace path (with or without .Workspace)

    Returns:
        Normalized path with .Workspace extension
    """
    if not path.endswith(".Workspace"):
        return f"{path}.Workspace"
    return path
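# Example: parse_workspace_path("Sales") returns "Sales.Workspace";
# a path that already ends in ".Workspace" is returned unchanged.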
def get_workspace_items(workspace_path: str) -> list:
    """
    Get all items in workspace.

    Args:
        workspace_path: Full workspace path (e.g., "Sales.Workspace")

    Returns:
        List of items with metadata
    """
    output = run_fab_command(["ls", workspace_path, "-l"])
    lines = output.strip().split('\n')
    if len(lines) < 2:
        return []
    items = []
    # Skip the two header lines (column names and separator) of `fab ls -l`.
    for line in lines[2:]:
        parts = line.split()
        if len(parts) >= 2:
            # The last whitespace-separated token is the item ID; everything
            # before it is the display name, which may itself contain spaces.
            item_id = parts[-1]
            display_name = ' '.join(parts[:-1])
            if '.' in display_name:
                name, item_type = display_name.rsplit('.', 1)
            else:
                name = display_name
                item_type = "Unknown"
            items.append({
                "displayName": name,
                "type": item_type,
                "id": item_id
            })
    return items
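# The parser above assumes `fab ls -l` rows of the form (illustrative only):
#   <displayName>.<Type>    <id>
# e.g. "Daily Report.Notebook  4f2a...". If the CLI's column layout differs,
# adjust the header skip and token split accordingly.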
def export_item(workspace_path: str, item_name: str, item_type: str, output_path: Path) -> bool:
    """
    Export item using fab export.

    Args:
        workspace_path: Workspace path
        item_name: Item display name
        item_type: Item type
        output_path: Output directory

    Returns:
        True if successful
    """
    item_path = f"{workspace_path}/{item_name}.{item_type}"
    try:
        subprocess.run(
            ["fab", "export", item_path, "-o", str(output_path), "-f"],
            capture_output=True,
            text=True,
            check=True,
            timeout=300
        )
        return True
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
        print(f" Failed to export {item_name}: {e}")
        return False
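# Example (hypothetical item): export one notebook definition into a local dir.
#   ok = export_item("Sales.Workspace", "Daily Report", "Notebook",
#                    Path("./backup/Notebook"))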
#endregion
#region Lakehouse Operations
def download_lakehouse_files(workspace_id: str, lakehouse_id: str, lakehouse_name: str, output_dir: Path):
    """
    Download all files from lakehouse using OneLake Storage API.

    Args:
        workspace_id: Workspace GUID
        lakehouse_id: Lakehouse GUID
        lakehouse_name: Lakehouse display name
        output_dir: Output directory for files
    """
    if not AZURE_AVAILABLE:
        print(" Skipping lakehouse files (azure-storage-file-datalake not installed)")
        return
    print(f"\n Downloading lakehouse files from {lakehouse_name}...")
    try:
        # OneLake speaks the ADLS Gen2 protocol: the workspace GUID is the
        # filesystem and the lakehouse GUID is the top-level directory.
        account_url = "https://onelake.dfs.fabric.microsoft.com"
        credential = DefaultAzureCredential()
        service_client = DataLakeServiceClient(account_url=account_url, credential=credential)
        fs_client = service_client.get_file_system_client(file_system=workspace_id)
        base_path = f"{lakehouse_id}/Files"
        try:
            paths = fs_client.get_paths(path=base_path, recursive=True)
            file_count = 0
            dir_count = 0
            for path in paths:
                # Strip the "<lakehouse_id>/Files/" prefix to get a local-relative path.
                relative_path = path.name[len(base_path)+1:] if len(path.name) > len(base_path) else path.name
                if path.is_directory:
                    local_dir = output_dir / relative_path
                    local_dir.mkdir(parents=True, exist_ok=True)
                    dir_count += 1
                else:
                    local_file = output_dir / relative_path
                    local_file.parent.mkdir(parents=True, exist_ok=True)
                    file_client = fs_client.get_file_client(path.name)
                    with open(local_file, 'wb') as f:
                        download = file_client.download_file()
                        f.write(download.readall())
                    file_count += 1
                    print(f" {relative_path}")
            print(f" Downloaded {file_count} files, {dir_count} directories")
        except Exception as e:
            if "404" in str(e) or "PathNotFound" in str(e):
                print(" No files found in lakehouse")
            else:
                raise
    except Exception as e:
        print(f" Error downloading lakehouse files: {e}")
def list_lakehouse_tables(workspace_path: str, lakehouse_name: str) -> list:
    """
    List tables in lakehouse.

    Args:
        workspace_path: Workspace path
        lakehouse_name: Lakehouse name

    Returns:
        List of table names
    """
    try:
        tables_path = f"{workspace_path}/{lakehouse_name}.Lakehouse/Tables"
        output = run_fab_command(["ls", tables_path])
        lines = output.strip().split('\n')
        # Keep non-empty lines, dropping any "---" separator rows.
        tables = [line.strip() for line in lines if line.strip() and not line.startswith('---')]
        return tables
    except subprocess.CalledProcessError:
        return []
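# Example (hypothetical lakehouse):
#   tables = list_lakehouse_tables("Sales.Workspace", "SalesData")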
def export_table_schema(workspace_path: str, lakehouse_name: str, table_name: str, output_file: Path) -> bool:
    """
    Export table schema.

    Args:
        workspace_path: Workspace path
        lakehouse_name: Lakehouse name
        table_name: Table name
        output_file: Output JSON file

    Returns:
        True if successful
    """
    try:
        table_path = f"{workspace_path}/{lakehouse_name}.Lakehouse/Tables/{table_name}"
        schema_output = run_fab_command(["table", "schema", table_path])
        with open(output_file, 'w') as f:
            f.write(schema_output)
        return True
    except subprocess.CalledProcessError as e:
        print(f" Failed to export schema for {table_name}: {e}")
        return False
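# Example (hypothetical table): write one table's schema as JSON next to the
# lakehouse export.
#   export_table_schema("Sales.Workspace", "SalesData", "orders",
#                       Path("./backup/Lakehouse/SalesData.Lakehouse/Tables/orders_schema.json"))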
#endregion
#region Main Download
def download_workspace(workspace_path: str, output_dir: Path, download_lakehouse_files_flag: bool = True):
    """
    Download complete workspace contents.

    Args:
        workspace_path: Workspace path (e.g., "Sales.Workspace")
        output_dir: Output directory
        download_lakehouse_files_flag: Whether to download lakehouse files
    """
    print(f"Downloading workspace: {workspace_path}")
    print(f"Output directory: {output_dir}")
    print()
    output_dir.mkdir(parents=True, exist_ok=True)
    # Get workspace ID
    print("Getting workspace ID...")
    workspace_id = run_fab_command(["get", workspace_path, "-q", "id"])
    print(f"Workspace ID: {workspace_id}")
    print()
    # Get all items
    print("Discovering workspace items...")
    items = get_workspace_items(workspace_path)
    if not items:
        print("No items found")
        return
    # Group by type
    items_by_type = defaultdict(list)
    for item in items:
        items_by_type[item["type"]].append(item)
    print(f"Found {len(items)} items across {len(items_by_type)} types:")
    for item_type, type_items in sorted(items_by_type.items()):
        print(f" {item_type}: {len(type_items)}")
    print()
    # Track statistics
    total_success = 0
    total_failed = 0
    lakehouses = []
    # Download items by type
    for item_type, type_items in sorted(items_by_type.items()):
        print(f"Downloading {item_type} items ({len(type_items)})...")
        type_dir = output_dir / item_type
        type_dir.mkdir(parents=True, exist_ok=True)
        for item in type_items:
            item_name = item["displayName"]
            item_id = item["id"]
            print(f" {item_name}...")
            # Remember lakehouses so their files and table schemas can be
            # fetched after the item definitions are exported.
            if item_type == "Lakehouse" and download_lakehouse_files_flag:
                lakehouses.append({
                    "name": item_name,
                    "id": item_id,
                    "output_dir": type_dir / f"{item_name}.Lakehouse"
                })
            success = export_item(workspace_path, item_name, item_type, type_dir)
            if success:
                total_success += 1
                print(f" Done: {item_name}")
            else:
                total_failed += 1
        print()
    # Download lakehouse files
    if lakehouses and download_lakehouse_files_flag:
        print(f"Downloading lakehouse files ({len(lakehouses)} lakehouses)...")
        for lh in lakehouses:
            lh_files_dir = lh["output_dir"] / "Files"
            lh_files_dir.mkdir(parents=True, exist_ok=True)
            download_lakehouse_files(
                workspace_id=workspace_id,
                lakehouse_id=lh["id"],
                lakehouse_name=lh["name"],
                output_dir=lh_files_dir
            )
            # Export table schemas
            print(f"\n Exporting table schemas from {lh['name']}...")
            tables = list_lakehouse_tables(workspace_path, lh["name"])
            if tables:
                tables_dir = lh["output_dir"] / "Tables"
                tables_dir.mkdir(parents=True, exist_ok=True)
                for table in tables:
                    schema_file = tables_dir / f"{table}_schema.json"
                    if export_table_schema(workspace_path, lh["name"], table, schema_file):
                        print(f" {table}")
                print(f" Exported {len(tables)} table schemas")
            else:
                print(" No tables found")
            print()
    # Summary
    print("=" * 60)
    print("Download Summary")
    print("=" * 60)
    print(f"Successfully downloaded: {total_success}")
    print(f"Failed: {total_failed}")
    print(f"Output directory: {output_dir.absolute()}")
#endregion
#region Main
def main():
    parser = argparse.ArgumentParser(
        description="Download complete Fabric workspace",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python3 download_workspace.py "Sales.Workspace"
  python3 download_workspace.py "Production.Workspace" ./backup
  python3 download_workspace.py "dev.Workspace" --no-lakehouse-files
"""
    )
    parser.add_argument("workspace", help="Workspace path: Name.Workspace or just Name")
    parser.add_argument("output_dir", nargs="?", default=None,
                        help="Output directory (default: ./workspace_downloads/<name>)")
    parser.add_argument("--no-lakehouse-files", action="store_true",
                        help="Skip downloading lakehouse files")
    args = parser.parse_args()
    workspace_path = parse_workspace_path(args.workspace)
    # Extract name for default output dir
    workspace_name = workspace_path.replace(".Workspace", "")
    if args.output_dir:
        output_dir = Path(args.output_dir)
    else:
        output_dir = Path("./workspace_downloads") / workspace_name
    try:
        download_workspace(
            workspace_path=workspace_path,
            output_dir=output_dir,
            download_lakehouse_files_flag=not args.no_lakehouse_files
        )
    except KeyboardInterrupt:
        print("\n\nDownload interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
#endregion