#!/usr/bin/env python3
"""
Download complete Fabric workspace including items and lakehouse files.

Uses the same path syntax as fab CLI commands.

Usage:
    python3 download_workspace.py "<Name>.Workspace" [output_dir]
    python3 download_workspace.py "Sales.Workspace" ./backup
    python3 download_workspace.py "Production.Workspace" --no-lakehouse-files

Requirements:
    - fab CLI installed and authenticated
    - azure-storage-file-datalake (for lakehouse files)
    - azure-identity
"""

import subprocess
import sys
import argparse
from pathlib import Path
from collections import defaultdict

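# Optional dependency: downloading lakehouse files needs the ADLS Gen2 SDK.
# Without it, the script still exports item definitions and degrades gracefully.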
try:
    from azure.storage.filedatalake import DataLakeServiceClient
    from azure.identity import DefaultAzureCredential
    AZURE_AVAILABLE = True
except ImportError:
    AZURE_AVAILABLE = False


#region Helper Functions


def run_fab_command(args: list) -> str:
    """
    Execute fab CLI command and return output.

    Args:
        args: Command arguments as list

    Returns:
        Command stdout

    Raises:
        subprocess.CalledProcessError if command fails
    """
    result = subprocess.run(
        ["fab"] + args,
        capture_output=True,
        text=True,
        check=True
    )
    return result.stdout.strip()


def parse_workspace_path(path: str) -> str:
    """
    Parse and normalize workspace path.

    Args:
        path: Workspace path (with or without .Workspace)

    Returns:
        Normalized path with .Workspace extension
    """
    if not path.endswith(".Workspace"):
        return f"{path}.Workspace"
    return path


def get_workspace_items(workspace_path: str) -> list:
    """
    Get all items in workspace.

    Args:
        workspace_path: Full workspace path (e.g., "Sales.Workspace")

    Returns:
        List of items with metadata
    """
    output = run_fab_command(["ls", workspace_path, "-l"])

    lines = output.strip().split('\n')
    if len(lines) <= 2:
        return []

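    # Assumes `fab ls -l` prints two header rows followed by one row per item,
    # with the item GUID in the last whitespace-separated column; adjust the
    # slicing below if the CLI output format changes.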
    items = []
    for line in lines[2:]:
        parts = line.split()
        if len(parts) >= 2:
            item_id = parts[-1]
            display_name = ' '.join(parts[:-1])

            if '.' in display_name:
                name, item_type = display_name.rsplit('.', 1)
            else:
                name = display_name
                item_type = "Unknown"

            items.append({
                "displayName": name,
                "type": item_type,
                "id": item_id
            })

    return items


def export_item(workspace_path: str, item_name: str, item_type: str, output_path: Path) -> bool:
    """
    Export item using fab export.

    Args:
        workspace_path: Workspace path
        item_name: Item display name
        item_type: Item type
        output_path: Output directory

    Returns:
        True if successful
    """
    item_path = f"{workspace_path}/{item_name}.{item_type}"

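    # -f forces the export (no interactive prompt), so reruns overwrite cleanly;
    # the 5-minute timeout keeps a single hung export from stalling the whole run.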
    try:
        subprocess.run(
            ["fab", "export", item_path, "-o", str(output_path), "-f"],
            capture_output=True,
            text=True,
            check=True,
            timeout=300
        )
        return True
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
        print(f"  Failed to export {item_name}: {e}")
        return False


#endregion


#region Lakehouse Operations


def download_lakehouse_files(workspace_id: str, lakehouse_id: str, lakehouse_name: str, output_dir: Path):
    """
    Download all files from lakehouse using OneLake Storage API.

    Args:
        workspace_id: Workspace GUID
        lakehouse_id: Lakehouse GUID
        lakehouse_name: Lakehouse display name
        output_dir: Output directory for files
    """
    if not AZURE_AVAILABLE:
        print("  Skipping lakehouse files (azure-storage-file-datalake not installed)")
        return

    print(f"\n  Downloading lakehouse files from {lakehouse_name}...")

    try:
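        # OneLake speaks the ADLS Gen2 (DFS) protocol; DefaultAzureCredential
        # picks up whatever login is available (environment variables, managed
        # identity, Azure CLI session, etc.).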
        account_url = "https://onelake.dfs.fabric.microsoft.com"
        credential = DefaultAzureCredential()
        service_client = DataLakeServiceClient(account_url=account_url, credential=credential)

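        # OneLake exposes each workspace as a filesystem named by its GUID;
        # a lakehouse's files live under "<lakehouse_id>/Files".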
        fs_client = service_client.get_file_system_client(file_system=workspace_id)
        base_path = f"{lakehouse_id}/Files"

        try:
            paths = fs_client.get_paths(path=base_path, recursive=True)

            file_count = 0
            dir_count = 0

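            # path.name is the full path from the filesystem root
            # (e.g. "<lakehouse_id>/Files/sub/file.csv"), so strip the
            # base prefix to get the local relative path.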
            for path in paths:
                relative_path = path.name[len(base_path)+1:] if len(path.name) > len(base_path) else path.name

                if path.is_directory:
                    local_dir = output_dir / relative_path
                    local_dir.mkdir(parents=True, exist_ok=True)
                    dir_count += 1
                else:
                    local_file = output_dir / relative_path
                    local_file.parent.mkdir(parents=True, exist_ok=True)

                    file_client = fs_client.get_file_client(path.name)

                    with open(local_file, 'wb') as f:
                        download = file_client.download_file()
                        f.write(download.readall())

                    file_count += 1
                    print(f"    {relative_path}")

            print(f"  Downloaded {file_count} files, {dir_count} directories")

        except Exception as e:
            if "404" in str(e) or "PathNotFound" in str(e):
                print("  No files found in lakehouse")
            else:
                raise

    except Exception as e:
        print(f"  Error downloading lakehouse files: {e}")


def list_lakehouse_tables(workspace_path: str, lakehouse_name: str) -> list:
    """
    List tables in lakehouse.

    Args:
        workspace_path: Workspace path
        lakehouse_name: Lakehouse name

    Returns:
        List of table names
    """
    try:
        tables_path = f"{workspace_path}/{lakehouse_name}.Lakehouse/Tables"
        output = run_fab_command(["ls", tables_path])

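        # fab ls prints one entry per line; drop blank lines and any
        # '---' separator rows before treating entries as table names.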
        lines = output.strip().split('\n')
        tables = [line.strip() for line in lines if line.strip() and not line.startswith('---')]

        return tables
    except subprocess.CalledProcessError:
        return []


def export_table_schema(workspace_path: str, lakehouse_name: str, table_name: str, output_file: Path) -> bool:
    """
    Export table schema.

    Args:
        workspace_path: Workspace path
        lakehouse_name: Lakehouse name
        table_name: Table name
        output_file: Output JSON file

    Returns:
        True if successful
    """
    try:
        table_path = f"{workspace_path}/{lakehouse_name}.Lakehouse/Tables/{table_name}"
        schema_output = run_fab_command(["table", "schema", table_path])

        with open(output_file, 'w') as f:
            f.write(schema_output)

        return True
    except subprocess.CalledProcessError as e:
        print(f"  Failed to export schema for {table_name}: {e}")
        return False


#endregion


#region Main Download


def download_workspace(workspace_path: str, output_dir: Path, download_lakehouse_files_flag: bool = True):
    """
    Download complete workspace contents.

    Args:
        workspace_path: Workspace path (e.g., "Sales.Workspace")
        output_dir: Output directory
        download_lakehouse_files_flag: Whether to download lakehouse files
    """
    print(f"Downloading workspace: {workspace_path}")
    print(f"Output directory: {output_dir}")
    print()

    output_dir.mkdir(parents=True, exist_ok=True)

    # Get workspace ID
    print("Getting workspace ID...")
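    # fab's -q query is expected to return just the requested property
    # (here the workspace GUID) on stdout.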
    workspace_id = run_fab_command(["get", workspace_path, "-q", "id"])
    print(f"Workspace ID: {workspace_id}")
    print()

    # Get all items
    print("Discovering workspace items...")
    items = get_workspace_items(workspace_path)

    if not items:
        print("No items found")
        return

    # Group by type
    items_by_type = defaultdict(list)
    for item in items:
        items_by_type[item["type"]].append(item)

    print(f"Found {len(items)} items across {len(items_by_type)} types:")
    for item_type, type_items in sorted(items_by_type.items()):
        print(f"  {item_type}: {len(type_items)}")
    print()

    # Track statistics
    total_success = 0
    total_failed = 0
    lakehouses = []

    # Download items by type
    for item_type, type_items in sorted(items_by_type.items()):
        print(f"Downloading {item_type} items ({len(type_items)})...")

        type_dir = output_dir / item_type
        type_dir.mkdir(parents=True, exist_ok=True)

        for item in type_items:
            item_name = item["displayName"]
            item_id = item["id"]

            print(f"  {item_name}...")

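            # Remember lakehouses so their files and table schemas can be
            # fetched after all item definitions have been exported.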
            if item_type == "Lakehouse" and download_lakehouse_files_flag:
                lakehouses.append({
                    "name": item_name,
                    "id": item_id,
                    "output_dir": type_dir / f"{item_name}.Lakehouse"
                })

            success = export_item(workspace_path, item_name, item_type, type_dir)

            if success:
                total_success += 1
                print(f"  Done: {item_name}")
            else:
                total_failed += 1

        print()

    # Download lakehouse files
    if lakehouses and download_lakehouse_files_flag:
        print(f"Downloading lakehouse files ({len(lakehouses)} lakehouses)...")

        for lh in lakehouses:
            lh_files_dir = lh["output_dir"] / "Files"
            lh_files_dir.mkdir(parents=True, exist_ok=True)

            download_lakehouse_files(
                workspace_id=workspace_id,
                lakehouse_id=lh["id"],
                lakehouse_name=lh["name"],
                output_dir=lh_files_dir
            )

            # Export table schemas
            print(f"\n  Exporting table schemas from {lh['name']}...")
            tables = list_lakehouse_tables(workspace_path, lh["name"])

            if tables:
                tables_dir = lh["output_dir"] / "Tables"
                tables_dir.mkdir(parents=True, exist_ok=True)

                for table in tables:
                    schema_file = tables_dir / f"{table}_schema.json"
                    if export_table_schema(workspace_path, lh["name"], table, schema_file):
                        print(f"    {table}")

                print(f"  Exported {len(tables)} table schemas")
            else:
                print("  No tables found")

            print()

    # Summary
    print("=" * 60)
    print("Download Summary")
    print("=" * 60)
    print(f"Successfully downloaded: {total_success}")
    print(f"Failed: {total_failed}")
    print(f"Output directory: {output_dir.absolute()}")


#endregion


#region Main


def main():
    parser = argparse.ArgumentParser(
        description="Download complete Fabric workspace",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python3 download_workspace.py "Sales.Workspace"
  python3 download_workspace.py "Production.Workspace" ./backup
  python3 download_workspace.py "dev.Workspace" --no-lakehouse-files
"""
    )

    parser.add_argument("workspace", help="Workspace path: Name.Workspace or just Name")
    parser.add_argument("output_dir", nargs="?", default=None,
                        help="Output directory (default: ./workspace_downloads/<name>)")
    parser.add_argument("--no-lakehouse-files", action="store_true",
                        help="Skip downloading lakehouse files")

    args = parser.parse_args()

    workspace_path = parse_workspace_path(args.workspace)

    # Extract name for default output dir
    workspace_name = workspace_path.replace(".Workspace", "")

    if args.output_dir:
        output_dir = Path(args.output_dir)
    else:
        output_dir = Path("./workspace_downloads") / workspace_name

    try:
        download_workspace(
            workspace_path=workspace_path,
            output_dir=output_dir,
            download_lakehouse_files_flag=not args.no_lakehouse_files
        )
    except KeyboardInterrupt:
        print("\n\nDownload interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()


#endregion