Initial commit
436
skills/fabric-cli/scripts/download_workspace.py
Executable file
@@ -0,0 +1,436 @@
#!/usr/bin/env python3
"""
Download a complete Fabric workspace, including items and lakehouse files.

Uses the same path syntax as fab CLI commands.

Usage:
    python3 download_workspace.py "Workspace.Workspace" [output_dir]
    python3 download_workspace.py "Sales.Workspace" ./backup
    python3 download_workspace.py "Production.Workspace" --no-lakehouse-files

Requirements:
    - fab CLI installed and authenticated
    - azure-storage-file-datalake (for lakehouse files)
    - azure-identity
"""

import subprocess
import sys
import argparse
from pathlib import Path
from collections import defaultdict

# Optional: only needed for downloading lakehouse files via OneLake.
try:
    from azure.storage.filedatalake import DataLakeServiceClient
    from azure.identity import DefaultAzureCredential
    AZURE_AVAILABLE = True
except ImportError:
    AZURE_AVAILABLE = False

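# Note (added guidance, not part of the fab CLI docs): installing the optional
# SDKs with `pip install azure-storage-file-datalake azure-identity` enables
# the lakehouse Files download; without them the script still exports item
# definitions through the fab CLI and simply skips the Files step.
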
#region Helper Functions


def run_fab_command(args: list) -> str:
    """
    Execute a fab CLI command and return its output.

    Args:
        args: Command arguments as a list

    Returns:
        Command stdout

    Raises:
        subprocess.CalledProcessError: if the command fails
    """
    result = subprocess.run(
        ["fab"] + args,
        capture_output=True,
        text=True,
        check=True
    )
    return result.stdout.strip()

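# Illustrative usage (hypothetical workspace name; assumes `fab` is on PATH
# and already authenticated):
#
#   workspace_id = run_fab_command(["get", "Sales.Workspace", "-q", "id"])
#
# Passing the arguments as a list keeps names with spaces safe without any
# shell quoting.
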
def parse_workspace_path(path: str) -> str:
    """
    Parse and normalize a workspace path.

    Args:
        path: Workspace path (with or without the .Workspace suffix)

    Returns:
        Normalized path with the .Workspace suffix
    """
    # Use endswith() rather than a substring check, so a name that merely
    # contains ".Workspace" still gets the suffix appended.
    if not path.endswith(".Workspace"):
        return f"{path}.Workspace"
    return path

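# Illustrative behavior (hypothetical names):
#   parse_workspace_path("Sales")           -> "Sales.Workspace"
#   parse_workspace_path("Sales.Workspace") -> "Sales.Workspace"
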
def get_workspace_items(workspace_path: str) -> list:
    """
    Get all items in a workspace.

    Args:
        workspace_path: Full workspace path (e.g., "Sales.Workspace")

    Returns:
        List of items with metadata (displayName, type, id)
    """
    output = run_fab_command(["ls", workspace_path, "-l"])

    # Expect a header line and a separator line before the data rows.
    lines = output.strip().split('\n')
    if len(lines) < 3:
        return []

    items = []
    for line in lines[2:]:
        parts = line.split()
        if len(parts) >= 2:
            # The id is the last whitespace-separated column; everything
            # before it is the display name (which may contain spaces).
            item_id = parts[-1]
            display_name = ' '.join(parts[:-1])

            # Item names are rendered as "<name>.<Type>".
            if '.' in display_name:
                name, item_type = display_name.rsplit('.', 1)
            else:
                name = display_name
                item_type = "Unknown"

            items.append({
                "displayName": name,
                "type": item_type,
                "id": item_id
            })

    return items

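# The parser above assumes `fab ls <workspace> -l` output shaped roughly like
# this (illustrative, not the CLI's documented format -- adjust the loop if
# the columns differ):
#
#   name                        id
#   ------------------------------------------------------------------
#   Sales Model.SemanticModel   00000000-0000-0000-0000-000000000000
#   Raw Data.Lakehouse          11111111-1111-1111-1111-111111111111
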
def export_item(workspace_path: str, item_name: str, item_type: str, output_path: Path) -> bool:
    """
    Export an item using fab export.

    Args:
        workspace_path: Workspace path
        item_name: Item display name
        item_type: Item type
        output_path: Output directory

    Returns:
        True if successful
    """
    item_path = f"{workspace_path}/{item_name}.{item_type}"

    try:
        subprocess.run(
            ["fab", "export", item_path, "-o", str(output_path), "-f"],
            capture_output=True,
            text=True,
            check=True,
            timeout=300  # some item types take a while to export
        )
        return True
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
        print(f"    Failed to export {item_name}: {e}")
        return False


#endregion


#region Lakehouse Operations


def download_lakehouse_files(workspace_id: str, lakehouse_id: str, lakehouse_name: str, output_dir: Path):
    """
    Download all files from a lakehouse using the OneLake Storage API.

    Args:
        workspace_id: Workspace GUID
        lakehouse_id: Lakehouse GUID
        lakehouse_name: Lakehouse display name
        output_dir: Output directory for files
    """
    if not AZURE_AVAILABLE:
        print("    Skipping lakehouse files (azure-storage-file-datalake not installed)")
        return

    print(f"\n    Downloading lakehouse files from {lakehouse_name}...")

    try:
        # OneLake speaks the ADLS Gen2 protocol; the workspace GUID acts as
        # the file system (container) name.
        account_url = "https://onelake.dfs.fabric.microsoft.com"
        credential = DefaultAzureCredential()
        service_client = DataLakeServiceClient(account_url=account_url, credential=credential)

        fs_client = service_client.get_file_system_client(file_system=workspace_id)
        base_path = f"{lakehouse_id}/Files"

        try:
            paths = fs_client.get_paths(path=base_path, recursive=True)

            file_count = 0
            dir_count = 0

            for path in paths:
                # Strip the "<lakehouse_id>/Files/" prefix to get a local-relative path.
                relative_path = path.name[len(base_path)+1:] if len(path.name) > len(base_path) else path.name

                if path.is_directory:
                    local_dir = output_dir / relative_path
                    local_dir.mkdir(parents=True, exist_ok=True)
                    dir_count += 1
                else:
                    local_file = output_dir / relative_path
                    local_file.parent.mkdir(parents=True, exist_ok=True)

                    file_client = fs_client.get_file_client(path.name)

                    with open(local_file, 'wb') as f:
                        download = file_client.download_file()
                        f.write(download.readall())

                    file_count += 1
                    print(f"      {relative_path}")

            print(f"    Downloaded {file_count} files, {dir_count} directories")

        except Exception as e:
            # An empty lakehouse surfaces as a missing "Files" path.
            if "404" in str(e) or "PathNotFound" in str(e):
                print("    No files found in lakehouse")
            else:
                raise

    except Exception as e:
        print(f"    Error downloading lakehouse files: {e}")

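# OneLake path layout assumed above (a sketch of OneLake's ADLS-style
# addressing): within the "<workspace-guid>" file system, each item keeps its
# data under "<item-guid>/Files" and "<item-guid>/Tables".
# DefaultAzureCredential resolves credentials from the environment, a managed
# identity, or a local `az login` session; whichever identity it finds needs
# read access to the workspace.
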
def list_lakehouse_tables(workspace_path: str, lakehouse_name: str) -> list:
    """
    List tables in a lakehouse.

    Args:
        workspace_path: Workspace path
        lakehouse_name: Lakehouse name

    Returns:
        List of table names
    """
    try:
        tables_path = f"{workspace_path}/{lakehouse_name}.Lakehouse/Tables"
        output = run_fab_command(["ls", tables_path])

        # Keep non-empty lines, dropping any separator rows.
        lines = output.strip().split('\n')
        tables = [line.strip() for line in lines if line.strip() and not line.startswith('---')]

        return tables
    except subprocess.CalledProcessError:
        # No Tables folder (or no access) -- treat as "no tables".
        return []

def export_table_schema(workspace_path: str, lakehouse_name: str, table_name: str, output_file: Path) -> bool:
    """
    Export a table schema.

    Args:
        workspace_path: Workspace path
        lakehouse_name: Lakehouse name
        table_name: Table name
        output_file: Output JSON file

    Returns:
        True if successful
    """
    try:
        table_path = f"{workspace_path}/{lakehouse_name}.Lakehouse/Tables/{table_name}"
        schema_output = run_fab_command(["table", "schema", table_path])

        with open(output_file, 'w') as f:
            f.write(schema_output)

        return True
    except subprocess.CalledProcessError as e:
        print(f"    Failed to export schema for {table_name}: {e}")
        return False


#endregion


#region Main Download


def download_workspace(workspace_path: str, output_dir: Path, download_lakehouse_files_flag: bool = True):
    """
    Download complete workspace contents.

    Args:
        workspace_path: Workspace path (e.g., "Sales.Workspace")
        output_dir: Output directory
        download_lakehouse_files_flag: Whether to download lakehouse files
    """
    print(f"Downloading workspace: {workspace_path}")
    print(f"Output directory: {output_dir}")
    print()

    output_dir.mkdir(parents=True, exist_ok=True)

    # Get workspace ID (needed later as the OneLake file system name)
    print("Getting workspace ID...")
    workspace_id = run_fab_command(["get", workspace_path, "-q", "id"])
    print(f"Workspace ID: {workspace_id}")
    print()

    # Get all items
    print("Discovering workspace items...")
    items = get_workspace_items(workspace_path)

    if not items:
        print("No items found")
        return

    # Group by type
    items_by_type = defaultdict(list)
    for item in items:
        items_by_type[item["type"]].append(item)

    print(f"Found {len(items)} items across {len(items_by_type)} types:")
    for item_type, type_items in sorted(items_by_type.items()):
        print(f"  {item_type}: {len(type_items)}")
    print()

    # Track statistics
    total_success = 0
    total_failed = 0
    lakehouses = []

    # Download items by type
    for item_type, type_items in sorted(items_by_type.items()):
        print(f"Downloading {item_type} items ({len(type_items)})...")

        type_dir = output_dir / item_type
        type_dir.mkdir(parents=True, exist_ok=True)

        for item in type_items:
            item_name = item["displayName"]
            item_id = item["id"]

            print(f"  {item_name}...")

            # Remember lakehouses so their Files and table schemas can be
            # fetched after the item exports finish.
            if item_type == "Lakehouse" and download_lakehouse_files_flag:
                lakehouses.append({
                    "name": item_name,
                    "id": item_id,
                    "output_dir": type_dir / f"{item_name}.Lakehouse"
                })

            success = export_item(workspace_path, item_name, item_type, type_dir)

            if success:
                total_success += 1
                print(f"  Done: {item_name}")
            else:
                total_failed += 1

        print()

    # Download lakehouse files
    if lakehouses and download_lakehouse_files_flag:
        print(f"Downloading lakehouse files ({len(lakehouses)} lakehouses)...")

        for lh in lakehouses:
            lh_files_dir = lh["output_dir"] / "Files"
            lh_files_dir.mkdir(parents=True, exist_ok=True)

            download_lakehouse_files(
                workspace_id=workspace_id,
                lakehouse_id=lh["id"],
                lakehouse_name=lh["name"],
                output_dir=lh_files_dir
            )

            # Export table schemas
            print(f"\n  Exporting table schemas from {lh['name']}...")
            tables = list_lakehouse_tables(workspace_path, lh["name"])

            if tables:
                tables_dir = lh["output_dir"] / "Tables"
                tables_dir.mkdir(parents=True, exist_ok=True)

                for table in tables:
                    schema_file = tables_dir / f"{table}_schema.json"
                    if export_table_schema(workspace_path, lh["name"], table, schema_file):
                        print(f"    {table}")

                print(f"  Exported {len(tables)} table schemas")
            else:
                print("  No tables found")

            print()

    # Summary
    print("=" * 60)
    print("Download Summary")
    print("=" * 60)
    print(f"Successfully downloaded: {total_success}")
    print(f"Failed: {total_failed}")
    print(f"Output directory: {output_dir.absolute()}")


#endregion


#region Main


def main():
    parser = argparse.ArgumentParser(
        description="Download complete Fabric workspace",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python3 download_workspace.py "Sales.Workspace"
  python3 download_workspace.py "Production.Workspace" ./backup
  python3 download_workspace.py "dev.Workspace" --no-lakehouse-files
"""
    )

    parser.add_argument("workspace", help="Workspace path: Name.Workspace or just Name")
    parser.add_argument("output_dir", nargs="?", default=None,
                        help="Output directory (default: ./workspace_downloads/<name>)")
    parser.add_argument("--no-lakehouse-files", action="store_true",
                        help="Skip downloading lakehouse files")

    args = parser.parse_args()

    workspace_path = parse_workspace_path(args.workspace)

    # Extract the bare name for the default output dir
    workspace_name = workspace_path.replace(".Workspace", "")

    if args.output_dir:
        output_dir = Path(args.output_dir)
    else:
        output_dir = Path("./workspace_downloads") / workspace_name

    try:
        download_workspace(
            workspace_path=workspace_path,
            output_dir=output_dir,
            download_lakehouse_files_flag=not args.no_lakehouse_files
        )
    except KeyboardInterrupt:
        print("\n\nDownload interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()

#endregion