Initial commit

Zhongwei Li
2025-11-29 18:18:03 +08:00
commit 31ff8e1c29
18 changed files with 5925 additions and 0 deletions

download_workspace.py

@@ -0,0 +1,436 @@
#!/usr/bin/env python3
"""
Download complete Fabric workspace including items and lakehouse files.
Uses the same path syntax as fab CLI commands.
Usage:
python3 download_workspace.py "Workspace.Workspace" [output_dir]
python3 download_workspace.py "Sales.Workspace" ./backup
python3 download_workspace.py "Production.Workspace" --no-lakehouse-files
Requirements:
- fab CLI installed and authenticated
- azure-storage-file-datalake (for lakehouse files)
- azure-identity
"""
import subprocess
import json
import sys
import argparse
from pathlib import Path
from collections import defaultdict
try:
    from azure.storage.filedatalake import DataLakeServiceClient
    from azure.identity import DefaultAzureCredential
    AZURE_AVAILABLE = True
except ImportError:
    AZURE_AVAILABLE = False
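# The Azure imports are optional: without them the script still exports item
# definitions, and download_lakehouse_files() below degrades to a no-op.
# To enable lakehouse file download (package names as listed in the module
# docstring):
#   pip install azure-storage-file-datalake azure-identity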
#region Helper Functions
def run_fab_command(args: list) -> str:
    """
    Execute fab CLI command and return output.

    Args:
        args: Command arguments as list

    Returns:
        Command stdout

    Raises:
        subprocess.CalledProcessError if command fails
    """
    result = subprocess.run(
        ["fab"] + args,
        capture_output=True,
        text=True,
        check=True
    )
    return result.stdout.strip()
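# Example (hypothetical workspace name): list a workspace's items, mirroring
# the call made by get_workspace_items() below.
#   listing = run_fab_command(["ls", "Sales.Workspace", "-l"])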
def parse_workspace_path(path: str) -> str:
    """
    Parse and normalize workspace path.

    Args:
        path: Workspace path (with or without .Workspace)

    Returns:
        Normalized path with .Workspace extension
    """
    if not path.endswith(".Workspace"):
        return f"{path}.Workspace"
    return path
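# Example: parse_workspace_path("Sales") returns "Sales.Workspace";
# a path that already ends in ".Workspace" is returned unchanged.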
def get_workspace_items(workspace_path: str) -> list:
    """
    Get all items in workspace.

    Args:
        workspace_path: Full workspace path (e.g., "Sales.Workspace")

    Returns:
        List of items with metadata
    """
    output = run_fab_command(["ls", workspace_path, "-l"])
    lines = output.strip().split('\n')
    if len(lines) < 2:
        return []
    items = []
    # Skip the two header lines (column names and separator) of `fab ls -l`.
    for line in lines[2:]:
        parts = line.split()
        if len(parts) >= 2:
            # The last whitespace-separated token is the item ID; everything
            # before it is the display name, which may itself contain spaces.
            item_id = parts[-1]
            display_name = ' '.join(parts[:-1])
            if '.' in display_name:
                name, item_type = display_name.rsplit('.', 1)
            else:
                name = display_name
                item_type = "Unknown"
            items.append({
                "displayName": name,
                "type": item_type,
                "id": item_id
            })
    return items
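# The parser above assumes `fab ls -l` rows of the form (illustrative only):
#   <displayName>.<Type>    <id>
# e.g. "Daily Report.Notebook  4f2a...". If the CLI's column layout differs,
# adjust the header skip and token split accordingly.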
def export_item(workspace_path: str, item_name: str, item_type: str, output_path: Path) -> bool:
    """
    Export item using fab export.

    Args:
        workspace_path: Workspace path
        item_name: Item display name
        item_type: Item type
        output_path: Output directory

    Returns:
        True if successful
    """
    item_path = f"{workspace_path}/{item_name}.{item_type}"
    try:
        subprocess.run(
            ["fab", "export", item_path, "-o", str(output_path), "-f"],
            capture_output=True,
            text=True,
            check=True,
            timeout=300
        )
        return True
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
        print(f" Failed to export {item_name}: {e}")
        return False
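# Example (hypothetical item): export one notebook definition into a local dir.
#   ok = export_item("Sales.Workspace", "Daily Report", "Notebook",
#                    Path("./backup/Notebook"))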
#endregion
#region Lakehouse Operations
def download_lakehouse_files(workspace_id: str, lakehouse_id: str, lakehouse_name: str, output_dir: Path):
    """
    Download all files from lakehouse using OneLake Storage API.

    Args:
        workspace_id: Workspace GUID
        lakehouse_id: Lakehouse GUID
        lakehouse_name: Lakehouse display name
        output_dir: Output directory for files
    """
    if not AZURE_AVAILABLE:
        print(" Skipping lakehouse files (azure-storage-file-datalake not installed)")
        return
    print(f"\n Downloading lakehouse files from {lakehouse_name}...")
    try:
        # OneLake speaks the ADLS Gen2 protocol: the workspace GUID is the
        # filesystem and the lakehouse GUID is the top-level directory.
        account_url = "https://onelake.dfs.fabric.microsoft.com"
        credential = DefaultAzureCredential()
        service_client = DataLakeServiceClient(account_url=account_url, credential=credential)
        fs_client = service_client.get_file_system_client(file_system=workspace_id)
        base_path = f"{lakehouse_id}/Files"
        try:
            paths = fs_client.get_paths(path=base_path, recursive=True)
            file_count = 0
            dir_count = 0
            for path in paths:
                # Strip the "<lakehouse_id>/Files/" prefix to get a local-relative path.
                relative_path = path.name[len(base_path)+1:] if len(path.name) > len(base_path) else path.name
                if path.is_directory:
                    local_dir = output_dir / relative_path
                    local_dir.mkdir(parents=True, exist_ok=True)
                    dir_count += 1
                else:
                    local_file = output_dir / relative_path
                    local_file.parent.mkdir(parents=True, exist_ok=True)
                    file_client = fs_client.get_file_client(path.name)
                    with open(local_file, 'wb') as f:
                        download = file_client.download_file()
                        f.write(download.readall())
                    file_count += 1
                    print(f" {relative_path}")
            print(f" Downloaded {file_count} files, {dir_count} directories")
        except Exception as e:
            if "404" in str(e) or "PathNotFound" in str(e):
                print(" No files found in lakehouse")
            else:
                raise
    except Exception as e:
        print(f" Error downloading lakehouse files: {e}")
def list_lakehouse_tables(workspace_path: str, lakehouse_name: str) -> list:
    """
    List tables in lakehouse.

    Args:
        workspace_path: Workspace path
        lakehouse_name: Lakehouse name

    Returns:
        List of table names
    """
    try:
        tables_path = f"{workspace_path}/{lakehouse_name}.Lakehouse/Tables"
        output = run_fab_command(["ls", tables_path])
        lines = output.strip().split('\n')
        # Keep non-empty lines, dropping any "---" separator rows.
        tables = [line.strip() for line in lines if line.strip() and not line.startswith('---')]
        return tables
    except subprocess.CalledProcessError:
        return []
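# Example (hypothetical lakehouse):
#   tables = list_lakehouse_tables("Sales.Workspace", "SalesData")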
def export_table_schema(workspace_path: str, lakehouse_name: str, table_name: str, output_file: Path) -> bool:
    """
    Export table schema.

    Args:
        workspace_path: Workspace path
        lakehouse_name: Lakehouse name
        table_name: Table name
        output_file: Output JSON file

    Returns:
        True if successful
    """
    try:
        table_path = f"{workspace_path}/{lakehouse_name}.Lakehouse/Tables/{table_name}"
        schema_output = run_fab_command(["table", "schema", table_path])
        with open(output_file, 'w') as f:
            f.write(schema_output)
        return True
    except subprocess.CalledProcessError as e:
        print(f" Failed to export schema for {table_name}: {e}")
        return False
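# Example (hypothetical table): write one table's schema as JSON next to the
# lakehouse export.
#   export_table_schema("Sales.Workspace", "SalesData", "orders",
#                       Path("./backup/Lakehouse/SalesData.Lakehouse/Tables/orders_schema.json"))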
#endregion
#region Main Download
def download_workspace(workspace_path: str, output_dir: Path, download_lakehouse_files_flag: bool = True):
    """
    Download complete workspace contents.

    Args:
        workspace_path: Workspace path (e.g., "Sales.Workspace")
        output_dir: Output directory
        download_lakehouse_files_flag: Whether to download lakehouse files
    """
    print(f"Downloading workspace: {workspace_path}")
    print(f"Output directory: {output_dir}")
    print()
    output_dir.mkdir(parents=True, exist_ok=True)
    # Get workspace ID
    print("Getting workspace ID...")
    workspace_id = run_fab_command(["get", workspace_path, "-q", "id"])
    print(f"Workspace ID: {workspace_id}")
    print()
    # Get all items
    print("Discovering workspace items...")
    items = get_workspace_items(workspace_path)
    if not items:
        print("No items found")
        return
    # Group by type
    items_by_type = defaultdict(list)
    for item in items:
        items_by_type[item["type"]].append(item)
    print(f"Found {len(items)} items across {len(items_by_type)} types:")
    for item_type, type_items in sorted(items_by_type.items()):
        print(f" {item_type}: {len(type_items)}")
    print()
    # Track statistics
    total_success = 0
    total_failed = 0
    lakehouses = []
    # Download items by type
    for item_type, type_items in sorted(items_by_type.items()):
        print(f"Downloading {item_type} items ({len(type_items)})...")
        type_dir = output_dir / item_type
        type_dir.mkdir(parents=True, exist_ok=True)
        for item in type_items:
            item_name = item["displayName"]
            item_id = item["id"]
            print(f" {item_name}...")
            # Remember lakehouses so their files and table schemas can be
            # fetched after the item definitions are exported.
            if item_type == "Lakehouse" and download_lakehouse_files_flag:
                lakehouses.append({
                    "name": item_name,
                    "id": item_id,
                    "output_dir": type_dir / f"{item_name}.Lakehouse"
                })
            success = export_item(workspace_path, item_name, item_type, type_dir)
            if success:
                total_success += 1
                print(f" Done: {item_name}")
            else:
                total_failed += 1
        print()
    # Download lakehouse files
    if lakehouses and download_lakehouse_files_flag:
        print(f"Downloading lakehouse files ({len(lakehouses)} lakehouses)...")
        for lh in lakehouses:
            lh_files_dir = lh["output_dir"] / "Files"
            lh_files_dir.mkdir(parents=True, exist_ok=True)
            download_lakehouse_files(
                workspace_id=workspace_id,
                lakehouse_id=lh["id"],
                lakehouse_name=lh["name"],
                output_dir=lh_files_dir
            )
            # Export table schemas
            print(f"\n Exporting table schemas from {lh['name']}...")
            tables = list_lakehouse_tables(workspace_path, lh["name"])
            if tables:
                tables_dir = lh["output_dir"] / "Tables"
                tables_dir.mkdir(parents=True, exist_ok=True)
                for table in tables:
                    schema_file = tables_dir / f"{table}_schema.json"
                    if export_table_schema(workspace_path, lh["name"], table, schema_file):
                        print(f" {table}")
                print(f" Exported {len(tables)} table schemas")
            else:
                print(" No tables found")
            print()
    # Summary
    print("=" * 60)
    print("Download Summary")
    print("=" * 60)
    print(f"Successfully downloaded: {total_success}")
    print(f"Failed: {total_failed}")
    print(f"Output directory: {output_dir.absolute()}")
#endregion
#region Main
def main():
    parser = argparse.ArgumentParser(
        description="Download complete Fabric workspace",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python3 download_workspace.py "Sales.Workspace"
  python3 download_workspace.py "Production.Workspace" ./backup
  python3 download_workspace.py "dev.Workspace" --no-lakehouse-files
"""
    )
    parser.add_argument("workspace", help="Workspace path: Name.Workspace or just Name")
    parser.add_argument("output_dir", nargs="?", default=None,
                        help="Output directory (default: ./workspace_downloads/<name>)")
    parser.add_argument("--no-lakehouse-files", action="store_true",
                        help="Skip downloading lakehouse files")
    args = parser.parse_args()
    workspace_path = parse_workspace_path(args.workspace)
    # Extract name for default output dir
    workspace_name = workspace_path.replace(".Workspace", "")
    if args.output_dir:
        output_dir = Path(args.output_dir)
    else:
        output_dir = Path("./workspace_downloads") / workspace_name
    try:
        download_workspace(
            workspace_path=workspace_path,
            output_dir=output_dir,
            download_lakehouse_files_flag=not args.no_lakehouse_files
        )
    except KeyboardInterrupt:
        print("\n\nDownload interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
#endregion