#!/usr/bin/env python3
"""Generate interactive HTML file browser for must-gather extraction."""

import os
import sys
import json
import hashlib
import html as html_module
from datetime import datetime
from pathlib import Path


def human_readable_size(size_bytes):
    """Convert bytes to human-readable format."""
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size_bytes < 1024.0:
            return f"{size_bytes:.1f} {unit}"
        size_bytes /= 1024.0
    return f"{size_bytes:.1f} PB"


def get_file_type(filename):
    """Determine file type based on extension."""
    ext = filename.lower().split('.')[-1] if '.' in filename else ''
    type_map = {
        'log': 'log', 'txt': 'log',
        'yaml': 'yaml', 'yml': 'yaml',
        'json': 'json',
        'xml': 'xml',
        'crt': 'cert', 'pem': 'cert', 'key': 'cert',
        'tar': 'archive', 'gz': 'archive', 'tgz': 'archive', 'zip': 'archive',
        'sh': 'script', 'py': 'script',
        'conf': 'config', 'cfg': 'config', 'ini': 'config',
    }
    return type_map.get(ext, 'other')

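# Example values (illustrative only):
#   human_readable_size(0)    -> '0.0 B'
#   human_readable_size(1536) -> '1.5 KB'
#   get_file_type('pods.yaml')          -> 'yaml'
#   get_file_type('must-gather.tar.gz') -> 'archive'  (only the final
#                                          extension is considered)
#   get_file_type('README')             -> 'other'
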
def get_file_icon(file_type):
    """Get icon character for file type."""
    icons = {
        'log': '📄',
        'yaml': '📋',
        'json': '{ }',
        'xml': '',
        'cert': '🔐',
        'archive': '📦',
        'script': '⚙️',
        'config': '⚙️',
        'other': '📄',
    }
    return icons.get(file_type, '📄')


def scan_directory(base_path):
    """Scan directory and collect file information."""
    files = []
    type_counts = {}
    dir_counts = {}
    total_size = 0

    for root, dirs, filenames in os.walk(base_path):
        # Skip the _links directory (it holds generated viewer pages)
        if '_links' in dirs:
            dirs.remove('_links')

        for filename in filenames:
            file_path = os.path.join(root, filename)
            try:
                # Get relative path from base_path
                rel_path = os.path.relpath(file_path, base_path)

                # Get file info
                stat_info = os.stat(file_path)
                size = stat_info.st_size
                total_size += size

                # Determine file type
                file_type = get_file_type(filename)
                type_counts[file_type] = type_counts.get(file_type, 0) + 1

                # Get directory path (everything except filename)
                dir_path = os.path.dirname(rel_path)

                # Get top-level directory (first segment after content/)
                top_level_dir = ''
                if dir_path.startswith('content/'):
                    path_parts = dir_path.split('/', 2)
                    if len(path_parts) >= 2:
                        top_level_dir = path_parts[1]
                        dir_counts[top_level_dir] = dir_counts.get(top_level_dir, 0) + 1
                elif '/' in dir_path:
                    # If not under content/, use first directory
                    top_level_dir = dir_path.split('/', 1)[0]
                    dir_counts[top_level_dir] = dir_counts.get(top_level_dir, 0) + 1

                files.append({
                    'name': filename,
                    'path': rel_path,
                    'dir': dir_path,
                    'top_level_dir': top_level_dir,
                    'size': size,
                    'size_human': human_readable_size(size),
                    'type': file_type,
                    'modified': datetime.fromtimestamp(stat_info.st_mtime).isoformat(),
                })
            except Exception as e:
                print(f"WARNING: Could not process {file_path}: {e}", file=sys.stderr)

    # Sort files by path
    files.sort(key=lambda f: f['path'])

    return files, type_counts, dir_counts, total_size

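# scan_directory returns (files, type_counts, dir_counts, total_size); for a
# single file at content/namespaces/pods.yaml the result would look like
# (illustrative values):
#   files       -> [{'name': 'pods.yaml', 'path': 'content/namespaces/pods.yaml',
#                    'dir': 'content/namespaces', 'top_level_dir': 'namespaces',
#                    'size': 2048, 'size_human': '2.0 KB', 'type': 'yaml', ...}]
#   type_counts -> {'yaml': 1}
#   dir_counts  -> {'namespaces': 1}
#   total_size  -> 2048
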
def generate_html_report(files, type_counts, dir_counts, total_size,
                         prowjob_name, build_id, target, gcsweb_url):
    """Generate an interactive HTML file browser."""
    total_files = len(files)
    total_size_human = human_readable_size(total_size)

    # The markup below is a minimal sketch: the class names and ids are
    # placeholders (the page's original styling is not reproduced), wired to
    # the small stand-in script appended before </body>.
    html = f'''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Must-Gather Browser: {build_id}</title>
</head>
<body>
    <div class="viewer-pane">
        <span>File Viewer</span>
        <label>Height: <input type="number" id="viewer-height" value="600"></label>
        <iframe name="file-viewer" id="file-viewer" width="100%" height="600"></iframe>
    </div>
    <h1>Must-Gather File Browser</h1>
    <div class="job-info">
        <div>Prow Job: {prowjob_name}</div>
        <div>Build ID: {build_id}</div>
        <div>Target: {target}</div>
        <div>GCS URL: <a href="{gcsweb_url}" target="_blank">View in gcsweb</a></div>
        <div>Local Path: .work/prow-job-extract-must-gather/{build_id}/logs/</div>
    </div>
    <div class="stats">
        <div class="stat-card"><div class="stat-value">{total_files:,}</div><div class="stat-label">Total Files</div></div>
        <div class="stat-card"><div class="stat-value">{total_size_human}</div><div class="stat-label">Total Size</div></div>
'''
    # Add stats for each file type
    for file_type in sorted(type_counts.keys()):
        count = type_counts[file_type]
        html += f'''        <div class="stat-card"><div class="stat-value">{count:,}</div><div class="stat-label">{file_type}</div></div>
'''

    html += '''    </div>
'''
    # Add filter buttons for each type (the empty value clears the filter)
    html += '''    <div class="filter-bar" id="type-filters">
        <button data-filter-type="">All types</button>
'''
    for file_type in sorted(type_counts.keys()):
        count = type_counts[file_type]
        html += f'''        <button data-filter-type="{file_type}">{file_type} ({count:,})</button>
'''
    html += '''    </div>
'''
    # Add filter buttons for each top-level directory
    html += '''    <div class="filter-bar" id="dir-filters">
        <button data-filter-dir="">All directories</button>
'''
    for directory in sorted(dir_counts.keys()):
        count = dir_counts[directory]
        # Display name with proper formatting
        display_name = directory if directory else '(root)'
        html += f'''        <button data-filter-dir="{html_module.escape(directory)}">{html_module.escape(display_name)} ({count:,})</button>
'''
    html += '''    </div>
'''
    # Regex search box; the error label is unhidden by the page script when
    # the pattern does not compile
    html += '''    <div class="search-bar">
        <input type="text" id="search-box" placeholder="Filter files by path (regex)">
        <span id="regex-error" style="display: none">Invalid regex pattern</span>
    </div>
'''
    html += '''    <div class="file-list">
'''
    # Add file items
    for file in files:
        icon = get_file_icon(file['type'])
        # Use the generated inline-viewer page for the iframe if available,
        # otherwise use the original file; symlink_path is set to None for
        # files too large to inline, so `or` supplies the fallback
        iframe_path = file.get('symlink_path') or f"logs/{file['path']}"
        original_path = f"logs/{file['path']}"
        html += f'''        <div class="file-item" data-type="{file['type']}" data-dir="{html_module.escape(file['top_level_dir'])}" data-path="{html_module.escape(file['path'])}">
            <span class="file-icon">{icon}</span>
            <a href="{iframe_path}" target="file-viewer">{html_module.escape(file['name'])}</a>
            <a href="{original_path}" class="raw-link">raw</a>
            <span class="file-meta">{html_module.escape(file['dir'])} · {file['size_human']} · {file['type']}</span>
        </div>
'''
    html += '''    </div>
'''
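    # Stand-in script for filtering, regex search, and the viewer-height
    # control, wired to the placeholder ids and data attributes above; this is
    # a sketch of the page's behavior, not the original inline JavaScript.
    html += '''<script>
var activeType = '';
var activeDir = '';

function applyFilters() {
    var query = document.getElementById('search-box').value;
    var err = document.getElementById('regex-error');
    var re = null;
    err.style.display = 'none';
    if (query) {
        try {
            re = new RegExp(query, 'i');
        } catch (e) {
            err.style.display = '';  // show "Invalid regex pattern"
            return;
        }
    }
    document.querySelectorAll('.file-item').forEach(function (item) {
        var show = (!activeType || item.dataset.type === activeType) &&
                   (!activeDir || item.dataset.dir === activeDir) &&
                   (!re || re.test(item.dataset.path));
        item.style.display = show ? '' : 'none';
    });
}

document.querySelectorAll('[data-filter-type]').forEach(function (btn) {
    btn.addEventListener('click', function () {
        activeType = btn.dataset.filterType;
        applyFilters();
    });
});
document.querySelectorAll('[data-filter-dir]').forEach(function (btn) {
    btn.addEventListener('click', function () {
        activeDir = btn.dataset.filterDir;
        applyFilters();
    });
});
document.getElementById('search-box').addEventListener('input', applyFilters);
document.getElementById('viewer-height').addEventListener('change', function () {
    document.getElementById('file-viewer').height = this.value;
});
</script>
'''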
    html += '''</body>
</html>
'''
    return html


def create_txt_symlinks(logs_dir, files):
    """Create .html files with escaped content for files under 1MB to prevent
    download dialogs."""
    MAX_INLINE_SIZE = 1 * 1024 * 1024  # 1MB

    links_dir = os.path.join(logs_dir, 'content', '_links')

    # Create _links directory if it doesn't exist
    os.makedirs(links_dir, exist_ok=True)

    html_count = 0
    for file in files:
        if file['size'] < MAX_INLINE_SIZE:
            # Create HTML file with escaped content
            original_path = os.path.join(logs_dir, file['path'])

            # Generate unique HTML name by hashing the full path
            path_hash = hashlib.md5(file['path'].encode()).hexdigest()[:8]
            html_name = f"{file['name']}.{path_hash}.html"
            html_path = os.path.join(links_dir, html_name)

            try:
                # Read original file content
                with open(original_path, 'r', encoding='utf-8', errors='replace') as f:
                    content = f.read()

                # Split into lines and size the line-number gutter
                lines = content.split('\n')
                line_count = len(lines)
                line_number_width = len(str(line_count))
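                # e.g. a 1,234-line file gives a gutter width of 4: line 7 is
                # padded to "   7" and line 1234 renders as "1234"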
                # Build content with line numbers; wrapping each line in a
                # span is an assumption made here so the find script appended
                # below has something to highlight
                numbered_lines = []
                for i, line in enumerate(lines, 1):
                    escaped_line = html_module.escape(line)
                    line_num = str(i).rjust(line_number_width)
                    numbered_lines.append(f'<span class="line" id="L{i}">{line_num} {escaped_line}</span>')
                numbered_content = '\n'.join(numbered_lines)

                # Wrap in a minimal HTML page (placeholder markup, as above)
                html_content = f'''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>{html_module.escape(file['name'])}</title>
</head>
<body>
    <div class="find-bar">
        <input type="text" id="find-box" placeholder="Find in file (regex)">
        <span id="regex-error" style="display: none">Invalid regex pattern</span>
    </div>
    <pre class="numbered">{numbered_content}
</pre>
'''
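                # Stand-in find script for the per-file viewer page, wired to
                # the placeholder ids above; a sketch that simply highlights
                # matching lines, not the original page's script.
                html_content += '''<script>
document.getElementById('find-box').addEventListener('input', function () {
    var err = document.getElementById('regex-error');
    var re = null;
    err.style.display = 'none';
    if (this.value) {
        try {
            re = new RegExp(this.value, 'i');
        } catch (e) {
            err.style.display = '';  // show "Invalid regex pattern"
            return;
        }
    }
    document.querySelectorAll('span.line').forEach(function (line) {
        line.style.background = (re && re.test(line.textContent)) ? '#ff6' : '';
    });
});
</script>
'''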
                html_content += '''</body>
</html>
'''

                # Write HTML file
                with open(html_path, 'w', encoding='utf-8') as f:
                    f.write(html_content)

                # Store HTML path in file metadata
                file['symlink_path'] = f"logs/content/_links/{html_name}"
                html_count += 1
            except Exception as e:
                print(f"WARNING: Could not create HTML for {file['path']}: {e}",
                      file=sys.stderr)
                file['symlink_path'] = None
        else:
            file['symlink_path'] = None

    print(f"Created {html_count:,} .html files for inline viewing")
    return files


def main():
    if len(sys.argv) < 6:
        print("Usage: generate_html_report.py "
              "<logs_dir> <prowjob_name> <build_id> <target> <gcsweb_url>")
        sys.exit(1)

    logs_dir = sys.argv[1]
    prowjob_name = sys.argv[2]
    build_id = sys.argv[3]
    target = sys.argv[4]
    gcsweb_url = sys.argv[5]

    # Validate logs directory
    if not os.path.exists(logs_dir):
        print(f"ERROR: Logs directory not found: {logs_dir}", file=sys.stderr)
        sys.exit(1)

    print("Scanning directory tree...")
    files, type_counts, dir_counts, total_size = scan_directory(logs_dir)
    print(f"Found {len(files):,} files ({human_readable_size(total_size)})")

    print("Creating .html files for inline viewing...")
    files = create_txt_symlinks(logs_dir, files)

    print("Generating HTML report...")
    html = generate_html_report(files, type_counts, dir_counts, total_size,
                                prowjob_name, build_id, target, gcsweb_url)

    # Write the report next to the logs directory
    output_dir = os.path.dirname(logs_dir)
    output_file = os.path.join(output_dir, 'must-gather-browser.html')

    # Write to file (utf-8, since icons in the page are non-ASCII)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html)

    print(f"Report generated: {output_file}")


if __name__ == '__main__':
    main()
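
# Example invocation (all values below are illustrative, not real job data):
#   ./generate_html_report.py \
#       .work/prow-job-extract-must-gather/1234567890/logs \
#       periodic-my-prow-job 1234567890 my-target \
#       https://gcsweb.example.com/gcs/bucket/logs/periodic-my-prow-job/1234567890/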