commit a4d116219fe54f1382e1ddeb95064f65e5e44392 Author: Zhongwei Li Date: Sun Nov 30 08:53:24 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..512c3c9 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,13 @@ +{ + "name": "breenix-ci", + "description": "GitHub Actions CI/CD workflows: authoring, optimization, and failure analysis for kernel testing", + "version": "0.0.0-2025.11.28", + "author": { + "name": "Ryan Breen", + "email": "ryan@breen.com" + }, + "skills": [ + "./skills/breenix-github-workflow-authoring", + "./skills/breenix-ci-failure-analysis" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..02841f5 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# breenix-ci + +GitHub Actions CI/CD workflows: authoring, optimization, and failure analysis for kernel testing diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..4bff3a6 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,56 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:ryanbreen/breenix:breenix-ci", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "4fbf77cb64088da2d0be0472183a27c1184b1c4a", + "treeHash": "6e4502f37c9d4e5b6c5cca7dbfa0d4a1204487d43c60b2cf9aee32aa84c23fe4", + "generatedAt": "2025-11-28T10:28:05.893797Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "breenix-ci", + "description": "GitHub Actions CI/CD workflows: authoring, optimization, and failure analysis for kernel testing" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "4924792beb3d701f9320c593e9f494e57ddfff4bdf364b4a406db24240969600" + 
}, + { + "path": ".claude-plugin/plugin.json", + "sha256": "c97fd10ebe08662e345de002f6429671252197029b1e105eebdda3d476a7ff92" + }, + { + "path": "skills/breenix-ci-failure-analysis/SKILL.md", + "sha256": "922fdca08e129fff349091aff1de310c5044ffa89347469f40e1ae54894c2cfc" + }, + { + "path": "skills/breenix-ci-failure-analysis/scripts/analyze_ci_failure.py", + "sha256": "cf9017475713a717ef83c84967901d561073b7e78f8131fff846a649c67e01da" + }, + { + "path": "skills/breenix-github-workflow-authoring/SKILL.md", + "sha256": "a7dbc5afa19317e2c770cd4e43734ddbe54c8bbe4013d7de0b584ba6b009c5a4" + }, + { + "path": "skills/breenix-github-workflow-authoring/references/breenix-ci-patterns.md", + "sha256": "cd05d03e943442126aa241c260070b29d8ae6e0552d8ac0b08cdeefb2d54d93f" + } + ], + "dirSha256": "6e4502f37c9d4e5b6c5cca7dbfa0d4a1204487d43c60b2cf9aee32aa84c23fe4" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/breenix-ci-failure-analysis/SKILL.md b/skills/breenix-ci-failure-analysis/SKILL.md new file mode 100644 index 0000000..7077d3f --- /dev/null +++ b/skills/breenix-ci-failure-analysis/SKILL.md @@ -0,0 +1,464 @@ +--- +name: ci-failure-analysis +description: This skill should be used when analyzing failed GitHub Actions CI/CD runs for Breenix kernel development. Use for diagnosing test failures, parsing QEMU logs, identifying kernel panics or faults, understanding timeout issues, and determining root causes of CI failures. +--- + +# CI Failure Analysis for Breenix + +Systematically analyze and diagnose CI/CD test failures in Breenix kernel development. + +## Purpose + +This skill provides tools and workflows for analyzing failed CI runs, understanding kernel crashes, identifying environment issues, and determining root causes. It focuses on the unique challenges of kernel development CI: QEMU logs, kernel panics, double faults, page faults, and timeout analysis. 
+ +## When to Use This Skill + +Use this skill when: + +- **CI run fails**: GitHub Actions workflow fails and you need to understand why +- **Test timeout**: Test exceeds time limit and you need to determine if it's a hang or just slow +- **Kernel panic/fault**: Double fault, page fault, or other kernel crash in CI +- **Missing output**: Expected kernel log signals don't appear +- **Environment issues**: Build or dependency problems in CI that don't occur locally +- **Regression analysis**: New PR breaks previously passing tests + +## Quick Start + +When a CI run fails: + +1. **Download artifacts**: Go to failed GitHub Actions run, download log artifacts +2. **Run analyzer**: `ci-failure-analysis/scripts/analyze_ci_failure.py target/xtask_*_output.txt` +3. **Review findings**: Analyzer reports known patterns with diagnosis and fixes +4. **Check context**: Use `--context` flag to see surrounding log lines +5. **Apply fix**: Follow suggested remediation steps + +## Failure Analysis Script + +The skill provides `analyze_ci_failure.py` to automatically detect common failures: + +### Basic Usage + +```bash +# Analyze a CI log file +ci-failure-analysis/scripts/analyze_ci_failure.py target/xtask_ring3_smoke_output.txt + +# Show context around failures +ci-failure-analysis/scripts/analyze_ci_failure.py --context target/xtask_ring3_smoke_output.txt + +# Analyze multiple logs +ci-failure-analysis/scripts/analyze_ci_failure.py target/*.txt logs/breenix_*.log +``` + +### What It Detects + +The analyzer recognizes these failure patterns: + +1. **Double Fault** - Stack corruption, unmapped exception handlers +2. **Page Fault** - Accessing unmapped or incorrectly mapped memory +3. **Test Timeout** - Exceeding time limits +4. **QEMU Not Found** - Missing system dependencies +5. **Rust Target Missing** - Wrong toolchain configuration +6. **rust-src Missing** - Missing required Rust component +7. **Userspace Binary Missing** - Forgetting to build userspace tests +8. 
**Compilation Error** - Build failures +9. **Signal Not Found** - Expected output missing (test didn't complete) +10. **Kernel Panic** - Unrecoverable errors + +### Output Format + +``` +====================================================================== +CI Failure Analysis: target/xtask_ring3_smoke_output.txt +====================================================================== +Log size: 1523 lines +Patterns detected: 2 + +────────────────────────────────────────────────────────────────────── + +[1] Page Fault + Line 1234: PAGE FAULT at 0x10001082 Error Code: 0x0 + + 📊 Diagnosis: + Page fault accessing unmapped or incorrectly mapped memory + + 🔧 Fix: + Identify the faulting address and check: + 1) Is it mapped in the active page table? + 2) Are the flags correct (USER_ACCESSIBLE, WRITABLE)? + 3) Was it recently unmapped? + + 📄 Context: + 1230: [ INFO] Process created: PID 2 + 1231: [DEBUG] Switching to process page table + 1232: [DEBUG] About to access userspace memory + 1233: [DEBUG] Buffer pointer: 0x10001082 + >>> 1234: PAGE FAULT at 0x10001082 Error Code: 0x0 + 1235: Stack trace: + 1236: 0: copy_from_user + 1237: 1: sys_write + 1238: 2: syscall_handler +``` + +## Common Failure Patterns + +### Double Fault + +**Symptoms**: +``` +DOUBLE FAULT - Error Code: 0x0 +Instruction Pointer: 0x... +Code Segment: ... Ring3 +``` + +**Common Causes**: +1. Kernel stack not mapped in process page table (Ring 3 → Ring 0 transition fails) +2. IST stack misconfigured or unmapped +3. Exception handler itself causes exception +4. Stack overflow + +**Diagnosis**: +- Check if fault occurs during syscall (int 0x80) +- Look for recent page table changes +- Verify TSS RSP0 points to valid kernel stack +- Check IST configuration + +**Fix Examples**: +- Add kernel stack mapping to process page tables +- Verify IST stacks are mapped +- Increase stack size if overflow +- Review exception handler code + +### Page Fault + +**Symptoms**: +``` +PAGE FAULT at 0x... Error Code: 0x... 
+``` + +**Error Code Decoding**: +- Bit 0 (P): 0 = not present, 1 = protection violation +- Bit 1 (W/R): 0 = read, 1 = write +- Bit 2 (U/S): 0 = kernel, 1 = user +- Bit 3 (RSVD): 1 = reserved bit violation +- Bit 4 (I/D): 1 = instruction fetch + +**Common Causes**: +1. Accessing unmapped memory +2. Writing to read-only page +3. User code accessing kernel page +4. Page table entry missing + +**Diagnosis**: +- Identify faulting address and operation +- Check if address should be mapped +- Verify page table flags (PRESENT, WRITABLE, USER_ACCESSIBLE) +- Look for recent memory operations + +### Test Timeout + +**Symptoms**: +``` +Timeout reached (60s) +... OR ... +Error: test exceeded time limit +``` + +**Distinguishing Hang vs Slow**: + +1. **Kernel hang**: No new output for extended period + - Timer interrupt not firing + - Infinite loop + - Deadlock + +2. **Legitimately slow**: Continuous output, just takes longer + - CI environment slower than local + - Verbose logging enabled + - Many tests in sequence + +**Diagnosis**: +- Check last log message - what was kernel doing? +- Is timer interrupt still firing? (look for timer ticks) +- Are there any locks being acquired? +- Does it complete locally? + +**Fixes**: +- Infinite loop: Add timeout or fix logic +- Deadlock: Review lock acquisition order +- Slow test: Increase timeout or optimize +- Hang: Add debug checkpoints to narrow down location + +### Missing Success Signal + +**Symptoms**: +``` +❌ Ring-3 smoke test failed: no evidence of userspace execution +``` + +**Common Causes**: +1. Test didn't run (compilation failed silently) +2. Kernel panicked before reaching test +3. Test ran but failed assertions +4. Signal string changed but test wasn't updated + +**Diagnosis**: +- Search log for ANY output from the test +- Check if kernel reached test execution point +- Look for earlier errors or panics +- Verify signal string matches test code + +### Compilation Error + +**Symptoms**: +``` +error[E0...]: ... 
+ --> kernel/src/... +``` + +**Common Causes**: +1. Wrong Rust nightly version +2. Missing features +3. Syntax error +4. Dependency version mismatch + +**Diagnosis**: +- Check Rust version in CI vs. expected +- Verify all required crates are available +- Look for changed dependencies +- Check for feature flag mismatches + +### Environment Issues + +**Symptoms**: +``` +qemu-system-x86_64: command not found +... OR ... +error: target 'x86_64-unknown-none' may not be installed +``` + +**Common Causes**: +1. System dependencies not installed +2. Rust components missing +3. Wrong Rust installation method +4. PATH not set correctly + +**Diagnosis**: +- Check workflow YAML for dependency installation +- Verify Rust toolchain setup +- Check for typos in package names +- Confirm correct ubuntu version + +## Analysis Workflow + +### Step 1: Identify Failure Type + +1. **Download artifacts** from failed GitHub Actions run +2. **Check Actions summary** for which step failed +3. **Determine failure category**: + - Build failure (compilation) + - Environment setup failure (missing deps) + - Test execution failure (kernel crash, timeout, wrong output) + +### Step 2: Automated Analysis + +```bash +# Run the analyzer on downloaded logs +ci-failure-analysis/scripts/analyze_ci_failure.py \ + --context \ + target/xtask_*_output.txt +``` + +Review the output for: +- Detected patterns +- Suggested diagnosis +- Recommended fixes + +### Step 3: Manual Analysis + +If automated analysis doesn't find clear patterns: + +```bash +# Search for specific error keywords +grep -i "error\|panic\|fault\|timeout" target/xtask_*_output.txt + +# Find last successful operation +grep "SUCCESS\|✓\|✅" target/xtask_*_output.txt | tail -20 + +# Look for specific subsystem activity +grep "memory\|page table\|process\|syscall" target/xtask_*_output.txt +``` + +### Step 4: Reproduce Locally + +```bash +# Run exact same command as CI +cargo run -p xtask -- ring3-smoke + +# Or use quick debug for faster iteration 
+kernel-debug-loop/scripts/quick_debug.py --signal "EXPECTED_SIGNAL" --timeout 30 +``` + +### Step 5: Compare Environments + +| Aspect | Local | CI | +|--------|-------|-----| +| Rust version | Check with `rustc --version` | Check workflow YAML | +| QEMU version | `qemu-system-x86_64 --version` | ubuntu-latest package | +| Timeout | Usually 30s | Usually 60s | +| Build cache | Warm | Cold or partial | +| System load | Low | Variable | + +### Step 6: Root Cause Analysis + +Document findings using the systematic debugging pattern: + +1. **Problem**: What failed? +2. **Root Cause**: Why did it fail? +3. **Solution**: What fixes it? +4. **Evidence**: How do you know it's fixed? + +## Integration with Other Skills + +### Use with kernel-debug-loop + +After identifying a failure, use `kernel-debug-loop` for rapid iteration: + +```bash +# Test fix with quick feedback +kernel-debug-loop/scripts/quick_debug.py \ + --signal "🎯 KERNEL_POST_TESTS_COMPLETE 🎯" \ + --timeout 15 +``` + +### Use with github-workflow-authoring + +Fix workflow issues: + +```bash +# If environment issue detected: +# 1. Identify missing dependency from analyzer output +# 2. Update workflow using github-workflow-authoring skill +# 3. 
Test change in PR +``` + +### Use with systematic-debugging + +Document the failure: + +```markdown +# Problem +CI run #123 failed with page fault at 0x10001082 + +# Root Cause +[Fill in after analysis] + +# Solution +[Fill in after fix] + +# Evidence +[Fill in after verification] +``` + +## Advanced Techniques + +### Diff Analysis + +Compare working vs broken runs: + +```bash +# Download logs from last successful run and failed run +diff -u successful_run.txt failed_run.txt | less +``` + +Look for: +- First point where outputs diverge +- Missing initialization steps +- Different memory addresses (ASLR not implemented, so addresses should match) + +### Timeline Reconstruction + +Find the last known-good state: + +```bash +grep -n "SUCCESS\|COMPLETE\|initialized" target/xtask_*_output.txt | tail -20 +``` + +This shows what completed before the failure. + +### Iterative Binary Search + +If failure point unclear: + +1. Add checkpoint log in middle of suspect region +2. Rebuild and retest +3. Narrow down based on whether checkpoint reached +4. Repeat until failure location isolated + +### Statistical Analysis + +For intermittent failures: + +```bash +# Run test 10 times, count failures +for i in {1..10}; do + cargo run -p xtask -- ring3-smoke && echo "PASS" || echo "FAIL" +done | sort | uniq -c +``` + +## Best Practices + +1. **Always download logs**: Don't rely on Actions UI truncation +2. **Check multiple logs**: Compile errors vs runtime errors vs test output +3. **Compare with local**: Reproduce failures locally when possible +4. **Search for first error**: Often followed by cascading failures +5. **Check recent changes**: What changed between last working and first broken run? +6. **Verify environment**: Toolchain versions, dependencies, configurations +7. **Document patterns**: Add new patterns to analyzer when discovered +8. **Test fixes**: Verify fix locally before pushing to CI + +## Example Analysis Session + +```bash +# 1. 
Download artifact from failed CI run +# Save to: target/xtask_ring3_smoke_output.txt + +# 2. Run automated analysis +ci-failure-analysis/scripts/analyze_ci_failure.py \ + --context target/xtask_ring3_smoke_output.txt + +# Output shows: Page Fault at 0x10001082 + +# 3. Search for context +grep -B10 -A10 "0x10001082" target/xtask_ring3_smoke_output.txt + +# 4. Identify: copy_from_user failing + +# 5. Check if this address is mapped +grep "process page table\|mapping" target/xtask_ring3_smoke_output.txt + +# 6. Hypothesis: User buffer not mapped in process page table + +# 7. Review recent changes to process memory code + +# 8. Identify fix needed + +# 9. Test locally with quick iteration +kernel-debug-loop/scripts/quick_debug.py \ + --signal "USERSPACE OUTPUT" \ + --timeout 10 + +# 10. Verify fix works + +# 11. Push to PR, monitor CI +``` + +## Summary + +CI failure analysis for Breenix requires: +- Automated pattern detection for common failures +- Manual log analysis for novel issues +- Environment comparison (local vs CI) +- Systematic root cause investigation +- Integration with debugging and testing workflows +- Documentation of findings + +The `analyze_ci_failure.py` script automates common pattern detection, but kernel debugging ultimately requires understanding the code, memory management, interrupt handling, and the specific feature being tested. diff --git a/skills/breenix-ci-failure-analysis/scripts/analyze_ci_failure.py b/skills/breenix-ci-failure-analysis/scripts/analyze_ci_failure.py new file mode 100755 index 0000000..4eb48be --- /dev/null +++ b/skills/breenix-ci-failure-analysis/scripts/analyze_ci_failure.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python3 +""" +Analyze CI failure logs to identify root causes. + +This script helps diagnose common Breenix CI failures by parsing log files +and looking for known failure patterns. 
+""" + +import argparse +import re +import sys +from pathlib import Path +from typing import List, Tuple, Optional + + +class FailurePattern: + """Represents a known failure pattern with diagnosis info.""" + def __init__(self, name: str, pattern: str, diagnosis: str, fix: str, is_regex: bool = False): + self.name = name + self.pattern = pattern if not is_regex else re.compile(pattern) + self.diagnosis = diagnosis + self.fix = fix + self.is_regex = is_regex + + def matches(self, line: str) -> bool: + """Check if this pattern matches the line.""" + if self.is_regex: + return self.pattern.search(line) is not None + return self.pattern in line + + +# Known failure patterns +FAILURE_PATTERNS = [ + FailurePattern( + "Double Fault", + r"DOUBLE FAULT.*Error Code: (0x[0-9a-fA-F]+)", + "Kernel encountered a double fault - usually indicates stack corruption, unmapped memory access during exception handling, or page table issues", + "Check: 1) Kernel stack mapping in process page tables 2) IST stack configuration 3) Page table entry flags 4) Recent memory management changes", + is_regex=True + ), + FailurePattern( + "Page Fault", + r"PAGE FAULT.*at (0x[0-9a-fA-F]+).*Error Code: (0x[0-9a-fA-F]+)", + "Page fault accessing unmapped or incorrectly mapped memory", + "Identify the faulting address and check: 1) Is it mapped in the active page table? 2) Are the flags correct (USER_ACCESSIBLE, WRITABLE)? 3) Was it recently unmapped?", + is_regex=True + ), + FailurePattern( + "Test Timeout", + r"Timeout|exceeded time limit", # also catches the "test exceeded time limit" wording + "Test exceeded time limit - could be kernel hang, infinite loop, or test too slow for CI", + "Check: 1) Does test complete locally? 2) Are there any infinite loops? 3) Is timer interrupt working? 
4) Increase timeout if legitimately slow", + is_regex=True + ), + FailurePattern( + "QEMU Not Found", + "qemu-system-x86_64: command not found", + "QEMU not installed in CI environment", + "Add 'qemu-system-x86' to system dependencies in GitHub workflow", + is_regex=False + ), + FailurePattern( + "Rust Target Missing", + "error: target 'x86_64-unknown-none' may not be installed", + "Custom kernel target not available", + "Add 'target: x86_64-unknown-none' to Rust toolchain installation step", + is_regex=False + ), + FailurePattern( + "rust-src Missing", + "error: could not compile `bootloader`", + "Missing rust-src component required for no_std builds", + "Add 'rust-src' to components list in Rust toolchain setup", + is_regex=False + ), + FailurePattern( + "Userspace Binary Missing", + r"Userspace binary not found|Error loading ELF", + "Userspace test binary not built before kernel test", + "Add userspace build step before kernel test: 'cd userspace/tests && ./build.sh'", + is_regex=True + ), + FailurePattern( + "Compilation Error", + r"error(?:\[E\d+\])?:", + "Rust compilation failed", + "Check: 1) Correct Rust nightly version 2) All required features enabled 3) No syntax errors 4) Dependencies available", + is_regex=True + ), + FailurePattern( + "Signal Not Found", + "no evidence of userspace execution", + "Expected kernel log signal not found in output - test did not complete successfully", + "Check: 1) Does kernel boot at all? 2) Does it reach the expected checkpoint? 3) Is the signal string correct? 4) Was test code executed?", + is_regex=False + ), + FailurePattern( + "Kernel Panic", + "PANIC", + "Kernel panic - unrecoverable error", + "Read panic message for specific cause. 
Common: assertion failure, unwrap() on None, index out of bounds, explicit panic!()", + is_regex=False + ), +] + + +def find_patterns(log_content: str) -> List[Tuple[FailurePattern, str, int]]: + """Find all matching failure patterns in the log.""" + matches = [] + lines = log_content.split('\n') + + for i, line in enumerate(lines, 1): + for pattern in FAILURE_PATTERNS: + if pattern.matches(line): + matches.append((pattern, line, i)) + + return matches + + +def extract_context(log_content: str, line_num: int, context: int = 5) -> str: + """Extract lines around a specific line number.""" + lines = log_content.split('\n') + start = max(0, line_num - context - 1) + end = min(len(lines), line_num + context) + + context_lines = [] + for i in range(start, end): + prefix = ">>> " if i == line_num - 1 else " " + context_lines.append(f"{prefix}{i+1:5d}: {lines[i]}") + + return '\n'.join(context_lines) + + +def analyze_log_file(log_file: Path, verbose: bool = False) -> dict: + """Analyze a log file and return findings.""" + try: + log_content = log_file.read_text(encoding="utf-8", errors="replace") # tolerate undecodable bytes instead of failing the whole analysis + except Exception as e: + return {"error": f"Failed to read log file: {e}"} + + matches = find_patterns(log_content) + + # Deduplicate by pattern name + unique_patterns = {} + for pattern, line, line_num in matches: + if pattern.name not in unique_patterns: + unique_patterns[pattern.name] = (pattern, line, line_num) + + return { + "log_file": str(log_file), + "total_lines": len(log_content.split('\n')), + "patterns_found": len(unique_patterns), + "matches": unique_patterns, + "log_content": log_content if verbose else None, + } + + +def print_analysis(analysis: dict, show_context: bool = False): + """Print analysis results in a readable format.""" + if "error" in analysis: + print(f"❌ Error: {analysis['error']}", file=sys.stderr) + return + + print(f"\n{'='*70}") + print(f"CI Failure Analysis: {analysis['log_file']}") + print(f"{'='*70}") + print(f"Log size: {analysis['total_lines']} lines") + print(f"Patterns 
detected: {analysis['patterns_found']}") + print() + + if analysis['patterns_found'] == 0: + print("✓ No known failure patterns detected") + print(" This might be:") + print(" - A novel failure not yet cataloged") + print(" - A timeout without specific error") + print(" - A test that failed silently") + print("\n Manual analysis recommended:") + print(" 1. Search for 'ERROR', 'FAIL', 'panic', 'fault' in logs") + print(" 2. Check if expected success signals appear") + print(" 3. Look for the last successful operation before hang/crash") + return + + print(f"{'─'*70}") + + for i, (name, (pattern, line, line_num)) in enumerate(analysis['matches'].items(), 1): + print(f"\n[{i}] {name}") + print(f" Line {line_num}: {line.strip()}") + print(f"\n 📊 Diagnosis:") + for diag_line in pattern.diagnosis.split('\n'): + print(f" {diag_line}") + print(f"\n 🔧 Fix:") + for fix_line in pattern.fix.split('\n'): + print(f" {fix_line}") + + if show_context and analysis['log_content']: + print(f"\n 📄 Context:") + context = extract_context(analysis['log_content'], line_num) + for ctx_line in context.split('\n'): + print(f" {ctx_line}") + + print(f"\n {'─'*66}") + + print(f"\n{'='*70}") + + +def main(): + parser = argparse.ArgumentParser( + description='Analyze Breenix CI failure logs', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Analyze CI artifact log + %(prog)s target/xtask_ring3_smoke_output.txt + + # Show context around failures + %(prog)s --context target/xtask_ring3_smoke_output.txt + + # Analyze multiple logs + %(prog)s target/*.txt + + # Analyze kernel logs from the logs directory + %(prog)s logs/breenix_*.log + """ + ) + parser.add_argument( + 'log_files', + nargs='+', + type=Path, + help='Log files to analyze' + ) + parser.add_argument( + '--context', + action='store_true', + help='Show context lines around failures' + ) + parser.add_argument( + '--verbose', + action='store_true', + help='Verbose output' + ) + + args = parser.parse_args() + + # Analyze each log 
file + for log_file in args.log_files: + if not log_file.exists(): + print(f"❌ File not found: {log_file}", file=sys.stderr) + continue + + analysis = analyze_log_file(log_file, verbose=args.verbose or args.context) # --context needs the retained log text, so keep it even without --verbose + print_analysis(analysis, show_context=args.context) + print() + + +if __name__ == '__main__': + main() diff --git a/skills/breenix-github-workflow-authoring/SKILL.md b/skills/breenix-github-workflow-authoring/SKILL.md new file mode 100644 index 0000000..d761365 --- /dev/null +++ b/skills/breenix-github-workflow-authoring/SKILL.md @@ -0,0 +1,457 @@ +--- +name: github-workflow-authoring +description: This skill should be used when creating or improving GitHub Actions CI/CD workflows for Breenix kernel development. Use for authoring new test workflows, optimizing existing CI pipelines, adding new test types, fixing workflow configuration issues, or adapting workflows for new kernel features. +--- + +# GitHub Workflow Authoring for Breenix + +Create and improve GitHub Actions workflows for Breenix OS kernel development and testing. + +## Purpose + +This skill provides patterns, templates, and best practices for authoring GitHub Actions workflows specifically for Breenix kernel development. It addresses the unique challenges of OS kernel CI/CD: QEMU virtualization, custom Rust targets, userspace binary building, timeout management, and kernel-specific test patterns. 
+ +## When to Use This Skill + +Use this skill when: + +- **Creating new test workflows**: Adding CI for new kernel features or test suites +- **Optimizing CI performance**: Reducing build times, improving caching, tuning timeouts +- **Fixing CI failures**: Workflow configuration issues, missing dependencies, wrong environment +- **Adapting workflows**: Modifying workflows for new kernel capabilities or test requirements +- **Debugging CI issues**: Understanding why workflows fail, reproducing issues locally +- **Adding test coverage**: Expanding CI to cover more kernel subsystems or scenarios + +## Key Breenix CI Patterns + +### Rust Toolchain Requirements + +Breenix requires specific Rust configuration: + +```yaml +- name: Install Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: nightly-2025-06-24 # Pinned for consistency + override: true + target: x86_64-unknown-none # Custom kernel target + components: rust-src, llvm-tools-preview +``` + +**Critical**: The Rust nightly version is pinned to avoid unexpected breakage from compiler changes. + +### System Dependencies + +All kernel tests require QEMU and supporting tools: + +```yaml +- name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y \ + qemu-system-x86 \ + qemu-utils \ + ovmf \ + mtools \ + dosfstools \ + xorriso \ + nasm \ + lld +``` + +### Userspace Binary Building + +**CRITICAL**: Before running kernel tests that execute userspace code, userspace binaries must be built: + +```yaml +- name: Build userspace tests + run: | + export PATH="$PATH:$(rustc --print sysroot)/lib/rustlib/x86_64-unknown-linux-gnu/bin" + cd userspace/tests + ./build.sh +``` + +Forgetting this step causes kernel tests to fail mysteriously! 
+ +### Using xtask for Tests + +Breenix uses the `xtask` pattern for complex test workflows: + +```yaml +- name: Run Ring-3 smoke test + run: cargo run -p xtask -- ring3-smoke +``` + +This handles: +- Building the kernel with correct features +- Starting QEMU with appropriate flags +- Monitoring serial output for success signals +- Timeout management (30s local, 60s CI) +- Cleanup and artifact collection + +## Timeout Strategies + +Different workflows need different timeouts: + +```yaml +jobs: + quick-test: + timeout-minutes: 20 # Build + simple smoke test + + full-integration: + timeout-minutes: 45 # Complete test suite with shared QEMU + + code-quality: + timeout-minutes: 15 # Clippy and static analysis +``` + +**Rule of thumb**: CI environments are 2-3x slower than local development machines. Budget accordingly. + +## Caching for Performance + +Proper caching reduces build times from ~10 minutes to ~2 minutes: + +```yaml +- name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.lock') }} +``` + +Cache invalidates when `Cargo.lock` changes, ensuring fresh builds for dependency updates. + +## Log Artifact Upload + +Always upload logs, especially on failure: + +```yaml +- name: Upload logs + if: always() # Run even if previous steps failed + uses: actions/upload-artifact@v4 + with: + name: breenix-logs + path: | + logs/*.log + target/xtask_ring3_smoke_output.txt + target/xtask_ring3_enosys_output.txt + if-no-files-found: ignore + retention-days: 7 +``` + +This enables post-mortem analysis of failed runs. + +## Workflow Patterns + +### Pattern 1: Smoke Test (Fast Feedback) + +Runs on every push to provide quick feedback: + +```yaml +name: Ring-3 Smoke Test + +on: + push: + branches: [ "**" ] + pull_request: + +jobs: + ring3-smoke: + runs-on: ubuntu-latest + timeout-minutes: 20 + + steps: + - uses: actions/checkout@v4 + - name: Install Rust + # ... 
(see above) + - name: Cache cargo + # ... (see above) + - name: Install system dependencies + # ... (see above) + - name: Build userspace tests + # ... (see above) + - name: Run smoke test + run: cargo run -p xtask -- ring3-smoke + - name: Upload logs + if: always() + # ... (see above) +``` + +**Purpose**: Verify basic kernel functionality (boot, userspace execution) quickly. + +### Pattern 2: Code Quality (Static Analysis) + +Runs on kernel code changes: + +```yaml +name: Code Quality + +on: + push: + paths: + - 'kernel/**' + - '.github/workflows/code-quality.yml' + +jobs: + clippy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: nightly-2025-06-24 + override: true + target: x86_64-unknown-none + components: clippy, rust-src + - name: Run Clippy + run: | + cd kernel + cargo clippy --target x86_64-unknown-none \ + -- -Dclippy::debug_assert_with_mut_call +``` + +**Purpose**: Catch code quality issues before they reach main branch. + +### Pattern 3: Manual Trigger (Expensive Tests) + +For tests that take too long to run on every commit: + +```yaml +name: Full Integration Tests + +on: + workflow_dispatch: # Only run manually + +jobs: + integration: + timeout-minutes: 45 + # ... full test suite +``` + +**Purpose**: Comprehensive testing before releases or major merges. + +## Reference Files + +The skill includes reference material in the `references/` directory: + +- **`breenix-ci-patterns.md`**: Comprehensive CI patterns, timeout strategies, caching, success signals + +To reference these during workflow authoring: +```bash +cat github-workflow-authoring/references/breenix-ci-patterns.md +``` + +## Workflow Creation Process + +When creating a new workflow: + +1. **Identify the test type**: Smoke test, integration test, static analysis, etc. +2. **Determine trigger**: Every push, PR only, manual, or path-specific +3. **Set appropriate timeout**: Based on test complexity and CI overhead +4. **Copy template from existing workflow**: Start with `ring3-smoke.yml` or `code-quality.yml` +5. 
**Customize steps**: Add specific build steps, test commands, or checks +6. **Add caching**: Use cargo cache unless testing cache-sensitive issues +7. **Configure artifact upload**: Always include logs for debugging +8. **Test locally first**: Use the same commands that will run in CI +9. **Add to PR**: Update .github/workflows/ and create PR for review +10. **Monitor first runs**: Watch for environment-specific issues + +## Common Workflow Issues and Fixes + +### Issue: "rustup: command not found" +**Cause**: Rust toolchain not installed or wrong action version +**Fix**: Use `actions-rs/toolchain@v1` or `dtolnay/rust-toolchain` + +### Issue: "error: target 'x86_64-unknown-none' may not be installed" +**Cause**: Missing custom target +**Fix**: Add `target: x86_64-unknown-none` to toolchain setup + +### Issue: "error: could not compile `bootloader`" +**Cause**: Missing `rust-src` component +**Fix**: Add `rust-src` to components list + +### Issue: "qemu-system-x86_64: command not found" +**Cause**: QEMU not installed in CI environment +**Fix**: Add qemu-system-x86 to apt-get install list + +### Issue: Test times out but works locally +**Cause**: CI environment slower, or test hung +**Fix**: Increase timeout-minutes or investigate kernel hang + +### Issue: Cache seems corrupted +**Cause**: Cache key collision or partial build artifacts +**Fix**: Change cache key (add version suffix) or clear cache + +### Issue: Userspace test fails with "file not found" +**Cause**: Userspace binaries not built before kernel test +**Fix**: Add userspace build step before kernel test runs + +## Advanced Patterns + +### Matrix Builds (Future) + +Test multiple configurations: + +```yaml +strategy: + matrix: + mode: [uefi, bios] + features: [testing, production] + +steps: + - run: cargo run --bin qemu-${{ matrix.mode }} --features ${{ matrix.features }} +``` + +### Conditional Steps + +Skip steps based on conditions: + +```yaml +- name: Upload logs + if: failure() # Only on failure + # or 
+ if: always() # Always run + # or + if: success() # Only on success +``` + +### Environment-Specific Behavior + +```yaml +env: + CI: true + RUST_BACKTRACE: full + BREENIX_TIMEOUT: 60 # Used by xtask +``` + +## Best Practices + +1. **Pin Rust version**: Avoid unexpected breakage from nightly changes +2. **Cache aggressively**: Cargo builds are slow, caching saves 5-8 minutes +3. **Fail fast**: Set reasonable timeouts to avoid wasting CI minutes +4. **Upload artifacts**: Always capture logs for post-mortem analysis +5. **Test locally first**: Run the exact commands that CI will run +6. **Use xtask**: Complex test logic belongs in xtask, not YAML +7. **Monitor CI time**: If workflows exceed 20-30 minutes, consider splitting +8. **Document workflows**: Add comments explaining non-obvious steps + +## Integration with Breenix Development + +When adding new kernel features that require CI testing: + +1. **Identify test requirements**: What needs to be verified? +2. **Create or extend xtask**: Add new test command (e.g., `ring3-fork-test`) +3. **Add workflow**: Either extend existing or create new workflow file +4. **Add success signal**: Add kernel log marker for test completion +5. 
**Update workflow docs**: Document the new test in CLAUDE.md or README + +## Example: Adding a New Test Workflow + +Let's say you want to add CI for testing the fork() syscall: + +```yaml +name: Fork System Call Test + +on: + push: + paths: + - 'kernel/src/process/**' + - 'kernel/src/syscall/**' + - 'userspace/tests/fork_test*' + +jobs: + fork-test: + runs-on: ubuntu-latest + timeout-minutes: 25 + + steps: + - uses: actions/checkout@v4 + + - name: Install Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: nightly-2025-06-24 + override: true + target: x86_64-unknown-none + components: rust-src, llvm-tools-preview + + - name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: cargo-${{ runner.os }}-fork-${{ hashFiles('**/Cargo.lock') }} + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y qemu-system-x86 ovmf nasm + + - name: Build userspace tests + run: | + export PATH="$PATH:$(rustc --print sysroot)/lib/rustlib/x86_64-unknown-linux-gnu/bin" + cd userspace/tests + ./build.sh + + - name: Run fork test + run: cargo run -p xtask -- fork-test + + - name: Upload logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: fork-test-logs + path: | + logs/*.log + target/xtask_fork_test_output.txt + retention-days: 7 +``` + +Then update `xtask/src/main.rs`: + +```rust +#[derive(StructOpt)] +enum Cmd { + Ring3Smoke, + Ring3Enosys, + ForkTest, // New test +} + +fn main() -> Result<()> { + match Cmd::from_args() { + Cmd::Ring3Smoke => ring3_smoke(), + Cmd::Ring3Enosys => ring3_enosys(), + Cmd::ForkTest => fork_test(), + } +} + +fn fork_test() -> Result<()> { + // Similar to ring3_smoke but looks for fork-specific signals + // ... +} +``` + +## Troubleshooting CI Failures + +When a workflow fails: + +1. **Download artifacts**: Get the log files from the Actions run +2. **Search for errors**: Look for panic, double fault, timeout messages +3. 
**Compare with local**: Run the exact same command locally +4. **Check environment**: Verify Rust version, QEMU version, dependencies +5. **Reproduce in clean environment**: Use Docker or fresh VM if needed +6. **Use ci-failure-analysis skill**: Systematic analysis of CI failures + +## Summary + +GitHub workflow authoring for Breenix requires understanding: +- Rust nightly toolchain with custom targets +- QEMU-based kernel testing patterns +- xtask for test orchestration +- Timeout management for CI environments +- Caching strategies for performance +- Log artifact collection for debugging + +Always reference the existing workflows as templates, test locally before committing, and leverage the xtask pattern for complex test logic. diff --git a/skills/breenix-github-workflow-authoring/references/breenix-ci-patterns.md b/skills/breenix-github-workflow-authoring/references/breenix-ci-patterns.md new file mode 100644 index 0000000..141a619 --- /dev/null +++ b/skills/breenix-github-workflow-authoring/references/breenix-ci-patterns.md @@ -0,0 +1,240 @@ +# Breenix CI/CD Patterns Reference + +## Core Requirements + +### Rust Toolchain +- **Nightly version**: `nightly-2025-06-24` (pinned for consistency) +- **Components**: `rust-src`, `llvm-tools-preview` +- **Optional**: `clippy` for code quality checks +- **Target**: `x86_64-unknown-none` for kernel builds + +### System Dependencies +```bash +sudo apt-get update +sudo apt-get install -y \ + qemu-system-x86 \ + qemu-utils \ + ovmf \ + mtools \ + dosfstools \ + xorriso \ + nasm \ + lld +``` + +### Build Tools +- **llvm-tools**: Add to PATH: `$(rustc --print sysroot)/lib/rustlib/x86_64-unknown-linux-gnu/bin` +- **ld.lld**: Required for linking userspace binaries + +## Breenix-Specific Build Patterns + +### Userspace Binary Building +Must be done before kernel tests that execute userspace code: + +```bash +export PATH="$PATH:$(rustc --print sysroot)/lib/rustlib/x86_64-unknown-linux-gnu/bin" +cd userspace/tests +./build.sh 
+``` + +### Kernel Build with Testing Features +```bash +cargo run --release --features testing --bin qemu-uefi -- -serial stdio -display none +``` + +### Using xtask for Tests +Breenix uses the `xtask` pattern for complex test workflows: + +```bash +cargo run -p xtask -- ring3-smoke +cargo run -p xtask -- ring3-enosys +``` + +## Timeout Strategies + +### Typical Timeouts +- **Code quality checks**: 10-15 minutes +- **Build + simple tests**: 20 minutes +- **Full integration tests**: 45 minutes +- **CI environment**: Add 2-3x overhead for slower runners + +### QEMU Execution Timeouts +- **Local**: 30 seconds for smoke tests +- **CI**: 60 seconds (logs are verbose, builds are slower) +- **File creation**: 30s local, 300s (5 min) CI + +## Caching Strategies + +### Cargo Cache +```yaml +- uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.lock') }} +``` + +### Benefits +- Reduces build time from ~10 minutes to ~2 minutes +- Especially important for dependency-heavy builds +- Invalidates on Cargo.lock changes + +## Test Execution Patterns + +### Serial Output to File +```yaml +- name: Run test + run: cargo run -p xtask -- ring3-smoke +``` + +Internally, xtask redirects QEMU's serial output to a file with: +```text +-serial file:target/xtask_ring3_smoke_output.txt +``` + +### Log Artifact Upload +```yaml +- name: Upload logs + if: always() # Run even on failure + uses: actions/upload-artifact@v4 + with: + name: breenix-logs + path: | + logs/*.log + target/xtask_ring3_smoke_output.txt + if-no-files-found: ignore + retention-days: 7 +``` + +## Success Detection Patterns + +### Looking for Signals in Output +```rust +// Check for userspace execution evidence +if contents.contains("USERSPACE OUTPUT: Hello from userspace") || + (contents.contains("Context switch: from_userspace=true, CS=0x33") && + contents.contains("restore_userspace_thread_context: Restoring thread")) +``` + +### Common Success Signals +- `USERSPACE OUTPUT: 
Hello from userspace` +- `🎯 KERNEL_POST_TESTS_COMPLETE 🎯` +- `Context switch: from_userspace=true, CS=0x33` +- `✅ SUCCESS: Userspace syscall completed` + +### Common Failure Patterns +- `DOUBLE FAULT` - Page table or stack issues +- `PAGE FAULT` - Memory mapping problems +- `Timeout` - Test hung or infinite loop +- Build errors - Missing dependencies or wrong toolchain + +## Workflow Triggers + +### Current Patterns +```yaml +# Run on all branches +on: + push: + branches: [ "**" ] + pull_request: + +# Run only manually (for expensive tests) +on: + workflow_dispatch: + +# Run on specific paths +on: + push: + paths: + - 'kernel/**' + - '.github/workflows/code-quality.yml' +``` + +## Environment Variables + +### Useful Vars +```yaml +env: + RUST_BACKTRACE: full + CARGO_UNSTABLE_BINDEPS: true # Enables Cargo artifact dependencies (-Z bindeps) + CI: true # Marks a CI run so tests can use longer timeouts +``` + +## Common Workflow Structure + +```yaml +name: Test Name +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-22.04 + timeout-minutes: 20 + + steps: + - uses: actions/checkout@v4 + + - name: Install Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: nightly-2025-06-24 + override: true + target: x86_64-unknown-none + components: rust-src, llvm-tools-preview + + - name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.lock') }} + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y qemu-system-x86 ovmf nasm + + - name: Build userspace tests + run: | + export PATH="$PATH:$(rustc --print sysroot)/lib/rustlib/x86_64-unknown-linux-gnu/bin" + cd userspace/tests + ./build.sh + + - name: Run test + run: cargo run -p xtask -- ring3-smoke + + - name: Upload logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-logs + path: | + logs/*.log + target/xtask_*_output.txt + retention-days: 7 +``` + +## Debugging Failed CI + +### Step 1: 
Check Logs +Download artifacts from the Actions run and look for: +- Compilation errors (wrong Rust version, missing components) +- Missing system dependencies (QEMU, OVMF, etc.) +- Whether the failure is a timeout or an actual kernel panic + +### Step 2: Reproduce Locally +```bash +# Use the exact same commands as CI +cargo run -p xtask -- ring3-smoke +``` + +### Step 3: Common Fixes +- **Rust version mismatch**: Update toolchain specification +- **Missing QEMU**: Add to system dependencies +- **Timeout**: Increase timeout or optimize test +- **Userspace not built**: Add userspace build step before kernel test +- **Cache corruption**: Clear cache or change cache key