Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 18:26:08 +08:00
commit 8f22ddf339
295 changed files with 59710 additions and 0 deletions

View File

@@ -0,0 +1,448 @@
# Telemetry Integration Examples
This document provides practical examples of integrating telemetry tracking into Betty Framework CLI entrypoints, workflows, and agents.
## Quick Start
The Betty Framework provides two main approaches for telemetry integration:
1. **Decorator Pattern** - For CLI entrypoints with standard main() functions
2. **Manual Capture** - For workflows, agents, and custom integrations
## 1. Decorator Pattern (Recommended for CLI)
### Standard CLI with Return Code
For CLI entrypoints that return an exit code:
```python
#!/usr/bin/env python3
import sys
import os
from typing import Optional, List
# Ensure project root on path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
from betty.logging_utils import setup_logger
from betty.telemetry_integration import telemetry_tracked
logger = setup_logger(__name__)
@telemetry_tracked(skill_name="my.skill", caller="cli")
def main(argv: Optional[List[str]] = None) -> int:
"""Entry point for CLI execution."""
argv = argv or sys.argv[1:]
# Your skill logic here
if not argv:
logger.error("Missing required arguments")
return 1
# Process and return success
return 0
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
```
**What the decorator does:**
- Automatically measures execution time
- Captures success/failure based on return code (0 = success, non-zero = failed)
- Logs telemetry with sanitized inputs
- Non-blocking - telemetry failures don't affect the main function
### Example: workflow.validate Integration
The `workflow.validate` skill has been updated with telemetry tracking:
```python
# skills/workflow.validate/workflow_validate.py
from betty.telemetry_integration import telemetry_tracked
@telemetry_tracked(skill_name="workflow.validate", caller="cli")
def main(argv: Optional[List[str]] = None) -> int:
"""Entry point for CLI execution."""
# ... existing validation logic ...
return 0 if response["ok"] else 1
```
**Test it:**
```bash
python3 skills/workflow.validate/workflow_validate.py example.yaml
cat registry/telemetry.json
```
## 2. Manual Capture Pattern
### Direct Function Call
For programmatic telemetry capture:
```python
from skills.telemetry.capture.telemetry_capture import create_telemetry_entry, capture_telemetry
import time
# Track execution manually
start_time = time.time()
try:
# Your logic here
result = execute_my_skill(param1, param2)
status = "success"
except Exception as e:
status = "failed"
raise
finally:
duration_ms = int((time.time() - start_time) * 1000)
entry = create_telemetry_entry(
skill_name="my.skill",
inputs={"param1": param1, "param2": param2},
status=status,
duration_ms=duration_ms,
caller="api"
)
capture_telemetry(entry)
```
### Workflow Execution Tracking
For capturing telemetry within workflow execution:
```python
# In workflow.compose or similar workflow executor
from betty.telemetry_integration import track_skill_execution
def execute_workflow_step(step_config: dict, workflow_name: str):
"""Execute a workflow step with telemetry tracking."""
skill_name = step_config["skill"]
skill_args = step_config["args"]
result = track_skill_execution(
skill_name=skill_name,
func=lambda: run_skill(skill_name, skill_args),
inputs={"args": skill_args},
workflow=workflow_name,
caller="workflow"
)
return result
```
### Agent Skill Invocation
For tracking skills invoked by agents:
```python
# In agent skill runner
from betty.telemetry_integration import track_skill_execution
def run_agent_skill(agent_name: str, skill_name: str, **kwargs):
"""Run a skill on behalf of an agent with telemetry."""
result = track_skill_execution(
skill_name=skill_name,
func=lambda: execute_skill(skill_name, **kwargs),
inputs=kwargs,
agent=agent_name,
caller="agent"
)
return result
```
## 3. CLI Helper Function
For simple CLI telemetry without decorators:
```python
from betty.telemetry_integration import capture_cli_telemetry
import time
def main():
start_time = time.time()
status = "failed"
try:
# Your CLI logic
process_command()
status = "success"
return 0
except Exception as e:
logger.error(f"Error: {e}")
return 1
finally:
duration_ms = int((time.time() - start_time) * 1000)
capture_cli_telemetry(
skill_name="my.skill",
inputs={"cli_args": sys.argv[1:]},
status=status,
duration_ms=duration_ms,
caller="cli"
)
```
## 4. Context-Rich Telemetry
### Full Context Example
Capture comprehensive context for deep analytics:
```python
entry = create_telemetry_entry(
skill_name="agent.define",
inputs={
"name": "my-agent",
"mode": "iterative",
"capabilities": ["code-gen", "testing"]
},
status="success",
duration_ms=2500,
agent="meta-orchestrator", # Which agent invoked this
workflow="agent-creation-pipeline", # Which workflow this is part of
caller="api", # How it was invoked
# Custom fields for advanced analytics
user_id="dev-123",
environment="staging",
version="1.0.0"
)
capture_telemetry(entry)
```
## 5. Batch Operations
For tracking multiple skill executions in batch:
```python
from skills.telemetry.capture.telemetry_capture import create_telemetry_entry, capture_telemetry
import time
def process_batch(items: list):
"""Process items in batch with per-item telemetry."""
for item in items:
start_time = time.time()
try:
process_item(item)
status = "success"
except Exception as e:
status = "failed"
duration_ms = int((time.time() - start_time) * 1000)
# Capture telemetry for each item
entry = create_telemetry_entry(
skill_name="batch.processor",
inputs={"item_id": item.id},
status=status,
duration_ms=duration_ms,
caller="batch"
)
capture_telemetry(entry)
```
## 6. Error Handling Best Practices
### Graceful Telemetry Failures
Always wrap telemetry in try-except to prevent failures from affecting main logic:
```python
def my_critical_function():
"""Critical function that must not fail due to telemetry issues."""
start_time = time.time()
result = None
try:
# Critical business logic
result = perform_critical_operation()
status = "success"
except Exception as e:
status = "failed"
raise # Re-raise the original error
finally:
# Capture telemetry (failures are logged but don't interrupt)
try:
duration_ms = int((time.time() - start_time) * 1000)
capture_cli_telemetry(
skill_name="critical.operation",
inputs={},
status=status,
duration_ms=duration_ms
)
except Exception as telemetry_error:
logger.warning(f"Telemetry capture failed: {telemetry_error}")
# Don't raise - telemetry is not critical
return result
```
## 7. Sensitive Data Protection
### Sanitize Inputs
Never capture sensitive data like passwords, tokens, or PII:
```python
def sanitize_inputs(inputs: dict) -> dict:
"""Remove sensitive fields from inputs before logging."""
sensitive_keys = ["password", "token", "api_key", "secret", "ssn", "credit_card"]
sanitized = {}
for key, value in inputs.items():
if any(sensitive in key.lower() for sensitive in sensitive_keys):
sanitized[key] = "***REDACTED***"
else:
sanitized[key] = value
return sanitized
# Use it
entry = create_telemetry_entry(
skill_name="auth.login",
inputs=sanitize_inputs({"username": "john", "password": "secret123"}),
status="success",
duration_ms=150
)
```
## 8. Testing with Telemetry
### Unit Test Example
```python
import unittest
from unittest.mock import patch, MagicMock
class TestMySkillWithTelemetry(unittest.TestCase):
@patch('betty.telemetry_integration.capture_cli_telemetry')
def test_skill_execution_captures_telemetry(self, mock_capture):
"""Test that telemetry is captured on skill execution."""
# Execute the skill
from skills.my.skill.my_skill import main
result = main(["arg1", "arg2"])
# Verify telemetry was captured
self.assertEqual(result, 0)
mock_capture.assert_called_once()
# Verify telemetry parameters
call_args = mock_capture.call_args
self.assertEqual(call_args.kwargs['skill_name'], "my.skill")
self.assertEqual(call_args.kwargs['status'], "success")
self.assertGreater(call_args.kwargs['duration_ms'], 0)
```
## 9. Migration Checklist
To add telemetry to an existing skill:
1. **Import the telemetry integration:**
```python
from betty.telemetry_integration import telemetry_tracked
```
2. **Apply the decorator to main():**
```python
@telemetry_tracked(skill_name="your.skill", caller="cli")
def main(argv: Optional[List[str]] = None) -> int:
```
3. **Ensure main() returns an int:**
- Return 0 for success
- Return non-zero for failure
4. **Test the integration:**
```bash
python3 skills/your.skill/your_skill.py test-args
cat registry/telemetry.json | jq '.[-1]' # View latest entry
```
## 10. Querying Telemetry Data
### Basic Queries
```bash
# Count total telemetry entries
jq 'length' registry/telemetry.json
# Find all failed executions
jq '.[] | select(.status == "failed")' registry/telemetry.json
# Get average duration for a skill
jq '[.[] | select(.skill == "workflow.validate") | .duration_ms] | add / length' registry/telemetry.json
# Top 10 slowest executions
jq 'sort_by(.duration_ms) | reverse | .[0:10]' registry/telemetry.json
# Executions by caller
jq 'group_by(.caller) | map({caller: .[0].caller, count: length})' registry/telemetry.json
```
### Advanced Analytics
```bash
# Skills executed in last hour
jq --arg cutoff "$(date -u -d '1 hour ago' +%Y-%m-%dT%H:%M:%S)" \
'.[] | select(.timestamp > $cutoff)' registry/telemetry.json
# Success rate by skill
jq 'group_by(.skill) | map({
skill: .[0].skill,
total: length,
successful: ([.[] | select(.status == "success")] | length),
success_rate: (([.[] | select(.status == "success")] | length) / length * 100)
})' registry/telemetry.json
```
## 11. Future: Prometheus Export
Example of how telemetry data could be exported to Prometheus (future implementation):
```python
# Future: skills/telemetry.capture/exporters/prometheus.py
from prometheus_client import Counter, Histogram, Gauge
skill_executions = Counter(
'betty_skill_executions_total',
'Total skill executions',
['skill', 'status', 'caller']
)
skill_duration = Histogram(
'betty_skill_duration_seconds',
'Skill execution duration',
['skill', 'caller']
)
def export_to_prometheus(telemetry_entry: dict):
"""Export telemetry entry to Prometheus metrics."""
skill_executions.labels(
skill=telemetry_entry['skill'],
status=telemetry_entry['status'],
caller=telemetry_entry.get('caller', 'unknown')
).inc()
skill_duration.labels(
skill=telemetry_entry['skill'],
caller=telemetry_entry.get('caller', 'unknown')
).observe(telemetry_entry['duration_ms'] / 1000.0)
```
## Summary
- **CLI Skills**: Use `@telemetry_tracked` decorator
- **Workflows**: Use `track_skill_execution()` helper
- **Custom Code**: Use `create_telemetry_entry()` + `capture_telemetry()`
- **Always**: Handle telemetry failures gracefully
- **Never**: Capture sensitive data in inputs
For more details, see the [SKILL.md](SKILL.md) documentation.

View File

@@ -0,0 +1,392 @@
# telemetry.capture
**Version:** 0.1.0
**Status:** Active
**Tags:** telemetry, logging, observability, audit
## Overview
The `telemetry.capture` skill provides comprehensive execution logging for Betty Framework components. It captures usage metrics, execution status, timing data, and contextual metadata in a structured, thread-safe manner.
All telemetry data is written to `/registry/telemetry.json` with ISO timestamps and validated JSON schema.
## Features
- Thread-safe JSON logging using file locking (fcntl)
- ISO 8601 timestamp formatting with timezone support
- Structured telemetry entries with validation
- Query interface for telemetry analysis
- Decorator pattern for automatic capture (`@capture_telemetry`)
- Context manager pattern for manual capture
- CLI and programmatic interfaces
- Input sanitization (exclude secrets)
## Purpose
This skill enables:
- **Observability**: Track execution patterns across Betty components
- **Performance Monitoring**: Measure duration of skill executions
- **Error Tracking**: Capture failures with detailed error messages
- **Usage Analytics**: Understand which skills are used most frequently
- **Audit Trail**: Maintain compliance and debugging history
- **Workflow Analysis**: Trace caller chains and dependencies
## Usage
### Basic CLI Usage
```bash
# Capture a successful execution
python skills/telemetry.capture/telemetry_capture.py plugin.build success 320 CLI
# Capture with inputs
python skills/telemetry.capture/telemetry_capture.py \
agent.run success 1500 API '{"agent": "api.designer", "task": "design_api"}'
# Capture a failure
python skills/telemetry.capture/telemetry_capture.py \
workflow.compose failure 2800 CLI '{"workflow": "api_first"}' "Validation failed at step 3"
```
### As a Decorator (Recommended)
```python
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent / "../telemetry.capture"))
from telemetry_utils import capture_telemetry
@capture_telemetry(skill_name="agent.run", caller="CLI", capture_inputs=True)
def run_agent(agent_path: str, task_context: str = None):
"""Execute a Betty agent."""
# ... implementation
return {"status": "success", "result": execution_result}
# Usage
result = run_agent("/agents/api.designer", "Design user authentication API")
# Telemetry is automatically captured
```
### As a Context Manager
```python
from telemetry_utils import TelemetryContext
def build_plugin(plugin_path: str):
with TelemetryContext(skill="plugin.build", caller="CLI") as ctx:
ctx.set_inputs({"plugin_path": plugin_path})
try:
# Perform build operations
result = create_plugin_archive(plugin_path)
ctx.set_status("success")
return result
except Exception as e:
ctx.set_error(str(e))
raise
```
### Programmatic API
```python
from telemetry_capture import TelemetryCapture
telemetry = TelemetryCapture()
# Capture an event
entry = telemetry.capture(
skill="plugin.build",
status="success",
duration_ms=320.5,
caller="CLI",
inputs={"plugin_path": "./plugin.yaml", "output_format": "tar.gz"},
metadata={"user": "developer@example.com", "environment": "production"}
)
print(f"Captured: {entry['timestamp']}")
```
### Query Telemetry Data
```python
from telemetry_capture import TelemetryCapture
telemetry = TelemetryCapture()
# Query recent failures
failures = telemetry.query(status="failure", limit=10)
# Query specific skill usage
agent_runs = telemetry.query(skill="agent.run", limit=50)
# Query by caller
cli_executions = telemetry.query(caller="CLI", limit=100)
for entry in failures:
print(f"{entry['timestamp']}: {entry['skill']} - {entry['error_message']}")
```
## Parameters
### Capture Parameters
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `skill` | string | Yes | Name of the skill/component (e.g., 'plugin.build') |
| `status` | string | Yes | Execution status: success, failure, timeout, error, pending |
| `duration_ms` | number | Yes | Execution duration in milliseconds |
| `caller` | string | Yes | Source of the call (CLI, API, workflow.compose) |
| `inputs` | object | No | Sanitized input parameters (default: {}) |
| `error_message` | string | No | Error message if status is failure/error |
| `metadata` | object | No | Additional context (user, session_id, environment) |
### Decorator Parameters
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `skill_name` | string | function name | Override skill name |
| `caller` | string | "runtime" | Caller identifier |
| `capture_inputs` | boolean | False | Whether to capture function arguments |
| `sanitize_keys` | list | None | Parameter names to redact (e.g., ['password']) |
## Output Format
### Telemetry Entry Structure
```json
{
"timestamp": "2025-10-24T14:30:45.123456+00:00",
"skill": "plugin.build",
"status": "success",
"duration_ms": 320.5,
"caller": "CLI",
"inputs": {
"plugin_path": "./plugin.yaml",
"output_format": "tar.gz"
},
"error_message": null,
"metadata": {
"user": "developer@example.com",
"environment": "production"
}
}
```
### Telemetry File Structure
```json
[
{
"timestamp": "2025-10-24T14:30:45.123456+00:00",
"skill": "plugin.build",
"status": "success",
"duration_ms": 320.5,
"caller": "CLI",
"inputs": {
"plugin_path": "./plugin.yaml",
"output_format": "tar.gz"
},
"error_message": null,
"metadata": {}
},
{
"timestamp": "2025-10-24T14:32:10.789012+00:00",
"skill": "agent.run",
"status": "success",
"duration_ms": 1500.0,
"caller": "API",
"inputs": {
"agent": "api.designer"
},
"error_message": null,
"metadata": {}
}
]
```
Note: The telemetry file is a simple JSON array for efficient querying and compatibility with existing Betty Framework tools.
## Examples
### Example 1: Capture Plugin Build
```bash
python skills/telemetry.capture/telemetry_capture.py \
plugin.build success 320 CLI '{"plugin_path": "./plugin.yaml", "output_format": "tar.gz"}'
```
**Output:**
```json
{
"timestamp": "2025-10-24T14:30:45.123456+00:00",
"skill": "plugin.build",
"status": "success",
"duration_ms": 320.0,
"caller": "CLI",
"inputs": {
"plugin_path": "./plugin.yaml",
"output_format": "tar.gz"
},
"error_message": null,
"metadata": {}
}
✓ Telemetry captured to /home/user/betty/registry/telemetry.json
```
### Example 2: Capture Agent Execution with Decorator
```python
# In skills/agent.run/agent_run.py
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent / "../telemetry.capture"))
from telemetry_utils import capture_telemetry
@capture_telemetry(
skill_name="agent.run",
caller="CLI",
capture_inputs=True,
sanitize_keys=["api_key", "password"]
)
def main():
"""Execute agent with automatic telemetry capture."""
# ... existing implementation
return {"status": "success", "result": result}
if __name__ == "__main__":
main()
```
### Example 3: Query Recent Failures
```python
from telemetry_capture import TelemetryCapture
telemetry = TelemetryCapture()
failures = telemetry.query(status="failure", limit=10)
print("Recent Failures:")
for entry in failures:
print(f" [{entry['timestamp']}] {entry['skill']}")
print(f" Error: {entry['error_message']}")
print(f" Duration: {entry['duration_ms']}ms")
print()
```
## Error Handling
### Invalid Status Value
```python
# Raises ValueError
telemetry.capture(
skill="test.skill",
status="invalid_status", # Must be: success, failure, timeout, error, pending
duration_ms=100,
caller="CLI"
)
```
**Error:** `ValueError: Invalid status: invalid_status. Must be one of: success, failure, timeout, error, pending`
### Malformed Input JSON (CLI)
```bash
python skills/telemetry.capture/telemetry_capture.py \
plugin.build success 320 CLI '{invalid json}'
```
**Error:** `Error: Invalid JSON for inputs: Expecting property name enclosed in double quotes`
### File Locking Contention
The implementation uses `fcntl.flock` for thread-safe writes. If multiple processes write simultaneously:
- Writes are serialized automatically
- No data loss occurs
- Performance may degrade under heavy contention
## Dependencies
This skill has no external dependencies beyond Python standard library:
- `json` - JSON parsing and serialization
- `fcntl` - File locking for thread safety
- `datetime` - ISO 8601 timestamp generation
- `pathlib` - Path handling
- `typing` - Type annotations
## Configuration
### Environment Variables
| Variable | Default | Description |
|----------|---------|-------------|
| `BETTY_TELEMETRY_FILE` | `/home/user/betty/registry/telemetry.json` | Path to telemetry file |
### Custom Telemetry File
```python
from telemetry_capture import TelemetryCapture
# Use custom location
telemetry = TelemetryCapture(telemetry_file="/custom/path/telemetry.json")
```
## Troubleshooting
### Q: Telemetry file doesn't exist
**A:** The skill automatically creates the telemetry file on first use. Ensure:
- The parent directory exists or can be created
- Write permissions are granted
- Path is absolute or correctly relative
### Q: Decorator not capturing telemetry
**A:** Ensure you:
1. Import the decorator correctly
2. Add the parent path to sys.path if needed
3. Check that the decorated function completes (doesn't hang)
4. Verify file permissions on `/registry/telemetry.json`
### Q: How to exclude sensitive data?
**A:** Use `sanitize_keys` parameter:
```python
@capture_telemetry(
skill_name="auth.login",
capture_inputs=True,
sanitize_keys=["password", "api_key", "secret_token"]
)
def login(username: str, password: str):
# password will be redacted as "***REDACTED***"
pass
```
### Q: Performance impact of telemetry?
**A:** Minimal impact:
- Decorator adds <1ms overhead per call
- File I/O is buffered and atomic
- No network calls
- Consider async writes for high-throughput scenarios
## Integration with Betty Framework
The `telemetry.capture` skill integrates with:
- **agent.run**: Logs agent executions with task context
- **workflow.compose**: Traces multi-step workflow chains
- **plugin.build**: Monitors build performance
- **api.define**: Tracks API creation events
- **skill.define**: Captures skill registration
- **audit.log**: Complements audit trail with performance metrics
All core Betty components should use the `@capture_telemetry` decorator for consistent observability.
## License
Part of the Betty Framework. See repository LICENSE.

View File

@@ -0,0 +1 @@
# Auto-generated package initializer for skills.

View File

@@ -0,0 +1,97 @@
# Skill manifest for telemetry.capture — registered in the Betty Framework
# skill registry. Declares the skill's I/O contract, CLI entrypoint, and
# required filesystem permissions.
name: telemetry.capture
version: 0.1.0
description: >
  Captures and logs usage telemetry for Betty Framework components.
  Provides thread-safe JSON logging to /registry/telemetry.json with
  ISO timestamps and structured metadata.
# Programmatic input contract (see telemetry_capture.TelemetryCapture.capture).
inputs:
  - name: skill
    type: string
    required: true
    description: Name of the skill/component being logged (e.g., 'plugin.build', 'agent.run')
  - name: status
    type: string
    required: true
    description: Execution status (success, failure, timeout, error, pending)
  - name: duration_ms
    type: number
    required: true
    description: Execution duration in milliseconds
  - name: caller
    type: string
    required: true
    description: Source of the call (e.g., CLI, API, workflow.compose)
  - name: inputs
    type: object
    required: false
    default: {}
    description: Sanitized input parameters (no secrets)
  - name: error_message
    type: string
    required: false
    description: Error message if status is failure/error
  - name: metadata
    type: object
    required: false
    default: {}
    description: Additional context (user, session_id, environment, etc.)
# Values returned to the caller after a successful capture.
outputs:
  - name: telemetry_entry
    type: object
    description: The captured telemetry entry with ISO timestamp
  - name: telemetry_file
    type: string
    description: Path to the telemetry.json file
dependencies: []
status: active
# CLI entrypoint; positional parameters mirror telemetry_capture.main().
entrypoints:
  - command: /telemetry/capture
    handler: telemetry_capture.py
    runtime: python
    description: Capture telemetry event for a Betty component
    parameters:
      - name: skill
        type: string
        required: true
        description: Skill name
      - name: status
        type: string
        required: true
        description: Execution status
      - name: duration_ms
        type: number
        required: true
        description: Duration in milliseconds
      - name: caller
        type: string
        required: true
        description: Caller identifier
      - name: inputs
        type: string
        required: false
        description: JSON string of inputs
      - name: error_message
        type: string
        required: false
        description: Error message
# Needed to create/append the telemetry.json registry file.
permissions:
  - filesystem:read
  - filesystem:write
tags:
  - telemetry
  - logging
  - observability
  - audit

View File

@@ -0,0 +1,206 @@
#!/usr/bin/env python3
"""
Betty Framework - Telemetry Capture Skill
Logs usage of core Betty components to /registry/telemetry.json
"""
import json
import os
import sys
import fcntl
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Optional
class TelemetryCapture:
    """Thread-safe telemetry capture for Betty Framework components.

    Entries are appended to a JSON-array telemetry file under an exclusive
    ``fcntl`` lock so that concurrent writers are serialized safely.
    """

    #: Fallback location when BETTY_TELEMETRY_FILE is not set.
    DEFAULT_TELEMETRY_FILE = "/home/user/betty/registry/telemetry.json"
    #: Accepted execution statuses.
    VALID_STATUSES = ('success', 'failure', 'timeout', 'error', 'pending')
    #: Safety cap — the file is rewritten on every append, so keep it bounded.
    MAX_ENTRIES = 100000

    def __init__(self, telemetry_file: Optional[str] = None):
        """Initialize capture, resolving the telemetry file location.

        Args:
            telemetry_file: Explicit path to the telemetry JSON file. When
                omitted, the ``BETTY_TELEMETRY_FILE`` environment variable is
                honored (as documented in SKILL.md), falling back to
                ``DEFAULT_TELEMETRY_FILE``.
        """
        if telemetry_file is None:
            telemetry_file = os.environ.get(
                "BETTY_TELEMETRY_FILE", self.DEFAULT_TELEMETRY_FILE
            )
        self.telemetry_file = Path(telemetry_file)
        self._ensure_telemetry_file()

    def _ensure_telemetry_file(self) -> None:
        """Ensure the telemetry file exists with a valid JSON list structure."""
        self.telemetry_file.parent.mkdir(parents=True, exist_ok=True)
        if not self.telemetry_file.exists():
            # Simple list format to match the existing telemetry.json layout.
            with open(self.telemetry_file, 'w') as f:
                json.dump([], f, indent=2)

    def capture(
        self,
        skill: str,
        status: str,
        duration_ms: float,
        caller: str,
        inputs: Optional[Dict[str, Any]] = None,
        error_message: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Capture a telemetry event and append it to telemetry.json (thread-safe).

        Args:
            skill: Name of the skill/component (e.g., 'plugin.build', 'agent.run')
            status: Execution status; one of ``VALID_STATUSES``
            duration_ms: Execution duration in milliseconds
            caller: Source of the call (e.g., 'CLI', 'API', 'workflow.compose')
            inputs: Input parameters (sanitized, no secrets)
            error_message: Error message if status is failure/error
            metadata: Additional context (e.g., user, session_id, environment)

        Returns:
            Dict containing the captured telemetry entry

        Raises:
            ValueError: If ``status`` is not a recognized value.
        """
        if status not in self.VALID_STATUSES:
            raise ValueError(f"Invalid status: {status}. Must be one of: success, failure, timeout, error, pending")

        entry = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "skill": skill,
            "status": status,
            "duration_ms": float(duration_ms),
            "caller": caller,
            "inputs": inputs or {},
            "error_message": error_message,
            "metadata": metadata or {}
        }

        self._append_entry(entry)
        return entry

    def _append_entry(self, entry: Dict[str, Any]) -> None:
        """
        Append an entry to the telemetry file with file locking for thread safety.

        Uses fcntl.flock for POSIX systems to ensure atomic read-modify-write.
        """
        with open(self.telemetry_file, 'r+') as f:
            # Acquire exclusive lock for the whole read-modify-write cycle.
            fcntl.flock(f.fileno(), fcntl.LOCK_EX)
            try:
                f.seek(0)
                try:
                    data = json.load(f)
                except json.JSONDecodeError:
                    # Corrupt or truncated file: start fresh rather than crash.
                    data = []
                if not isinstance(data, list):
                    data = []

                data.append(entry)
                # Keep only the most recent MAX_ENTRIES entries (safety limit).
                if len(data) > self.MAX_ENTRIES:
                    data = data[-self.MAX_ENTRIES:]

                # Write back (truncate and rewrite the whole array).
                f.seek(0)
                f.truncate()
                json.dump(data, f, indent=2)
                f.write('\n')  # Trailing newline for POSIX-friendly files
            finally:
                fcntl.flock(f.fileno(), fcntl.LOCK_UN)

    def query(
        self,
        skill: Optional[str] = None,
        status: Optional[str] = None,
        caller: Optional[str] = None,
        limit: Optional[int] = None
    ) -> list:
        """
        Query telemetry entries with optional filters.

        Args:
            skill: Filter by skill name
            status: Filter by status
            caller: Filter by caller
            limit: Maximum number of entries to return (most recent first)

        Returns:
            List of matching telemetry entries, newest first. Returns an empty
            list if the file is missing or unreadable as JSON.
        """
        try:
            with open(self.telemetry_file, 'r') as f:
                data = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            return []

        entries = data if isinstance(data, list) else []

        if skill:
            entries = [e for e in entries if e.get("skill") == skill]
        if status:
            entries = [e for e in entries if e.get("status") == status]
        if caller:
            entries = [e for e in entries if e.get("caller") == caller]

        # Sort by ISO timestamp (lexicographic == chronological), newest first.
        entries = sorted(entries, key=lambda e: e.get("timestamp", ""), reverse=True)
        if limit:
            entries = entries[:limit]
        return entries
def main():
    """CLI entrypoint: capture one telemetry event from positional arguments."""
    argv = sys.argv
    # Four positional arguments are mandatory: skill, status, duration, caller.
    if len(argv) < 5:
        print("Usage: python telemetry_capture.py <skill> <status> <duration_ms> <caller> [inputs_json]")
        print("\nExample:")
        print(' python telemetry_capture.py plugin.build success 320 CLI')
        print(' python telemetry_capture.py agent.run failure 1500 API \'{"agent": "api.designer"}\'')
        sys.exit(1)

    skill, status, raw_duration, caller = argv[1], argv[2], argv[3], argv[4]
    duration_ms = float(raw_duration)

    # Optional fifth argument: JSON-encoded inputs dict.
    inputs = {}
    if len(argv) > 5:
        try:
            inputs = json.loads(argv[5])
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON for inputs: {e}", file=sys.stderr)
            sys.exit(1)

    # Optional sixth argument: free-form error message.
    error_message = argv[6] if len(argv) > 6 else None

    try:
        telemetry = TelemetryCapture()
        entry = telemetry.capture(
            skill=skill,
            status=status,
            duration_ms=duration_ms,
            caller=caller,
            inputs=inputs,
            error_message=error_message
        )
        print(json.dumps(entry, indent=2))
        print(f"\n✓ Telemetry captured to {telemetry.telemetry_file}")
    except Exception as e:
        print(f"Error capturing telemetry: {e}", file=sys.stderr)
        sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,200 @@
#!/usr/bin/env python3
"""
Betty Framework - Telemetry Utilities
Decorator and utilities for automatic execution logging
"""
import functools
import time
import inspect
from typing import Any, Callable, Dict, Optional
from pathlib import Path
from .telemetry_capture import TelemetryCapture
def capture_telemetry(
    skill_name: Optional[str] = None,
    caller: str = "runtime",
    capture_inputs: bool = False,
    sanitize_keys: Optional[list] = None
):
    """
    Decorator to automatically capture telemetry for function execution.

    Usage:
        @capture_telemetry(skill_name="agent.run", caller="CLI", capture_inputs=True)
        def run_agent(agent_path, task_context=None):
            # ... implementation
            return result

    Args:
        skill_name: Name of the skill/component. If None, uses the qualified
            function name (module.function).
        caller: Source of the call (e.g., 'CLI', 'API', 'workflow')
        capture_inputs: Whether to capture function arguments (default: False)
        sanitize_keys: List of parameter names to redact (e.g., ['password', 'api_key'])

    The decorated function can:
        - Return a dict with 'status' key to set telemetry status
        - Raise exceptions (captured as 'error' status)
        - Return any value (captured as 'success' status)
    """
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            skill = skill_name or f"{func.__module__}.{func.__name__}"

            # Optionally snapshot call arguments, redacting sensitive keys.
            inputs = {}
            if capture_inputs:
                sig = inspect.signature(func)
                bound_args = sig.bind_partial(*args, **kwargs)
                bound_args.apply_defaults()
                inputs = dict(bound_args.arguments)
                if sanitize_keys:
                    for key in sanitize_keys:
                        if key in inputs:
                            inputs[key] = "***REDACTED***"

            start_time = time.time()
            status = "success"
            error_message = None
            result = None
            try:
                result = func(*args, **kwargs)
                # Let the function report its own status via a dict result.
                if isinstance(result, dict) and 'status' in result:
                    status = result['status']
                return result
            except Exception as e:
                status = "error"
                error_message = str(e)
                raise  # Re-raise the original exception
            finally:
                duration_ms = (time.time() - start_time) * 1000
                # Capture telemetry; failures here must never break the
                # wrapped function.
                try:
                    telemetry = TelemetryCapture()
                    telemetry.capture(
                        skill=skill,
                        status=status,
                        duration_ms=duration_ms,
                        caller=caller,
                        inputs=inputs,
                        error_message=error_message
                    )
                except Exception as telemetry_error:
                    # Bug fix: 'sys' was used without being imported in this
                    # module, so this handler itself raised NameError. Import
                    # locally to keep the warning path self-contained.
                    import sys
                    print(f"Warning: Telemetry capture failed: {telemetry_error}", file=sys.stderr)
        return wrapper
    return decorator
class TelemetryContext:
    """
    Context manager for capturing telemetry around code blocks.

    Usage:
        with TelemetryContext(skill="plugin.build", caller="CLI") as ctx:
            # ... do work
            ctx.set_inputs({"plugin_path": "./plugin.yaml"})
            result = build_plugin()
            ctx.set_status("success")
    """

    def __init__(
        self,
        skill: str,
        caller: str = "runtime",
        inputs: Optional[Dict[str, Any]] = None
    ):
        """
        Args:
            skill: Name of the skill/component being tracked.
            caller: Source of the call (e.g., 'CLI', 'API', 'workflow').
            inputs: Initial sanitized inputs; can be extended with set_inputs().
        """
        self.skill = skill
        self.caller = caller
        self.inputs = inputs or {}
        self.status = "success"        # may be overridden via set_status/set_error
        self.error_message = None
        self.start_time = None
        self.telemetry = TelemetryCapture()

    def __enter__(self):
        # Start the wall-clock timer used to compute duration_ms.
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        duration_ms = (time.time() - self.start_time) * 1000

        # An exception escaping the block overrides any manually-set status.
        if exc_type is not None:
            self.status = "error"
            self.error_message = str(exc_val)

        # Capture telemetry; failures are warned about, never raised.
        try:
            self.telemetry.capture(
                skill=self.skill,
                status=self.status,
                duration_ms=duration_ms,
                caller=self.caller,
                inputs=self.inputs,
                error_message=self.error_message
            )
        except Exception as telemetry_error:
            # Bug fix: 'sys' is not imported at module level; without this
            # local import the warning path raised NameError instead.
            import sys
            print(f"Warning: Telemetry capture failed: {telemetry_error}", file=sys.stderr)

        # Don't suppress exceptions from the managed block.
        return False

    def set_status(self, status: str):
        """Update the execution status."""
        self.status = status

    def set_inputs(self, inputs: Dict[str, Any]):
        """Merge additional captured inputs."""
        self.inputs.update(inputs)

    def set_error(self, error_message: str):
        """Record an error message and mark the execution as failed."""
        self.error_message = error_message
        self.status = "error"
# Example usage for testing
# Running this module directly exercises both capture paths (decorator and
# context manager); each run appends entries to the telemetry file.
if __name__ == "__main__":
    # Test decorator
    @capture_telemetry(skill_name="test.function", caller="CLI", capture_inputs=True)
    def test_function(name: str, value: int = 10):
        """Test function for telemetry capture."""
        print(f"Executing test_function with name={name}, value={value}")
        time.sleep(0.1)  # Simulate work
        # Dict with a 'status' key: the decorator records it as the telemetry status.
        return {"status": "success", "result": value * 2}

    # Test context manager
    def test_context():
        """Test context manager for telemetry capture."""
        with TelemetryContext(skill="test.context", caller="CLI") as ctx:
            ctx.set_inputs({"operation": "test"})
            print("Executing within context")
            time.sleep(0.05)  # Simulate work
            ctx.set_status("success")

    print("Testing decorator...")
    result = test_function("example", value=42)
    print(f"Result: {result}\n")

    print("Testing context manager...")
    test_context()

    print("\nTelemetry tests complete!")