Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 18:51:24 +08:00
commit af30225584
13 changed files with 2319 additions and 0 deletions

135
agent/.env.example Normal file
View File

@@ -0,0 +1,135 @@
# ADK Orchestrator Environment Configuration
# Copy to .env and fill in actual values
# Google Cloud Configuration
PROJECT_ID=your-project-id
LOCATION=us-central1
REGION=us-central1
# Vertex AI Configuration
VERTEX_AI_ENDPOINT=https://us-central1-aiplatform.googleapis.com
VERTEX_AI_API_VERSION=v1
# Agent Engine Configuration
AGENT_ENGINE_ENDPOINT=https://agent-engine.googleapis.com
AGENT_REGISTRY_URL=https://agent-engine.googleapis.com/v1/agents
# Memory Bank Configuration
MEMORY_BANK_CORPUS=adk-orchestrator-memory
MEMORY_BANK_TTL_DAYS=14
# Session Service Configuration
SESSION_SERVICE_TYPE=vertex-ai
SESSION_TTL_DAYS=30
SESSION_AUTO_SAVE=true
# Model Configuration
MODEL_NAME=models/gemini-2.0-flash-exp
MODEL_TEMPERATURE=0.7
MODEL_MAX_TOKENS=8192
MODEL_TOP_P=0.95
MODEL_TOP_K=40
# A2A Protocol Configuration
A2A_ENABLED=true
A2A_DISCOVERY_REFRESH_MINUTES=5
A2A_MAX_CONCURRENT_INVOCATIONS=50
A2A_DEFAULT_TIMEOUT_SECONDS=30
# Monitoring & Observability
ENABLE_TRACING=true
ENABLE_METRICS=true
ENABLE_LOGGING=true
LOG_LEVEL=INFO
TRACE_SAMPLING_RATE=0.1
# Security
ENABLE_AUTH=true
AUTH_TYPE=oauth2
OAUTH_CLIENT_ID=your-client-id
OAUTH_CLIENT_SECRET=your-client-secret
API_KEY=your-api-key
# Rate Limiting
RATE_LIMIT_ENABLED=true
RATE_LIMIT_RPM=1000
RATE_LIMIT_BURST=100
# Circuit Breaker
CIRCUIT_BREAKER_ENABLED=true
CIRCUIT_BREAKER_THRESHOLD=5
CIRCUIT_BREAKER_TIMEOUT=30
CIRCUIT_BREAKER_RESET_TIMEOUT=60
# Performance Tuning
MAX_WORKERS=10
CONNECTION_POOL_SIZE=20
REQUEST_TIMEOUT=300
KEEPALIVE_TIMEOUT=600
# Development Settings
DEBUG=false
DEVELOPMENT_MODE=false
HOT_RELOAD=false
# Testing Configuration
TEST_MODE=false
TEST_PROJECT_ID=test-project
TEST_MOCK_AGENTS=false
# Deployment Configuration
DEPLOYMENT_ENV=production
DEPLOYMENT_VERSION=1.0.0
DEPLOYMENT_REVISION=1
# Service Account (for local development)
GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account-key.json
# Additional GCP Services
ENABLE_CLOUD_STORAGE=true
STORAGE_BUCKET=adk-orchestrator-storage
ENABLE_BIGQUERY=true
BIGQUERY_DATASET=adk_orchestrator
# Compliance
COMPLIANCE_MODE=R5
DATA_RETENTION_DAYS=14
ENABLE_AUDIT_LOG=true
AUDIT_LOG_BUCKET=adk-orchestrator-audit
# Feature Flags
ENABLE_PARALLEL_WORKFLOWS=true
ENABLE_LOOP_WORKFLOWS=true
ENABLE_TEAM_CREATION=true
ENABLE_AUTO_RECOVERY=true
ENABLE_HEALTH_MONITORING=true
# External Integrations
SLACK_WEBHOOK_URL=https://hooks.slack.com/services/YOUR/WEBHOOK/URL
PAGERDUTY_API_KEY=your-pagerduty-key
DATADOG_API_KEY=your-datadog-key
# Cache Configuration
CACHE_ENABLED=true
CACHE_TTL_SECONDS=3600
CACHE_MAX_SIZE_MB=512
# Database (if using Cloud SQL)
DATABASE_ENABLED=false
DATABASE_HOST=
DATABASE_PORT=5432
DATABASE_NAME=adk_orchestrator
DATABASE_USER=
DATABASE_PASSWORD=
# Redis (if using Memorystore)
REDIS_ENABLED=false
REDIS_HOST=
REDIS_PORT=6379
REDIS_PASSWORD=
# Custom Settings
CUSTOM_AGENT_TIMEOUT_MULTIPLIER=1.5
CUSTOM_RETRY_ATTEMPTS=3
CUSTOM_BACKOFF_BASE=2
CUSTOM_MAX_BACKOFF=60

12
agent/__init__.py Normal file
View File

@@ -0,0 +1,12 @@
# Copyright 2025 Jeremy Longshore
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
"""ADK Orchestrator Agent - A2A protocol manager for Vertex AI Engine"""
from .agent import get_agent, create_runner, root_agent
__all__ = ["get_agent", "create_runner", "root_agent"]
__version__ = "1.0.0"

147
agent/agent.py Normal file
View File

@@ -0,0 +1,147 @@
# Copyright 2025 Jeremy Longshore
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ADK Orchestrator Agent - Production-ready A2A protocol manager for Vertex AI Engine"""
from typing import Optional, List, Dict, Any
from google.adk.agents import LlmAgent
from google.adk.tools import FunctionTool
from google.adk.runners import Runner
from google.adk.sessions import VertexAiSessionService
from google.adk.memory import VertexAiMemoryBankService
from google.adk.artifacts import InMemoryArtifactService
from google.adk.auth.credential_service import InMemoryCredentialService
from .tools import (
discover_agents,
invoke_agent,
manage_agent_session,
validate_agent_card,
deploy_to_vertex_engine,
monitor_agent_health,
create_agent_team,
coordinate_workflow
)
def get_agent() -> LlmAgent:
    """Returns the ADK Orchestrator agent configured for A2A protocol management.

    This agent specializes in:
    - Agent discovery via AgentCards
    - A2A protocol implementation
    - Multi-agent coordination
    - Vertex AI Engine deployment
    - Session and memory management
    - Production monitoring

    The system prompt is read from ``system-prompt.md`` located next to this
    module. (The previous CWD-relative ``open('system-prompt.md')`` failed
    whenever the process was launched from another directory, e.g. under the
    ADK CLI or inside a deployed container — and this runs at import time via
    ``root_agent = get_agent()``.)
    """
    from pathlib import Path

    # Resolve relative to this file, not the CWD, and pin the encoding so the
    # read does not depend on the platform's default codec.
    prompt_path = Path(__file__).resolve().parent / "system-prompt.md"
    system_instruction = prompt_path.read_text(encoding="utf-8")
    return LlmAgent(
        name="adk-orchestrator",
        model="models/gemini-2.0-flash-exp",  # Latest Gemini for orchestration
        description="Production ADK orchestrator for A2A protocol and multi-agent coordination",
        instruction=system_instruction,
        tools=[
            # Agent Discovery & Management
            FunctionTool(discover_agents),
            FunctionTool(invoke_agent),
            FunctionTool(validate_agent_card),
            # Session & Memory Management
            FunctionTool(manage_agent_session),
            # Deployment & Operations
            FunctionTool(deploy_to_vertex_engine),
            FunctionTool(monitor_agent_health),
            # Multi-Agent Coordination
            FunctionTool(create_agent_team),
            FunctionTool(coordinate_workflow),
        ],
        # Enable features for production
        enable_parallel_tool_calls=True,
        enable_code_execution=True,
        context_window_size=2_000_000,  # 2M token context for Gemini 2.0
        output_key="orchestration_result",
        metadata={
            "version": "1.0.0",
            "deployment_target": "vertex-ai-engine",
            "capabilities": ["a2a", "multi-agent", "session-management", "monitoring"],
            "compliance": "R5-ready"
        }
    )
async def create_runner() -> Runner:
    """Creates a production-ready runner with dual memory (Session + Memory Bank).

    This configuration provides:
    - VertexAiSessionService for conversation state
    - VertexAiMemoryBankService for long-term memory (14-day TTL)
    - Auto-save callback for R5 compliance
    - Proper resource management

    Configuration is read from the environment variables documented in
    ``.env.example`` (PROJECT_ID, LOCATION, SESSION_TTL_DAYS,
    MEMORY_BANK_CORPUS, MEMORY_BANK_TTL_DAYS). The previous version
    hard-coded ``"your-project-id"`` placeholders, so every deployment
    pointed at a nonexistent project. Defaults preserve the old values.
    """
    import os

    project_id = os.environ.get("PROJECT_ID", "your-project-id")
    location = os.environ.get("LOCATION", "us-central1")
    # Initialize services
    session_service = VertexAiSessionService(
        project_id=project_id,
        location=location,
        session_ttl_days=int(os.environ.get("SESSION_TTL_DAYS", "30"))
    )
    memory_service = VertexAiMemoryBankService(
        project_id=project_id,
        location=location,
        corpus_name=os.environ.get("MEMORY_BANK_CORPUS", "adk-orchestrator-memory"),
        ttl_days=int(os.environ.get("MEMORY_BANK_TTL_DAYS", "14"))  # R5 compliance
    )
    # Create runner with production configuration
    return Runner(
        app_name="adk-orchestrator",
        agent=get_agent(),
        session_service=session_service,
        memory_service=memory_service,
        artifact_service=InMemoryArtifactService(),
        credential_service=InMemoryCredentialService(),
        # Auto-save session to memory for R5 compliance
        callbacks={
            "after_session": auto_save_session_to_memory
        }
    )
async def auto_save_session_to_memory(session, memory_service):
    """Persist a finished session into the long-term memory bank.

    Registered as the ``after_session`` callback on the runner. Saving every
    interaction to the memory bank is what provides the R5 compliance
    guarantee. Silently skips when either the session or the memory service
    is missing/falsy.
    """
    # Guard clause: nothing to persist, or nowhere to persist it.
    if not session or not memory_service:
        return
    await memory_service.save_session(
        session_id=session.id,
        session_data=session.to_dict(),
        metadata={
            "timestamp": session.updated_at,
            "agent": "adk-orchestrator",
            "compliance": "R5"
        }
    )
# Export for ADK CLI
root_agent = get_agent()

179
agent/agent_card.yaml Normal file
View File

@@ -0,0 +1,179 @@
# Agent Card for A2A Protocol Discovery
# Compliant with A2A Specification v1.0
name: adk-orchestrator
description: |
Production-grade ADK orchestrator specializing in Agent-to-Agent (A2A) protocol
management, multi-agent coordination, and Vertex AI Engine deployment. Provides
comprehensive agent discovery, invocation, session management, and workflow
orchestration capabilities.
version: 1.0.0
url: https://agent-engine.googleapis.com/v1/agents/adk-orchestrator
# Agent Provider Information
provider:
name: Jeremy Longshore
organization: Claude Code Plugins
contact: jeremy@example.com
# Agent Capabilities
capabilities:
# Core Capabilities
a2a_protocol: true
multi_agent_coordination: true
session_management: true
memory_bank: true
vertex_deployment: true
# Workflow Patterns
sequential_workflows: true
parallel_workflows: true
loop_workflows: true
# Advanced Features
agent_discovery: true
health_monitoring: true
auto_recovery: true
compliance: ["R5"]
# Performance
max_concurrent_agents: 100
max_workflow_depth: 10
timeout_seconds: 300
# Agent Skills (What this agent can do)
skills:
- id: discover-agents
name: Agent Discovery
description: Discovers and validates available agents via A2A protocol
input_modes: ["application/json"]
output_modes: ["application/json"]
tags: ["discovery", "a2a", "registry"]
- id: invoke-agent
name: Agent Invocation
description: Invokes specific agents with proper A2A protocol handling
input_modes: ["application/json"]
output_modes: ["application/json"]
tags: ["invocation", "a2a", "execution"]
- id: manage-session
name: Session Management
description: Creates and manages stateful agent sessions
input_modes: ["application/json"]
output_modes: ["application/json"]
tags: ["session", "state", "memory"]
- id: validate-card
name: Card Validation
description: Validates agent cards against A2A specification
input_modes: ["application/json"]
output_modes: ["application/json"]
tags: ["validation", "a2a", "compliance"]
- id: deploy-vertex
name: Vertex Deployment
description: Deploys agents to Vertex AI Engine
input_modes: ["application/json"]
output_modes: ["application/json"]
tags: ["deployment", "vertex", "gcp"]
- id: monitor-health
name: Health Monitoring
description: Monitors agent health and performance metrics
input_modes: ["application/json"]
output_modes: ["application/json"]
tags: ["monitoring", "health", "metrics"]
- id: create-team
name: Team Creation
description: Creates coordinated agent teams for complex tasks
input_modes: ["application/json"]
output_modes: ["application/json"]
tags: ["team", "coordination", "multi-agent"]
- id: coordinate-workflow
name: Workflow Coordination
description: Orchestrates multi-agent workflows (sequential, parallel, loop)
input_modes: ["application/json"]
output_modes: ["application/json"]
tags: ["workflow", "orchestration", "patterns"]
# Input/Output Configuration
default_input_modes: ["application/json", "text/plain"]
default_output_modes: ["application/json", "text/plain"]
# Security Configuration
security_schemes:
bearer:
type: http
scheme: bearer
description: OAuth 2.0 Bearer Token
api_key:
type: apiKey
in: header
name: X-API-Key
description: API Key authentication
# Authentication Support
supports_authenticated_extended_card: true
# Operational Configuration
operational:
# Endpoints
health_endpoint: /health
metrics_endpoint: /metrics
# Rate Limiting
rate_limit:
requests_per_minute: 1000
burst_size: 100
# Timeouts
default_timeout_seconds: 30
max_timeout_seconds: 300
# Resource Limits
max_memory_mb: 2048
max_cpu_cores: 4
# Compliance & Standards
compliance:
- standard: R5
description: "Compliant with R5 data retention and session management"
- standard: SOC2
description: "Follows SOC2 security and availability principles"
- standard: GDPR
description: "GDPR-compliant data handling and privacy"
# Dependencies (Other agents this agent may invoke)
dependencies:
- name: "*"
description: "Can discover and invoke any A2A-compliant agent"
optional: true
# Monitoring & Observability
observability:
tracing: true
metrics: true
logging: true
trace_endpoint: https://trace.googleapis.com
metrics_endpoint: https://monitoring.googleapis.com
log_endpoint: https://logging.googleapis.com
# Metadata
metadata:
created_at: "2025-11-19T00:00:00Z"
updated_at: "2025-11-19T00:00:00Z"
documentation: https://github.com/jeremylongshore/claude-code-plugins
repository: https://github.com/jeremylongshore/claude-code-plugins
license: Apache-2.0
tags:
- adk
- orchestrator
- a2a
- vertex-ai
- multi-agent
- production

220
agent/deploy.yaml Normal file
View File

@@ -0,0 +1,220 @@
# Vertex AI Engine Deployment Configuration
# For production deployment of ADK Orchestrator
apiVersion: agents.vertex.ai/v1
kind: AgentDeployment
metadata:
name: adk-orchestrator
labels:
app: adk-orchestrator
version: "1.0.0"
environment: production
team: platform
spec:
# Agent Configuration
agent:
source: ./
entrypoint: agent.root_agent
runtime: python310
# Resource Configuration
resources:
cpu: 4
memory: 8Gi
gpu: 0 # No GPU required for orchestration
# Scaling Configuration
replicas:
min: 2
max: 10
target_cpu_utilization: 70
# Environment Variables
env:
- name: PROJECT_ID
valueFrom:
configMapRef:
name: gcp-config
key: project_id
- name: LOCATION
value: us-central1
- name: LOG_LEVEL
value: INFO
- name: ENABLE_TRACING
value: "true"
- name: ENABLE_METRICS
value: "true"
# Service Configuration
service:
type: LoadBalancer
port: 8080
targetPort: 8080
annotations:
cloud.google.com/neg: '{"ingress": true}'
cloud.google.com/backend-config: '{"ports": {"8080":"adk-orchestrator-backend"}}'
# Health Checks
healthCheck:
path: /health
port: 8080
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
successThreshold: 1
failureThreshold: 3
# Liveness Probe
livenessProbe:
httpGet:
path: /health/live
port: 8080
initialDelaySeconds: 45
periodSeconds: 10
# Readiness Probe
readinessProbe:
httpGet:
path: /health/ready
port: 8080
initialDelaySeconds: 30
periodSeconds: 5
# Network Policy
networkPolicy:
ingress:
- from:
- namespaceSelector:
matchLabels:
name: agent-engine
ports:
- protocol: TCP
port: 8080
# Security Context
securityContext:
runAsNonRoot: true
runAsUser: 1000
fsGroup: 1000
allowPrivilegeEscalation: false
# Service Account
serviceAccount:
name: adk-orchestrator-sa
annotations:
iam.gke.io/gcp-service-account: adk-orchestrator@PROJECT_ID.iam.gserviceaccount.com
# Monitoring
monitoring:
enabled: true
prometheus:
enabled: true
port: 9090
stackdriver:
enabled: true
projectId: ${PROJECT_ID}
# Logging
logging:
level: INFO
format: json
stackdriver:
enabled: true
# Tracing
tracing:
enabled: true
sampling_rate: 0.1
exporter: stackdriver
# Secrets
secrets:
- name: api-keys
mountPath: /etc/secrets/api-keys
- name: oauth-config
mountPath: /etc/secrets/oauth
# Config Maps
configMaps:
- name: agent-config
mountPath: /etc/config/agent
- name: a2a-registry
mountPath: /etc/config/registry
# Persistent Storage (for session data)
storage:
- name: session-data
size: 10Gi
storageClass: standard-rwo
mountPath: /data/sessions
# Memory Bank Configuration
memoryBank:
enabled: true
corpus: adk-orchestrator-memory
ttl_days: 14
search_index: semantic
# Session Service Configuration
sessionService:
type: vertex-ai
ttl_days: 30
auto_save: true
# A2A Configuration
a2a:
enabled: true
discovery_endpoint: https://agent-engine.googleapis.com/v1/agents
registry_refresh_minutes: 5
max_concurrent_invocations: 50
# Circuit Breaker
circuitBreaker:
enabled: true
threshold: 5
timeout: 30s
resetTimeout: 60s
# Rate Limiting
rateLimiting:
enabled: true
requests_per_minute: 1000
burst_size: 100
# Deployment Strategy
strategy:
type: RollingUpdate
maxSurge: 1
maxUnavailable: 0
# PodDisruptionBudget
podDisruptionBudget:
minAvailable: 1
# Annotations
annotations:
deployment.vertex.ai/revision: "1"
deployment.vertex.ai/managed-by: "adk-cli"
---
# Backend Configuration for Cloud Load Balancer
apiVersion: cloud.google.com/v1
kind: BackendConfig
metadata:
name: adk-orchestrator-backend
spec:
timeoutSec: 300
connectionDraining:
drainingTimeoutSec: 60
healthCheck:
checkIntervalSec: 10
timeoutSec: 5
healthyThreshold: 1
unhealthyThreshold: 3
type: HTTP
requestPath: /health
port: 8080
sessionAffinity:
affinityType: "CLIENT_IP"
affinityCookieTtlSec: 3600

33
agent/requirements.txt Normal file
View File

@@ -0,0 +1,33 @@
# ADK Orchestrator Agent Dependencies
# Python 3.10+ required
# Core ADK SDK
google-adk>=1.15.1
# Google Cloud dependencies
google-cloud-aiplatform>=1.73.0
google-cloud-logging>=3.11.0
google-cloud-monitoring>=2.15.0
google-cloud-trace>=1.13.0
# A2A Protocol support
a2a>=0.1.0 # When available
# Async HTTP client
httpx>=0.27.0
# Data validation
pydantic>=2.9.0
# Utilities
python-dotenv>=1.0.1
structlog>=24.4.0
tenacity>=9.0.0 # For retry logic
# Development dependencies
pytest>=8.3.0
pytest-asyncio>=0.24.0
pytest-cov>=6.0.0
black>=24.10.0
isort>=5.13.0
mypy>=1.11.0

194
agent/system-prompt.md Normal file
View File

@@ -0,0 +1,194 @@
# ADK Orchestrator System Prompt
You are the ADK Orchestrator, a production-grade agent specializing in Agent-to-Agent (A2A) protocol management and multi-agent coordination for Vertex AI Engine deployments.
## Core Responsibilities
### 1. Agent Discovery & Registration
- Discover available agents via AgentCard protocol
- Validate agent capabilities and compatibility
- Maintain registry of active agents
- Monitor agent health and availability
### 2. A2A Protocol Management
- Implement full A2A protocol specification
- Handle agent-to-agent communication
- Manage authentication and authorization
- Coordinate request/response patterns
### 3. Multi-Agent Orchestration
- Create and manage agent teams
- Coordinate Sequential workflows
- Coordinate Parallel workflows
- Implement Loop patterns for iterative tasks
- Handle error recovery and retries
### 4. Session & Memory Management
- Manage agent sessions with VertexAiSessionService
- Persist context in VertexAiMemoryBankService
- Implement auto-save for R5 compliance
- Handle session recovery and migration
### 5. Vertex AI Engine Deployment
- Deploy agents to Vertex AI Engine
- Configure scaling and resource allocation
- Set up monitoring and alerting
- Manage production rollouts
## Operational Guidelines
### Discovery Process
When discovering agents:
1. Check for AgentCard at standard endpoints
2. Validate card schema and required fields
3. Test agent connectivity and response
4. Register in active agent pool
5. Set up health monitoring
### Invocation Protocol
When invoking agents:
1. Validate request against agent capabilities
2. Set up session context
3. Handle authentication if required
4. Execute request with timeout
5. Process response and handle errors
6. Update session state
### Coordination Patterns
#### Sequential Workflow
```
Agent A → Agent B → Agent C
Each agent completes before next starts
```
#### Parallel Workflow
```
        → Agent A →
Start   → Agent B → Merge
        → Agent C →
All agents run simultaneously
```
#### Loop Workflow
```
Start → Agent A → Condition → (repeat or exit)
Iterate until condition met
```
### Memory Management
- Save sessions after each interaction
- Index by agent, timestamp, and task
- Implement 14-day TTL for compliance
- Enable semantic search across memories
- Support memory-based agent selection
### Production Standards
- All operations must be idempotent
- Implement circuit breakers for failing agents
- Log all interactions for audit trail
- Monitor latency and error rates
- Support graceful degradation
## Error Handling
### Agent Failures
- Retry with exponential backoff (max 3 attempts)
- Fall back to alternative agents if available
- Log failure details for debugging
- Alert on repeated failures
### Network Issues
- Implement request timeout (30s default)
- Handle partial responses
- Queue requests during outages
- Provide status updates to users
### Data Validation
- Validate all inputs and outputs
- Sanitize data before passing between agents
- Check response schemas
- Handle malformed responses gracefully
## Security Requirements
### Authentication
- Validate agent credentials
- Implement OAuth 2.0 flows
- Support service account authentication
- Manage token refresh
### Authorization
- Check agent permissions
- Implement role-based access
- Audit all authorization decisions
- Support policy-based controls
### Data Protection
- Encrypt sensitive data in transit
- Implement PII detection and masking
- Support data residency requirements
- Enable audit logging
## Performance Targets
- Agent discovery: < 1 second
- Agent invocation: < 5 seconds (excluding agent processing)
- Session save: < 500ms
- Memory search: < 1 second
- Health check: < 100ms
## Monitoring & Alerting
Track and alert on:
- Agent availability (< 99.9% triggers alert)
- Response times (p99 > 10s triggers alert)
- Error rates (> 1% triggers alert)
- Memory usage (> 80% triggers alert)
- Session failures (any failure triggers alert)
## Compliance Requirements
### R5 Compliance
- Auto-save all sessions to memory
- Maintain 14-day data retention
- Implement proper data deletion
- Support compliance audits
### Logging Standards
- Structured JSON logging
- Include correlation IDs
- Log at appropriate levels
- Support log aggregation
## Best Practices
1. **Always validate before invoking** - Check agent capabilities match request
2. **Use appropriate coordination pattern** - Sequential for dependent, Parallel for independent
3. **Implement proper error handling** - Never fail silently
4. **Monitor continuously** - Track all metrics in production
5. **Document decisions** - Log why specific agents or patterns were chosen
6. **Optimize for latency** - Cache agent cards, reuse sessions
7. **Plan for scale** - Design for 1000+ agent invocations per minute
## Response Format
When responding to orchestration requests, always provide:
```json
{
"status": "success|partial|failure",
"agents_invoked": ["agent1", "agent2"],
"coordination_pattern": "sequential|parallel|loop",
"results": {
"agent1": { ... },
"agent2": { ... }
},
"session_id": "uuid",
"memory_saved": true,
"latency_ms": 1234,
"errors": []
}
```
Remember: You are the conductor of the agent orchestra. Ensure harmony, handle discord, and deliver a perfect performance every time.

513
agent/tools.py Normal file
View File

@@ -0,0 +1,513 @@
# Copyright 2025 Jeremy Longshore
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
"""Tools for ADK Orchestrator Agent - A2A protocol and coordination functions"""
import asyncio
import json
from typing import Dict, List, Optional, Any, Union
from datetime import datetime
import httpx
from pydantic import BaseModel, Field
# Pydantic models for structured data
class AgentCard(BaseModel):
    """Agent Card for A2A protocol discovery.

    Local mirror of the card schema; used by ``validate_agent_card`` to
    validate fetched card payloads. Only name/description/url are required —
    everything else defaults.
    """
    name: str
    description: str
    url: str
    version: str = "1.0.0"
    # Mutable defaults go through default_factory so instances don't share state.
    capabilities: Dict[str, Any] = Field(default_factory=dict)
    skills: List[Dict[str, Any]] = Field(default_factory=list)
    input_modes: List[str] = Field(default_factory=lambda: ["text/plain"])
    output_modes: List[str] = Field(default_factory=lambda: ["text/plain"])
class AgentInvocation(BaseModel):
    """Request structure for agent invocation.

    Consumed by ``invoke_agent`` and built internally by
    ``coordinate_workflow`` for each step.
    """
    agent_name: str  # Target agent to call
    input_data: Dict[str, Any]  # Payload forwarded as A2A "input" params
    timeout_seconds: int = 30  # Per-call timeout budget
    session_id: Optional[str] = None  # Existing session to attach, if any
    auth_token: Optional[str] = None  # Bearer token for the A2A request
class WorkflowConfig(BaseModel):
    """Configuration for multi-agent workflows.

    Consumed by ``coordinate_workflow``; ``pattern`` selects the execution
    strategy, ``error_strategy`` controls how sequential failures are handled.
    """
    pattern: str  # "sequential", "parallel", "loop"
    agents: List[str]  # Agent names, invoked in list order
    max_iterations: int = 10  # Upper bound for the "loop" pattern only
    timeout_seconds: int = 300  # NOTE(review): not yet read by coordinate_workflow
    error_strategy: str = "fail_fast"  # or "continue_on_error"
# Tool Functions
async def discover_agents(
    registry_url: Optional[str] = None,
    filter_capabilities: Optional[List[str]] = None
) -> Dict[str, Any]:
    """Discovers available agents via A2A protocol.

    Args:
        registry_url: Optional registry endpoint (defaults to the Vertex AI
            Engine agent registry).
        filter_capabilities: Optional list of capabilities; an agent is kept
            when it advertises at least one of them.

    Returns:
        Dict with ``status``, ``discovered_count``, the matching ``agents``,
        the ``registry`` queried, and a ``timestamp``. Never raises — errors
        come back as ``status == "error"`` with an empty agent list.
    """
    try:
        # Default to Vertex AI Engine agent registry
        if not registry_url:
            registry_url = "https://agent-engine.googleapis.com/v1/agents"
        # In production this issues an HTTP request to the registry, e.g.:
        #   async with httpx.AsyncClient() as client:
        #       response = await client.get(registry_url)
        #       example_agents = response.json()
        # Until then a static inventory stands in for discovery. (The previous
        # version opened an AsyncClient it never used, paying connection-pool
        # setup/teardown on every call for nothing.)
        example_agents = [
            {
                "name": "data-analyst",
                "description": "Analyzes data and generates insights",
                "url": "https://agent-engine.googleapis.com/v1/agents/data-analyst",
                "capabilities": ["sql", "visualization", "statistics"],
            },
            {
                "name": "code-generator",
                "description": "Generates code in multiple languages",
                "url": "https://agent-engine.googleapis.com/v1/agents/code-generator",
                "capabilities": ["python", "javascript", "sql"],
            }
        ]
        # Keep agents advertising at least one requested capability;
        # with no filter, keep everything.
        if filter_capabilities:
            discovered_agents = [
                agent for agent in example_agents
                if any(cap in agent.get("capabilities", []) for cap in filter_capabilities)
            ]
        else:
            discovered_agents = list(example_agents)
        return {
            "status": "success",
            "discovered_count": len(discovered_agents),
            "agents": discovered_agents,
            "registry": registry_url,
            "timestamp": datetime.utcnow().isoformat()
        }
    except Exception as e:
        return {
            "status": "error",
            "error": str(e),
            "discovered_count": 0,
            "agents": []
        }
async def invoke_agent(
invocation: AgentInvocation,
retry_count: int = 3
) -> Dict[str, Any]:
"""Invokes a specific agent via A2A protocol.
Args:
invocation: Agent invocation configuration
retry_count: Number of retry attempts
Returns:
Agent response including results and metadata
"""
try:
# Construct A2A request
a2a_request = {
"jsonrpc": "2.0",
"method": "agent.invoke",
"params": {
"input": invocation.input_data,
"session_id": invocation.session_id
},
"id": f"req-{datetime.utcnow().timestamp()}"
}
# In production, make actual A2A protocol request
async with httpx.AsyncClient() as client:
# response = await client.post(
# f"{agent_url}/a2a",
# json=a2a_request,
# timeout=invocation.timeout_seconds,
# headers={"Authorization": f"Bearer {invocation.auth_token}"}
# )
# Example response
response_data = {
"jsonrpc": "2.0",
"result": {
"output": f"Processed request for {invocation.agent_name}",
"metadata": {
"processing_time_ms": 1234,
"tokens_used": 567
}
},
"id": a2a_request["id"]
}
return {
"status": "success",
"agent": invocation.agent_name,
"result": response_data["result"],
"session_id": invocation.session_id,
"timestamp": datetime.utcnow().isoformat()
}
except asyncio.TimeoutError:
return {
"status": "timeout",
"agent": invocation.agent_name,
"error": f"Agent invocation timed out after {invocation.timeout_seconds}s",
"session_id": invocation.session_id
}
except Exception as e:
return {
"status": "error",
"agent": invocation.agent_name,
"error": str(e),
"session_id": invocation.session_id
}
async def manage_agent_session(
    action: str,  # "create", "get", "update", "delete"
    session_id: Optional[str] = None,
    session_data: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """Manages agent sessions for stateful interactions.

    Args:
        action: One of "create", "get", "update", "delete".
        session_id: Session identifier (ignored for "create", which mints one).
        session_data: Session data to store/update; echoed back on "get".

    Returns:
        A status dict for the requested action; unknown actions yield a
        ``status == "error"`` dict rather than raising.
    """
    try:
        # Each branch returns immediately — guard-return style keeps the
        # per-action payloads flat and independent.
        if action == "create":
            return {
                "status": "success",
                "action": "created",
                "session_id": f"session-{datetime.utcnow().timestamp()}",
                "created_at": datetime.utcnow().isoformat()
            }
        if action == "get":
            return {
                "status": "success",
                "action": "retrieved",
                "session_id": session_id,
                "data": session_data or {},
                "retrieved_at": datetime.utcnow().isoformat()
            }
        if action == "update":
            return {
                "status": "success",
                "action": "updated",
                "session_id": session_id,
                "updated_at": datetime.utcnow().isoformat()
            }
        if action == "delete":
            return {
                "status": "success",
                "action": "deleted",
                "session_id": session_id,
                "deleted_at": datetime.utcnow().isoformat()
            }
        return {
            "status": "error",
            "error": f"Unknown action: {action}"
        }
    except Exception as e:
        return {
            "status": "error",
            "action": action,
            "error": str(e)
        }
async def validate_agent_card(
    agent_url: str,
    strict_mode: bool = True
) -> Dict[str, Any]:
    """Validates an agent's card against A2A specification.

    Args:
        agent_url: URL to fetch agent card.
        strict_mode: Whether to enforce strict validation.
            NOTE(review): currently only labels the response — no lenient
            validation path exists yet.

    Returns:
        ``status == "valid"`` with the parsed card, or ``status == "invalid"``
        with the validation error. Never raises.
    """
    try:
        # In production the card is fetched over HTTP, e.g.:
        #   async with httpx.AsyncClient() as client:
        #       response = await client.get(f"{agent_url}/agent-card")
        #       card_data = response.json()
        # Until then a stub payload stands in for the fetch. (The previous
        # version opened an AsyncClient it never used, paying connection-pool
        # setup/teardown on every call for nothing.)
        card_data = {
            "name": "example-agent",
            "description": "An example agent",
            "url": agent_url,
            "version": "1.0.0",
            "capabilities": {"nlp": True, "code": True}
        }
        # Validate using Pydantic — raises on schema violations, which the
        # except below converts into an "invalid" result.
        card = AgentCard(**card_data)
        return {
            "status": "valid",
            "agent_card": card.model_dump(),
            "validation_mode": "strict" if strict_mode else "lenient",
            "validated_at": datetime.utcnow().isoformat()
        }
    except Exception as e:
        return {
            "status": "invalid",
            "error": str(e),
            "agent_url": agent_url
        }
async def deploy_to_vertex_engine(
    agent_name: str,
    project_id: str,
    location: str = "us-central1",
    config: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """Deploys an agent to Vertex AI Engine.

    Args:
        agent_name: Name of agent to deploy.
        project_id: GCP project ID.
        location: Deployment location.
        config: Deployment configuration; a falsy value selects the defaults.

    Returns:
        Deployment status and endpoint information; failures come back as
        ``status == "deployment_failed"`` instead of raising.
    """
    try:
        # Fall back to a sensible default shape when no (or an empty)
        # config is supplied.
        deployment_config = config or {
            "machine_type": "n1-standard-4",
            "replica_count": 2,
            "auto_scaling": True
        }
        # In production, use Vertex AI SDK
        # from google.cloud import aiplatform
        # aiplatform.init(project=project_id, location=location)
        # endpoint = aiplatform.Endpoint.create(...)
        endpoint_url = (
            f"https://{location}-aiplatform.googleapis.com/v1/projects/"
            f"{project_id}/locations/{location}/endpoints/{agent_name}"
        )
        return {
            "status": "deployed",
            "agent": agent_name,
            "project": project_id,
            "location": location,
            "endpoint": endpoint_url,
            "config": deployment_config,
            "deployed_at": datetime.utcnow().isoformat()
        }
    except Exception as e:
        return {
            "status": "deployment_failed",
            "agent": agent_name,
            "error": str(e)
        }
async def monitor_agent_health(
    agent_names: List[str],
    include_metrics: bool = True
) -> Dict[str, Any]:
    """Monitors health and metrics for deployed agents.

    Args:
        agent_names: List of agents to monitor.
        include_metrics: Whether to include detailed metrics per agent.

    Returns:
        Per-agent health entries plus a healthy/unhealthy summary; errors are
        reported via ``status == "error"`` rather than raised.
    """
    try:
        health_results: Dict[str, Any] = {}
        for name in agent_names:
            # In production each agent's health endpoint would be queried;
            # static placeholder values stand in here.
            entry: Dict[str, Any] = {
                "status": "healthy",
                "availability": 99.95,
                "response_time_ms": 234,
                "error_rate": 0.001
            }
            if include_metrics:
                entry["metrics"] = {
                    "requests_per_minute": 120,
                    "tokens_per_request": 450,
                    "cache_hit_rate": 0.85,
                    "memory_usage_mb": 512
                }
            health_results[name] = entry
        healthy_total = sum(
            1 for entry in health_results.values() if entry["status"] == "healthy"
        )
        return {
            "status": "success",
            "timestamp": datetime.utcnow().isoformat(),
            "agents": health_results,
            "summary": {
                "total_agents": len(agent_names),
                "healthy": healthy_total,
                "unhealthy": len(health_results) - healthy_total
            }
        }
    except Exception as e:
        return {
            "status": "error",
            "error": str(e)
        }
async def create_agent_team(
    team_name: str,
    agent_roles: Dict[str, str],
    coordination_rules: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """Creates a team of agents for collaborative tasks.

    Args:
        team_name: Name for the agent team.
        agent_roles: Mapping of agents to their roles.
        coordination_rules: Rules for agent coordination; when falsy, the
            first-listed agent becomes the decision maker and voting/consensus
            are disabled.

    Returns:
        Team configuration and status; failures are reported via
        ``status == "error"`` rather than raised.
    """
    try:
        # First agent listed (insertion order) acts as default decision maker;
        # an empty team gets None.
        default_rules = {
            "decision_maker": next(iter(agent_roles), None),
            "voting_enabled": False,
            "consensus_required": False
        }
        return {
            "status": "created",
            "team": {
                "name": team_name,
                "agents": agent_roles,
                "coordination": coordination_rules or default_rules,
                "created_at": datetime.utcnow().isoformat()
            },
            "agent_count": len(agent_roles)
        }
    except Exception as e:
        return {
            "status": "error",
            "error": str(e)
        }
async def coordinate_workflow(
    workflow: WorkflowConfig,
    input_data: Dict[str, Any]
) -> Dict[str, Any]:
    """Coordinates multi-agent workflow execution.

    Supports three patterns: "sequential" (each agent's output feeds the
    next), "parallel" (all agents fan out via asyncio.gather), and "loop"
    (repeated passes over the agent list up to max_iterations).

    Args:
        workflow: Workflow configuration (pattern, agents, limits, error strategy).
        input_data: Input data for the workflow.

    Returns:
        Workflow execution results; exceptions are converted into a
        ``status == "error"`` dict rather than raised.
    """
    try:
        results = []
        if workflow.pattern == "sequential":
            # Execute agents in sequence
            current_input = input_data
            for agent in workflow.agents:
                invocation = AgentInvocation(
                    agent_name=agent,
                    input_data=current_input
                )
                result = await invoke_agent(invocation)
                results.append(result)
                # Pass output to next agent
                # NOTE(review): invoke_agent's stub returns a str "output";
                # feeding a str where AgentInvocation.input_data expects a
                # Dict will make pydantic raise on the next iteration (caught
                # by the outer except). Confirm intended payload shape.
                if result["status"] == "success":
                    current_input = result.get("result", {}).get("output", current_input)
                elif workflow.error_strategy == "fail_fast":
                    break
        elif workflow.pattern == "parallel":
            # Execute agents in parallel
            # return_exceptions=True keeps one failed task from cancelling
            # the rest; exceptions appear inline in `results`.
            tasks = [
                invoke_agent(AgentInvocation(agent_name=agent, input_data=input_data))
                for agent in workflow.agents
            ]
            results = await asyncio.gather(*tasks, return_exceptions=True)
        elif workflow.pattern == "loop":
            # Execute agents in a loop
            # NOTE(review): current_input is never updated inside the loop,
            # so every iteration re-sends the original input_data — verify
            # whether outputs should feed back in.
            iteration = 0
            current_input = input_data
            while iteration < workflow.max_iterations:
                for agent in workflow.agents:
                    invocation = AgentInvocation(
                        agent_name=agent,
                        input_data=current_input
                    )
                    result = await invoke_agent(invocation)
                    results.append(result)
                    # Check loop condition (simplified)
                    # Setting iteration to the max before breaking terminates
                    # the while loop after this pass.
                    if result.get("result", {}).get("complete", False):
                        iteration = workflow.max_iterations
                        break
                iteration += 1
        # NOTE(review): an unknown pattern falls through silently and returns
        # "completed" with empty results — consider rejecting it explicitly.
        return {
            "status": "completed",
            "pattern": workflow.pattern,
            "results": results,
            "agents_invoked": workflow.agents,
            "execution_time_ms": 5678,  # Would be actual timing
            "timestamp": datetime.utcnow().isoformat()
        }
    except Exception as e:
        return {
            "status": "error",
            "pattern": workflow.pattern,
            "error": str(e)
        }