Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:25:09 +08:00
commit 9475095985
30 changed files with 5609 additions and 0 deletions

View File

@@ -0,0 +1,187 @@
/**
* Basic Realtime Voice Agent
*
* Demonstrates:
* - Creating a realtime voice agent
* - Defining tools for voice agents
* - Configuring voice and instructions
* - Understanding WebRTC vs WebSocket transports
*
* NOTE: This runs in the browser or in a Node.js environment with WebRTC support
*/
import { z } from 'zod';
import { RealtimeAgent, tool } from '@openai/agents-realtime';
// ========================================
// Tools for Voice Agent
// ========================================
// Note: Tools for realtime agents execute in the client environment
// For sensitive operations, make HTTP requests to your backend
const checkWeatherTool = tool({
  name: 'check_weather',
  description: 'Check current weather for a city',
  parameters: z.object({
    city: z.string().describe('City name'),
    units: z.enum(['celsius', 'fahrenheit']).optional().default('celsius'),
  }),
  // Returns a mock weather report; in production, call a real weather API.
  execute: async ({ city, units }) => {
    // Mock temperature is generated in Celsius (10–39 °C).
    const tempC = Math.floor(Math.random() * 30) + 10;
    const isCelsius = units === 'celsius';
    // FIX: previously the Celsius-range number was reported with a °F suffix
    // when fahrenheit was requested; now convert before formatting.
    const temp = isCelsius ? tempC : Math.round((tempC * 9) / 5 + 32);
    return `The weather in ${city} is sunny and ${temp}°${isCelsius ? 'C' : 'F'}`;
  },
});
const setReminderTool = tool({
  name: 'set_reminder',
  description: 'Set a reminder for the user',
  parameters: z.object({
    message: z.string(),
    timeMinutes: z.number().describe('Minutes from now'),
  }),
  // Mock implementation: logs the reminder. In production, persist it
  // to a database via an API call.
  execute: async (input) => {
    const { message, timeMinutes } = input;
    console.log(`Reminder set: "${message}" in ${timeMinutes} minutes`);
    return `I'll remind you about "${message}" in ${timeMinutes} minutes`;
  },
});
const searchDocsTool = tool({
  name: 'search_docs',
  description: 'Search documentation',
  parameters: z.object({
    query: z.string(),
  }),
  // Stub result; in production, call your real search API.
  execute: async ({ query }) => `Found documentation about: ${query}`,
});
// ========================================
// Create Realtime Voice Agent
// ========================================
/**
 * Primary realtime voice agent: bundles instructions, tools, voice,
 * model, and turn-detection configuration in one place.
 */
const voiceAssistant = new RealtimeAgent({
  name: 'Voice Assistant',
  instructions: `You are a friendly and helpful voice assistant.
- Keep responses concise and conversational
- Use natural speech patterns
- When using tools, explain what you're doing
- Be proactive in offering help`,
  tools: [checkWeatherTool, setReminderTool, searchDocsTool],
  // OpenAI voice preset; alternatives: echo, fable, onyx, nova, shimmer.
  voice: 'alloy',
  // The realtime API requires a realtime-capable model.
  model: 'gpt-4o-realtime-preview',
  // Server-side voice activity detection decides when the user's turn ends.
  turnDetection: {
    type: 'server_vad',
    threshold: 0.5,           // VAD sensitivity (0-1)
    prefix_padding_ms: 300,   // audio kept from before speech was detected
    silence_duration_ms: 500, // trailing silence that closes the turn
  },
  temperature: 0.7,       // response creativity
  maxOutputTokens: 4096,  // cap on response length
});
// ========================================
// Example: Create Session (Node.js)
// ========================================
/**
* For Node.js environments, you need to manually manage the session.
* See realtime-session-browser.tsx for browser usage.
*/
/**
 * Creates and connects a voice session for Node.js environments.
 * WebRTC transport is browser-only, so WebSocket transport is used here;
 * see realtime-session-browser.tsx for the browser flow.
 *
 * @returns the connected session; call session.disconnect() to end it.
 */
async function createNodeSession() {
  const { OpenAIRealtimeWebSocket } = await import('@openai/agents-realtime');
  const wsTransport = new OpenAIRealtimeWebSocket({
    apiKey: process.env.OPENAI_API_KEY,
  });
  const session = await voiceAssistant.createSession({ transport: wsTransport });

  // Lifecycle events
  session.on('connected', () => console.log('✅ Voice session connected'));
  session.on('disconnected', () => console.log('🔌 Voice session disconnected'));
  session.on('error', (error) => console.error('❌ Session error:', error));

  // Transcription events
  session.on('audio.transcription.completed', (event) => {
    console.log('User said:', event.transcript);
  });
  session.on('agent.audio.done', (event) => {
    console.log('Agent said:', event.transcript);
  });

  // Tool-call events
  session.on('tool.call', (event) => {
    console.log('Tool called:', event.name, event.arguments);
  });
  session.on('tool.result', (event) => {
    console.log('Tool result:', event.result);
  });

  // Start streaming; disconnect later via session.disconnect().
  await session.connect();
  return session;
}
// ========================================
// Transport Options
// ========================================
/**
* WebRTC Transport (recommended for browser)
* - Lower latency
* - Better for real-time voice
* - Requires browser environment
*
* WebSocket Transport
* - Works in Node.js
* - Slightly higher latency
* - Simpler setup
*/
// Uncomment to run in Node.js
// createNodeSession().catch(console.error);
export {
voiceAssistant,
checkWeatherTool,
setReminderTool,
searchDocsTool,
createNodeSession,
};

View File

@@ -0,0 +1,215 @@
/**
* Realtime Agent Handoffs (Voice)
*
* Demonstrates:
* - Multi-agent voice workflows
* - Handoffs between voice agents
* - Automatic conversation history passing
* - Voice/model constraints during handoffs
*
* IMPORTANT: Unlike text agents, realtime agent handoffs have constraints:
* - Cannot change voice during handoff
* - Cannot change model during handoff
* - Conversation history automatically passed
*/
import { z } from 'zod';
import { RealtimeAgent, tool } from '@openai/agents-realtime';
// ========================================
// Specialized Agent Tools
// ========================================
const checkAccountTool = tool({
  name: 'check_account',
  description: 'Look up account information',
  parameters: z.object({
    accountId: z.string(),
  }),
  // Mock lookup: returns a canned account summary.
  execute: async (input) => {
    const { accountId } = input;
    return `Account ${accountId}: Premium tier, billing current, last login: 2025-10-20`;
  },
});
const processPaymentTool = tool({
  name: 'process_payment',
  description: 'Process a payment',
  parameters: z.object({
    accountId: z.string(),
    amount: z.number(),
  }),
  // Mock processor: echoes a confirmation string.
  execute: async ({ accountId, amount }) =>
    `Payment of $${amount} processed for account ${accountId}`,
});
const checkSystemTool = tool({
  name: 'check_system',
  description: 'Check system status',
  // A status check takes no inputs.
  parameters: z.object({}),
  execute: async () => 'All systems operational: API ✅, Database ✅, CDN ✅',
});
const createTicketTool = tool({
  name: 'create_ticket',
  description: 'Create support ticket',
  parameters: z.object({
    title: z.string(),
    priority: z.enum(['low', 'medium', 'high']),
  }),
  // Generates a pseudo-random ticket id; persist to a real tracker in production.
  execute: async ({ title, priority }) => {
    const ticketId = `TICKET-${Math.floor(Math.random() * 10000)}`;
    return `Created ${priority} priority ticket ${ticketId}: ${title}`;
  },
});
// ========================================
// Specialized Voice Agents
// ========================================
/**
 * Billing specialist — reached via handoff from the triage agent.
 * Its voice must match the triage agent's (realtime handoff constraint).
 */
const billingAgent = new RealtimeAgent({
  name: 'Billing Specialist',
  instructions: `You handle billing and payment questions.
- Be professional and empathetic
- Explain charges clearly
- Process payments when requested
- Keep responses concise for voice`,
  handoffDescription: 'Transfer for billing, payments, or account questions',
  tools: [checkAccountTool, processPaymentTool],
  voice: 'nova',
});
/**
 * Technical-support specialist — reached via handoff from the triage agent.
 * Its voice must match the triage agent's (realtime handoff constraint).
 */
const technicalAgent = new RealtimeAgent({
  name: 'Technical Support',
  instructions: `You handle technical issues and system problems.
- Diagnose issues systematically
- Provide clear troubleshooting steps
- Create tickets for complex issues
- Use simple language for voice`,
  handoffDescription: 'Transfer for technical problems, bugs, or system issues',
  tools: [checkSystemTool, createTicketTool],
  voice: 'nova',
});
// ========================================
// Triage Agent (Entry Point)
// ========================================
/**
 * Entry-point agent: greets callers and routes them to a specialist.
 * The voice and model set here are used by every agent in the handoff chain.
 */
const triageVoiceAgent = new RealtimeAgent({
  name: 'Customer Service',
  instructions: `You are the first point of contact.
- Greet customers warmly
- Understand their issue
- Route to the right specialist
- Explain the transfer before handing off`,
  handoffs: [billingAgent, technicalAgent],
  voice: 'nova',
  model: 'gpt-4o-realtime-preview',
});
// ========================================
// Important Notes about Voice Handoffs
// ========================================
/**
* KEY DIFFERENCES from text agent handoffs:
*
* 1. VOICE CONSTRAINT
* - All agents in a handoff chain must use the same voice
* - Voice is set by the initial agent
* - Cannot change voice during handoff
*
* 2. MODEL CONSTRAINT
* - All agents must use the same model
* - Model is set by the initial agent
* - Cannot change model during handoff
*
* 3. AUTOMATIC HISTORY
* - Conversation history automatically passed to delegated agent
* - No need to manually manage context
* - Specialist agents can see full conversation
*
* 4. SEAMLESS AUDIO
* - Audio stream continues during handoff
* - User doesn't need to reconnect
* - Tools execute in same session
*/
// ========================================
// Example: Create Session with Handoffs
// ========================================
/**
 * Connects a voice session rooted at the triage agent and logs handoffs,
 * transcripts, and tool activity as they occur.
 *
 * @returns the connected session.
 */
async function createVoiceSessionWithHandoffs() {
  const { OpenAIRealtimeWebSocket } = await import('@openai/agents-realtime');
  const wsTransport = new OpenAIRealtimeWebSocket({
    apiKey: process.env.OPENAI_API_KEY,
  });
  const session = await triageVoiceAgent.createSession({ transport: wsTransport });

  // Name of whichever agent currently owns the conversation.
  let activeAgent = 'Customer Service';

  session.on('connected', () => {
    console.log('✅ Voice session connected');
    console.log('🎙️ Current agent:', activeAgent);
  });
  // Fires when an LLM-routed handoff switches the active agent.
  session.on('agent.changed', (event: any) => {
    activeAgent = event.agent.name;
    console.log('\n🔄 HANDOFF to:', activeAgent);
  });
  session.on('audio.transcription.completed', (event) => {
    console.log(`👤 User: ${event.transcript}`);
  });
  session.on('agent.audio.done', (event) => {
    console.log(`🤖 ${activeAgent}: ${event.transcript}`);
  });
  session.on('tool.call', (event) => {
    console.log(`\n🛠 Tool: ${event.name}`);
    console.log(` Arguments:`, event.arguments);
  });
  session.on('tool.result', (event) => {
    console.log(`✅ Result:`, event.result, '\n');
  });

  await session.connect();

  console.log('\n💡 Try saying:');
  console.log(' - "I have a question about my bill"');
  console.log(' - "The API is returning errors"');
  console.log(' - "I need to update my payment method"');
  console.log('\n');
  return session;
}
// ========================================
// Example: Manual Handoff Triggering
// ========================================
/**
* While handoffs usually happen automatically via LLM routing,
* you can also programmatically trigger them if needed via
* backend delegation patterns (see agent-patterns.md reference).
*/
// Uncomment to run
// createVoiceSessionWithHandoffs().catch(console.error);
export {
triageVoiceAgent,
billingAgent,
technicalAgent,
createVoiceSessionWithHandoffs,
};

View File

@@ -0,0 +1,369 @@
/**
* Realtime Voice Session - React Browser Client
*
* Demonstrates:
* - Creating a voice session in the browser
* - Using WebRTC transport for low latency
* - Handling audio I/O automatically
* - Managing session lifecycle
* - Displaying transcripts and tool calls
*
* IMPORTANT: Generate ephemeral API keys server-side, never expose your main API key
*/
import React, { useState, useEffect, useRef } from 'react';
import { RealtimeSession, RealtimeAgent } from '@openai/agents-realtime';
import { z } from 'zod';
// ========================================
// Voice Agent Definition
// ========================================
import { tool } from '@openai/agents-realtime';
const weatherTool = tool({
  name: 'get_weather',
  description: 'Get weather for a city',
  parameters: z.object({
    city: z.string(),
  }),
  // Delegates to the backend so API credentials never reach the client.
  execute: async ({ city }) => {
    // FIX: encode the city so spaces, '&', '#', etc. cannot corrupt the
    // query string (previously interpolated raw).
    const response = await fetch(`/api/weather?city=${encodeURIComponent(city)}`);
    // FIX: surface HTTP failures instead of parsing an error body as JSON.
    if (!response.ok) {
      throw new Error(`Weather request failed with status ${response.status}`);
    }
    const data = await response.json();
    return data.weather;
  },
});
// Browser-side voice agent; audio I/O is handled by the session transport.
const voiceAgent = new RealtimeAgent({
  name: 'Voice Assistant',
  instructions: 'You are a helpful voice assistant. Keep responses concise and friendly.',
  voice: 'alloy',
  tools: [weatherTool],
});
// ========================================
// React Component
// ========================================
// One entry in the rendered conversation transcript.
interface Message {
  role: 'user' | 'assistant'; // who produced the utterance
  content: string;            // transcript text from the session event
  timestamp: Date;            // when the transcript was recorded client-side
}
// Record of one tool invocation surfaced in the UI.
interface ToolCall {
  name: string;                   // tool name reported by the session
  arguments: Record<string, any>; // arguments the model supplied
  result?: any;                   // filled in when the tool.result event arrives
}
/**
 * Voice assistant UI: owns the realtime session lifecycle and renders
 * connection status, the running transcript, tool activity, and controls.
 * The session is created once on mount and torn down on unmount.
 */
export function VoiceAssistant() {
  const [isConnected, setIsConnected] = useState(false);
  const [isListening, setIsListening] = useState(false);
  const [messages, setMessages] = useState<Message[]>([]);
  const [toolCalls, setToolCalls] = useState<ToolCall[]>([]);
  const [error, setError] = useState<string | null>(null);
  // Keeps the live session reachable from button handlers without re-rendering.
  const sessionRef = useRef<RealtimeSession | null>(null);

  // ========================================
  // Initialize Session
  // ========================================
  useEffect(() => {
    let session: RealtimeSession;
    async function initSession() {
      try {
        // Get an ephemeral API key from the backend — never expose the main key.
        const response = await fetch('/api/generate-session-key');
        const { apiKey } = await response.json();
        // WebRTC transport gives the lowest latency in the browser.
        session = new RealtimeSession(voiceAgent, {
          apiKey,
          transport: 'webrtc', // or 'websocket'
        });
        sessionRef.current = session;

        // ---- Session lifecycle events ----
        session.on('connected', () => {
          console.log('✅ Connected to voice session');
          setIsConnected(true);
          setError(null);
        });
        session.on('disconnected', () => {
          console.log('🔌 Disconnected from voice session');
          setIsConnected(false);
          setIsListening(false);
        });
        session.on('error', (err) => {
          console.error('❌ Session error:', err);
          setError(err.message);
        });

        // ---- Transcription events ----
        session.on('audio.transcription.completed', (event) => {
          // User finished speaking
          setMessages(prev => [...prev, {
            role: 'user',
            content: event.transcript,
            timestamp: new Date(),
          }]);
          setIsListening(false);
        });
        session.on('audio.transcription.started', () => {
          // User started speaking
          setIsListening(true);
        });
        session.on('agent.audio.done', (event) => {
          // Agent finished speaking
          setMessages(prev => [...prev, {
            role: 'assistant',
            content: event.transcript,
            timestamp: new Date(),
          }]);
        });

        // ---- Tool call events ----
        session.on('tool.call', (event) => {
          console.log('🛠️ Tool call:', event.name, event.arguments);
          setToolCalls(prev => [...prev, {
            name: event.name,
            arguments: event.arguments,
          }]);
        });
        session.on('tool.result', (event) => {
          console.log('✅ Tool result:', event.result);
          // FIX: attach the result to the most recent unresolved call only.
          // Mapping on name alone overwrote every call with the same tool name,
          // including earlier calls that already had results.
          setToolCalls(prev => {
            const next = [...prev];
            for (let i = next.length - 1; i >= 0; i--) {
              if (next[i].name === event.name && next[i].result === undefined) {
                next[i] = { ...next[i], result: event.result };
                break;
              }
            }
            return next;
          });
        });

        // Connect to start the session
        await session.connect();
      } catch (err: any) {
        console.error('Failed to initialize session:', err);
        setError(err.message);
      }
    }
    initSession();
    // Cleanup on unmount
    return () => {
      if (session) {
        session.disconnect();
      }
    };
  }, []);

  // ========================================
  // Manual Control Functions
  // ========================================
  // Cut the agent off mid-utterance.
  const handleInterrupt = () => {
    if (sessionRef.current) {
      sessionRef.current.interrupt();
    }
  };
  // End the session entirely.
  const handleDisconnect = () => {
    if (sessionRef.current) {
      sessionRef.current.disconnect();
    }
  };

  // ========================================
  // Render UI
  // ========================================
  return (
    <div className="voice-assistant">
      <div className="status-bar">
        <div className={`status ${isConnected ? 'connected' : 'disconnected'}`}>
          {isConnected ? '🟢 Connected' : '🔴 Disconnected'}
        </div>
        {isListening && <div className="listening">🎤 Listening...</div>}
      </div>
      {error && (
        <div className="error">
          Error: {error}
        </div>
      )}
      <div className="messages">
        {messages.map((msg, i) => (
          <div key={i} className={`message ${msg.role}`}>
            <div className="role">{msg.role === 'user' ? '👤' : '🤖'}</div>
            <div className="content">
              <p>{msg.content}</p>
              <span className="timestamp">
                {msg.timestamp.toLocaleTimeString()}
              </span>
            </div>
          </div>
        ))}
      </div>
      {toolCalls.length > 0 && (
        <div className="tool-calls">
          <h3>🛠 Tool Calls</h3>
          {toolCalls.map((tc, i) => (
            <div key={i} className="tool-call">
              <strong>{tc.name}</strong>
              <pre>{JSON.stringify(tc.arguments, null, 2)}</pre>
              {tc.result && (
                <div className="result">
                  Result: {JSON.stringify(tc.result)}
                </div>
              )}
            </div>
          ))}
        </div>
      )}
      <div className="controls">
        <button
          onClick={handleInterrupt}
          disabled={!isConnected}
        >
          Interrupt
        </button>
        <button
          onClick={handleDisconnect}
          disabled={!isConnected}
        >
          🔌 Disconnect
        </button>
      </div>
      <style jsx>{`
        .voice-assistant {
          max-width: 600px;
          margin: 0 auto;
          padding: 20px;
        }
        .status-bar {
          display: flex;
          gap: 20px;
          margin-bottom: 20px;
        }
        .status {
          padding: 8px 16px;
          border-radius: 20px;
          font-size: 14px;
        }
        .status.connected {
          background: #d4edda;
          color: #155724;
        }
        .status.disconnected {
          background: #f8d7da;
          color: #721c24;
        }
        .listening {
          padding: 8px 16px;
          background: #fff3cd;
          color: #856404;
          border-radius: 20px;
          font-size: 14px;
        }
        .error {
          padding: 12px;
          background: #f8d7da;
          color: #721c24;
          border-radius: 8px;
          margin-bottom: 20px;
        }
        .messages {
          height: 400px;
          overflow-y: auto;
          border: 1px solid #ddd;
          border-radius: 8px;
          padding: 16px;
          margin-bottom: 20px;
        }
        .message {
          display: flex;
          gap: 12px;
          margin-bottom: 16px;
        }
        .message.user {
          justify-content: flex-end;
        }
        .content {
          max-width: 70%;
          padding: 12px;
          border-radius: 12px;
        }
        .message.user .content {
          background: #007bff;
          color: white;
        }
        .message.assistant .content {
          background: #f1f3f4;
          color: #000;
        }
        .timestamp {
          font-size: 11px;
          opacity: 0.6;
        }
        .tool-calls {
          margin-bottom: 20px;
          padding: 12px;
          background: #f8f9fa;
          border-radius: 8px;
        }
        .tool-call {
          margin: 8px 0;
          padding: 8px;
          background: white;
          border-radius: 4px;
        }
        .controls {
          display: flex;
          gap: 12px;
        }
        button {
          flex: 1;
          padding: 12px;
          border: none;
          border-radius: 8px;
          background: #007bff;
          color: white;
          cursor: pointer;
        }
        button:disabled {
          background: #ccc;
          cursor: not-allowed;
        }
        button:hover:not(:disabled) {
          background: #0056b3;
        }
      `}</style>
    </div>
  );
}
export default VoiceAssistant;