/**
 * OpenAI Audio API - Text-to-Speech Examples
 *
 * This template demonstrates:
 * - Basic TTS with all 11 voices
 * - Different models (tts-1, tts-1-hd, gpt-4o-mini-tts)
 * - Voice instructions (gpt-4o-mini-tts only)
 * - Speed control
 * - Different audio formats
 * - Streaming TTS
 */
import OpenAI from 'openai';
import fs from 'fs';

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
});

// =============================================================================
// BASIC TTS
// =============================================================================

async function basicTTS() {
  const mp3 = await openai.audio.speech.create({
    model: 'tts-1',
    voice: 'alloy',
    input: 'The quick brown fox jumped over the lazy dog.',
  });

  const buffer = Buffer.from(await mp3.arrayBuffer());
  fs.writeFileSync('speech.mp3', buffer);

  console.log('Speech saved to: speech.mp3');
}

// =============================================================================
// ALL 11 VOICES
// =============================================================================

async function allVoices() {
  const voices = [
    'alloy',   // Neutral, balanced
    'ash',     // Clear, professional
    'ballad',  // Warm, storytelling
    'coral',   // Soft, friendly
    'echo',    // Calm, measured
    'fable',   // Expressive, narrative
    'onyx',    // Deep, authoritative
    'nova',    // Bright, energetic
    'sage',    // Wise, thoughtful
    'shimmer', // Gentle, soothing
    'verse',   // Poetic, rhythmic
  ] as const;

  const text = 'Hello, this is a voice sample.';

  for (const voice of voices) {
    console.log(`Generating ${voice} voice...`);

    const mp3 = await openai.audio.speech.create({
      model: 'tts-1',
      voice,
      input: text,
    });

    const buffer = Buffer.from(await mp3.arrayBuffer());
    fs.writeFileSync(`speech-${voice}.mp3`, buffer);

    // Wait 500ms between requests
    await new Promise(resolve => setTimeout(resolve, 500));
  }

  console.log('All voice samples generated!');
}
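
// =============================================================================
// BATCHED VOICE GENERATION (additional sketch)
// =============================================================================
// A minimal alternative to the sequential loop above: generate samples in small
// batches with Promise.all instead of a fixed 500ms delay. The voice subset and
// the batch size of 3 are arbitrary assumptions -- tune both to your own rate
// limits.
async function allVoicesBatched() {
  const voices = ['alloy', 'echo', 'nova', 'onyx', 'shimmer', 'fable'] as const;
  const text = 'Hello, this is a voice sample.';
  const batchSize = 3; // assumption: adjust for your rate limits

  for (let i = 0; i < voices.length; i += batchSize) {
    const batch = voices.slice(i, i + batchSize);
    await Promise.all(
      batch.map(async (voice) => {
        const mp3 = await openai.audio.speech.create({ model: 'tts-1', voice, input: text });
        const buffer = Buffer.from(await mp3.arrayBuffer());
        fs.writeFileSync(`speech-${voice}.mp3`, buffer);
        console.log(`Generated ${voice}`);
      })
    );
  }
}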

// =============================================================================
// MODEL COMPARISON
// =============================================================================

async function modelComparison() {
  const text = 'This is a test of different TTS models.';

  // tts-1 (standard quality, fastest)
  console.log('Generating with tts-1...');
  const tts1 = await openai.audio.speech.create({
    model: 'tts-1',
    voice: 'nova',
    input: text,
  });
  const buffer1 = Buffer.from(await tts1.arrayBuffer());
  fs.writeFileSync('tts-1-output.mp3', buffer1);

  // tts-1-hd (high quality)
  console.log('Generating with tts-1-hd...');
  const tts1Hd = await openai.audio.speech.create({
    model: 'tts-1-hd',
    voice: 'nova',
    input: text,
  });
  const buffer2 = Buffer.from(await tts1Hd.arrayBuffer());
  fs.writeFileSync('tts-1-hd-output.mp3', buffer2);

  console.log('Model comparison complete!');
  console.log('tts-1 file size:', buffer1.length, 'bytes');
  console.log('tts-1-hd file size:', buffer2.length, 'bytes');
}
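
// =============================================================================
// LATENCY COMPARISON (additional sketch)
// =============================================================================
// A rough way to check the "tts-1 is fastest" note above: time each request
// with Date.now(). Network conditions dominate single measurements, so treat
// the result as indicative only.
async function modelLatency() {
  const text = 'Timing test sentence.';

  for (const model of ['tts-1', 'tts-1-hd'] as const) {
    const start = Date.now();
    const audio = await openai.audio.speech.create({ model, voice: 'nova', input: text });
    await audio.arrayBuffer(); // wait for the full body before stopping the clock
    console.log(`${model}: ${Date.now() - start} ms`);
  }
}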

// =============================================================================
// VOICE INSTRUCTIONS (gpt-4o-mini-tts)
// =============================================================================

async function voiceInstructions() {
  // Example 1: Calm and professional
  const professional = await openai.audio.speech.create({
    model: 'gpt-4o-mini-tts',
    voice: 'nova',
    input: 'Welcome to our customer support line. How can I help you today?',
    instructions: 'Speak in a calm, professional, and friendly tone suitable for customer service.',
  });
  const buffer1 = Buffer.from(await professional.arrayBuffer());
  fs.writeFileSync('professional-tone.mp3', buffer1);

  // Example 2: Energetic and enthusiastic
  const energetic = await openai.audio.speech.create({
    model: 'gpt-4o-mini-tts',
    voice: 'nova',
    input: 'Get ready for the biggest sale of the year! Don\'t miss out!',
    instructions: 'Use an enthusiastic, energetic tone perfect for marketing and advertisements.',
  });
  const buffer2 = Buffer.from(await energetic.arrayBuffer());
  fs.writeFileSync('energetic-tone.mp3', buffer2);

  // Example 3: Calm and soothing
  const soothing = await openai.audio.speech.create({
    model: 'gpt-4o-mini-tts',
    voice: 'shimmer',
    input: 'Take a deep breath. Relax your shoulders. Let all tension fade away.',
    instructions: 'Adopt a calm, soothing voice suitable for meditation and relaxation guidance.',
  });
  const buffer3 = Buffer.from(await soothing.arrayBuffer());
  fs.writeFileSync('soothing-tone.mp3', buffer3);

  console.log('Voice instruction examples generated!');
}

// =============================================================================
// SPEED CONTROL
// =============================================================================

async function speedControl() {
  const text = 'This sentence will be spoken at different speeds.';
  const speeds = [0.5, 0.75, 1.0, 1.25, 1.5, 2.0];

  for (const speed of speeds) {
    console.log(`Generating at ${speed}x speed...`);

    const mp3 = await openai.audio.speech.create({
      model: 'tts-1',
      voice: 'alloy',
      input: text,
      speed,
    });

    const buffer = Buffer.from(await mp3.arrayBuffer());
    fs.writeFileSync(`speech-${speed}x.mp3`, buffer);

    await new Promise(resolve => setTimeout(resolve, 500));
  }

  console.log('Speed variations generated!');
}

// =============================================================================
// DIFFERENT AUDIO FORMATS
// =============================================================================

async function differentFormats() {
  const text = 'Testing different audio formats.';
  const formats = ['mp3', 'opus', 'aac', 'flac', 'wav', 'pcm'] as const;

  for (const format of formats) {
    console.log(`Generating ${format} format...`);

    const audio = await openai.audio.speech.create({
      model: 'tts-1',
      voice: 'alloy',
      input: text,
      response_format: format,
    });

    const buffer = Buffer.from(await audio.arrayBuffer());
    const extension = format === 'pcm' ? 'raw' : format;
    fs.writeFileSync(`speech.${extension}`, buffer);

    console.log(` ${format}: ${buffer.length} bytes`);

    await new Promise(resolve => setTimeout(resolve, 500));
  }

  console.log('All format examples generated!');
}
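
// =============================================================================
// PCM TO WAV (additional sketch)
// =============================================================================
// The raw `pcm` output above has no container, so most players cannot open it.
// This helper prepends a standard 44-byte RIFF/WAV header. It assumes the PCM
// data is 24kHz, 16-bit signed little-endian, mono -- verify those parameters
// against the current API docs if the result sounds wrong.
function pcmToWav(pcm: Buffer, sampleRate = 24000, channels = 1, bitsPerSample = 16): Buffer {
  const byteRate = (sampleRate * channels * bitsPerSample) / 8;
  const blockAlign = (channels * bitsPerSample) / 8;

  const header = Buffer.alloc(44);
  header.write('RIFF', 0);
  header.writeUInt32LE(36 + pcm.length, 4); // total file size minus 8 bytes
  header.write('WAVE', 8);
  header.write('fmt ', 12);
  header.writeUInt32LE(16, 16);             // fmt chunk size
  header.writeUInt16LE(1, 20);              // audio format: PCM
  header.writeUInt16LE(channels, 22);
  header.writeUInt32LE(sampleRate, 24);
  header.writeUInt32LE(byteRate, 28);
  header.writeUInt16LE(blockAlign, 32);
  header.writeUInt16LE(bitsPerSample, 34);
  header.write('data', 36);
  header.writeUInt32LE(pcm.length, 40);

  return Buffer.concat([header, pcm]);
}

// Example usage:
// fs.writeFileSync('speech-from-pcm.wav', pcmToWav(fs.readFileSync('speech.raw')));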

// =============================================================================
// LONG TEXT HANDLING
// =============================================================================

async function longText() {
  const longText = `
This is a longer piece of text that demonstrates how TTS handles extended content.
The model can process up to 4096 characters in a single request.
You can use this for narrating articles, generating audiobooks, or creating voice-overs.
The speech will maintain natural pacing and intonation throughout.
  `.trim();

  const mp3 = await openai.audio.speech.create({
    model: 'tts-1-hd',
    voice: 'fable', // Good for narration
    input: longText,
  });

  const buffer = Buffer.from(await mp3.arrayBuffer());
  fs.writeFileSync('long-narration.mp3', buffer);

  console.log('Long narration generated!');
  console.log('Text length:', longText.length, 'characters');
  console.log('Audio size:', buffer.length, 'bytes');
}
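
// =============================================================================
// CHUNKING TEXT OVER THE 4096-CHARACTER LIMIT (additional sketch)
// =============================================================================
// For inputs longer than 4096 characters, split on sentence boundaries and
// synthesize each chunk separately. Concatenating the resulting MP3 buffers is
// a rough approach that most players tolerate; for production, stitch the audio
// with a dedicated tool such as ffmpeg. Assumes no single sentence exceeds the
// limit on its own.
function splitForTTS(text: string, maxChars = 4096): string[] {
  const sentences = text.match(/[^.!?]+[.!?]*\s*/g) ?? [text];
  const chunks: string[] = [];
  let current = '';

  for (const sentence of sentences) {
    if ((current + sentence).length > maxChars && current) {
      chunks.push(current.trim());
      current = '';
    }
    current += sentence;
  }
  if (current.trim()) chunks.push(current.trim());

  return chunks;
}

async function veryLongText(text: string) {
  const buffers: Buffer[] = [];

  for (const [i, chunk] of splitForTTS(text).entries()) {
    console.log(`Generating chunk ${i + 1}...`);
    const mp3 = await openai.audio.speech.create({ model: 'tts-1-hd', voice: 'fable', input: chunk });
    buffers.push(Buffer.from(await mp3.arrayBuffer()));
  }

  fs.writeFileSync('very-long-narration.mp3', Buffer.concat(buffers));
}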

// =============================================================================
// STREAMING TTS (chunked audio over fetch)
// =============================================================================

async function streamingTTS() {
  // Note: requesting `stream_format: 'sse'` makes the endpoint return
  // Server-Sent Events (JSON events carrying encoded audio) rather than raw
  // bytes, which would need to be parsed before saving. This example uses the
  // default raw audio stream so chunks can be written to a file directly.
  const response = await fetch('https://api.openai.com/v1/audio/speech', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: 'gpt-4o-mini-tts',
      voice: 'nova',
      input: 'This is a streaming audio example. The audio will be generated and delivered in chunks.',
    }),
  });

  if (!response.ok || !response.body) {
    throw new Error(`TTS request failed: ${response.status} ${response.statusText}`);
  }

  console.log('Streaming TTS...');
  const reader = response.body.getReader();
  const chunks: Uint8Array[] = [];

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    chunks.push(value);
    console.log('Received chunk:', value.length, 'bytes');
  }

  // Combine chunks into a single buffer
  const totalLength = chunks.reduce((acc, chunk) => acc + chunk.length, 0);
  const combined = new Uint8Array(totalLength);
  let offset = 0;
  for (const chunk of chunks) {
    combined.set(chunk, offset);
    offset += chunk.length;
  }

  fs.writeFileSync('streaming-output.mp3', Buffer.from(combined));
  console.log('Streaming TTS saved to: streaming-output.mp3');
}
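
// =============================================================================
// STREAMING DIRECTLY TO A FILE (additional sketch)
// =============================================================================
// Instead of buffering all chunks in memory as above, each chunk can be written
// to disk as it arrives. Backpressure handling is omitted for brevity; the file
// name and prompt are illustrative only.
async function streamToFile() {
  const response = await fetch('https://api.openai.com/v1/audio/speech', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: 'tts-1',
      voice: 'nova',
      input: 'Streaming straight to disk.',
    }),
  });

  if (!response.ok || !response.body) {
    throw new Error(`TTS request failed: ${response.status}`);
  }

  const file = fs.createWriteStream('stream-to-file.mp3');
  const reader = response.body.getReader();

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    file.write(Buffer.from(value)); // write each audio chunk as it arrives
  }

  file.end();
  console.log('Saved to: stream-to-file.mp3');
}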

// =============================================================================
// ERROR HANDLING
// =============================================================================

async function withErrorHandling() {
  try {
    const mp3 = await openai.audio.speech.create({
      model: 'tts-1',
      voice: 'alloy',
      input: 'Hello world',
    });

    const buffer = Buffer.from(await mp3.arrayBuffer());
    fs.writeFileSync('output.mp3', buffer);
    return 'output.mp3';
  } catch (error: any) {
    if (error instanceof OpenAI.APIError && error.status === 429) {
      console.error('Rate limit exceeded - wait and retry');
    } else if (error instanceof OpenAI.APIError && error.status === 400) {
      // 400s cover invalid requests, e.g. input over the 4096-character limit
      // or a voice that is not one of the 11 supported voices
      console.error('Invalid request:', error.message);
    } else {
      console.error('TTS error:', error?.message ?? error);
    }
    throw error;
  }
}
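
// =============================================================================
// RETRY WITH BACKOFF (additional sketch)
// =============================================================================
// A generic wrapper that retries a call after rate-limit (429) or server (5xx)
// errors, waiting 1s, 2s, 4s... between attempts. The retry count and delays
// are arbitrary assumptions; the OpenAI SDK also has built-in retries you can
// configure via the `maxRetries` client option.
async function withRetry<T>(fn: () => Promise<T>, maxAttempts = 3): Promise<T> {
  for (let attempt = 1; ; attempt++) {
    try {
      return await fn();
    } catch (error: any) {
      const retryable = error?.status === 429 || error?.status >= 500;
      if (!retryable || attempt >= maxAttempts) throw error;

      const delayMs = 1000 * 2 ** (attempt - 1);
      console.warn(`Attempt ${attempt} failed (status ${error?.status}), retrying in ${delayMs}ms`);
      await new Promise(resolve => setTimeout(resolve, delayMs));
    }
  }
}

// Example usage:
// const mp3 = await withRetry(() =>
//   openai.audio.speech.create({ model: 'tts-1', voice: 'alloy', input: 'Hello' })
// );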

// =============================================================================
// MAIN EXECUTION
// =============================================================================

async function main() {
  console.log('=== OpenAI Text-to-Speech Examples ===\n');

  // Example 1: Basic TTS
  console.log('1. Basic TTS:');
  await basicTTS();
  console.log();

  // Example 2: All voices (uncomment to generate all)
  // console.log('2. All 11 Voices:');
  // await allVoices();
  // console.log();

  // Example 3: Model comparison
  console.log('3. Model Comparison:');
  await modelComparison();
  console.log();

  // Example 4: Voice instructions
  console.log('4. Voice Instructions (gpt-4o-mini-tts):');
  await voiceInstructions();
  console.log();

  // Example 5: Speed control
  console.log('5. Speed Control:');
  await speedControl();
  console.log();

  // Example 6: Different formats
  console.log('6. Different Audio Formats:');
  await differentFormats();
  console.log();

  // Example 7: Long text
  console.log('7. Long Text Narration:');
  await longText();
  console.log();
}

// Run if executed directly (note: `require.main` is only defined when the file
// is compiled and run as CommonJS, e.g. ts-node with its default settings)
if (require.main === module) {
  main().catch(console.error);
}

export {
  basicTTS,
  allVoices,
  modelComparison,
  voiceInstructions,
  speedControl,
  differentFormats,
  longText,
  streamingTTS,
  withErrorHandling,
};