gh-jezweb-claude-skills-ski…/templates/text-to-speech.ts

/**
 * OpenAI Audio API - Text-to-Speech Examples
 *
 * This template demonstrates:
 * - Basic TTS with all 11 voices
 * - Different models (tts-1, tts-1-hd, gpt-4o-mini-tts)
 * - Voice instructions (gpt-4o-mini-tts only)
 * - Speed control
 * - Different audio formats
 * - Streaming TTS
 */

import OpenAI from 'openai';
import fs from 'fs';

// Shared OpenAI SDK client used by the SDK-based examples in this file.
// The API key is read from the OPENAI_API_KEY environment variable.
const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
});

// =============================================================================
// BASIC TTS
// =============================================================================

/**
 * Minimal text-to-speech example.
 *
 * Requests speech for one fixed sentence using the `tts-1` model and the
 * `alloy` voice, then writes the returned audio bytes to `speech.mp3` in the
 * current working directory.
 */
async function basicTTS() {
  const speechResponse = await openai.audio.speech.create({
    model: 'tts-1',
    voice: 'alloy',
    input: 'The quick brown fox jumped over the lazy dog.',
  });

  // The response exposes the audio as an ArrayBuffer; wrap it in a Buffer for fs.
  const audioBytes = await speechResponse.arrayBuffer();
  fs.writeFileSync('speech.mp3', Buffer.from(audioBytes));

  console.log('Speech saved to: speech.mp3');
}

// =============================================================================
// ALL 11 VOICES
// =============================================================================

/**
 * Generates one sample file per built-in voice (`speech-<voice>.mp3`).
 *
 * Uses `gpt-4o-mini-tts` because the list below includes `ballad` and `verse`,
 * which the `tts-1` / `tts-1-hd` models reject; requesting them from `tts-1`
 * would abort the loop partway through.
 *
 * Requests are issued sequentially with a 500ms pause between them.
 */
async function allVoices() {
  const voices = [
    'alloy',   // Neutral, balanced
    'ash',     // Clear, professional
    'ballad',  // Warm, storytelling
    'coral',   // Soft, friendly
    'echo',    // Calm, measured
    'fable',   // Expressive, narrative
    'onyx',    // Deep, authoritative
    'nova',    // Bright, energetic
    'sage',    // Wise, thoughtful
    'shimmer', // Gentle, soothing
    'verse',   // Poetic, rhythmic
  ] as const;

  const text = 'Hello, this is a voice sample.';

  for (const [index, voice] of voices.entries()) {
    console.log(`Generating ${voice} voice...`);

    const mp3 = await openai.audio.speech.create({
      model: 'gpt-4o-mini-tts', // only model that accepts every voice listed above
      voice,
      input: text,
    });

    const buffer = Buffer.from(await mp3.arrayBuffer());
    fs.writeFileSync(`speech-${voice}.mp3`, buffer);

    // Wait 500ms between requests (no need to wait after the final one)
    if (index < voices.length - 1) {
      await new Promise(resolve => setTimeout(resolve, 500));
    }
  }

  console.log('All voice samples generated!');
}

// =============================================================================
// MODEL COMPARISON
// =============================================================================

/**
 * Renders the same sentence with `tts-1` and then `tts-1-hd` (both using the
 * `nova` voice), saves each result to its own MP3 file, and prints the size
 * of both outputs so they can be compared.
 */
async function modelComparison() {
  const text = 'This is a test of different TTS models.';

  // Shared request/write step; returns the audio so its size can be reported.
  const render = async (model: 'tts-1' | 'tts-1-hd', outputPath: string) => {
    console.log(`Generating with ${model}...`);

    const speech = await openai.audio.speech.create({
      model,
      voice: 'nova',
      input: text,
    });

    const audio = Buffer.from(await speech.arrayBuffer());
    fs.writeFileSync(outputPath, audio);
    return audio;
  };

  // Standard quality first, then high quality — run one after the other.
  const standardAudio = await render('tts-1', 'tts-1-output.mp3');
  const highQualityAudio = await render('tts-1-hd', 'tts-1-hd-output.mp3');

  console.log('Model comparison complete!');
  console.log('tts-1 file size:', standardAudio.length, 'bytes');
  console.log('tts-1-hd file size:', highQualityAudio.length, 'bytes');
}

// =============================================================================
// VOICE INSTRUCTIONS (gpt-4o-mini-tts)
// =============================================================================

/**
 * Demonstrates the `instructions` parameter of `gpt-4o-mini-tts`.
 *
 * Three samples are generated one after another, each pairing an input text
 * with a tone instruction, and each written to its own MP3 file.
 */
async function voiceInstructions() {
  const samples = [
    {
      // Calm and professional
      outputPath: 'professional-tone.mp3',
      voice: 'nova',
      input: 'Welcome to our customer support line. How can I help you today?',
      instructions: 'Speak in a calm, professional, and friendly tone suitable for customer service.',
    },
    {
      // Energetic and enthusiastic
      outputPath: 'energetic-tone.mp3',
      voice: 'nova',
      input: 'Get ready for the biggest sale of the year! Don\'t miss out!',
      instructions: 'Use an enthusiastic, energetic tone perfect for marketing and advertisements.',
    },
    {
      // Calm and soothing
      outputPath: 'soothing-tone.mp3',
      voice: 'shimmer',
      input: 'Take a deep breath. Relax your shoulders. Let all tension fade away.',
      instructions: 'Adopt a calm, soothing voice suitable for meditation and relaxation guidance.',
    },
  ] as const;

  for (const sample of samples) {
    const speech = await openai.audio.speech.create({
      model: 'gpt-4o-mini-tts',
      voice: sample.voice,
      input: sample.input,
      instructions: sample.instructions,
    });

    const audio = Buffer.from(await speech.arrayBuffer());
    fs.writeFileSync(sample.outputPath, audio);
  }

  console.log('Voice instruction examples generated!');
}

// =============================================================================
// SPEED CONTROL
// =============================================================================

/**
 * Speaks one sentence at several playback speeds (0.5x through 2.0x) using
 * `tts-1` / `alloy`, writing each rendering to `speech-<speed>x.mp3`.
 *
 * Requests run sequentially, each followed by a 500ms pause.
 */
async function speedControl() {
  const sentence = 'This sentence will be spoken at different speeds.';
  const pause = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  for (const speed of [0.5, 0.75, 1.0, 1.25, 1.5, 2.0]) {
    console.log(`Generating at ${speed}x speed...`);

    const speech = await openai.audio.speech.create({
      model: 'tts-1',
      voice: 'alloy',
      input: sentence,
      speed,
    });

    const audioBytes = await speech.arrayBuffer();
    fs.writeFileSync(`speech-${speed}x.mp3`, Buffer.from(audioBytes));

    await pause(500);
  }

  console.log('Speed variations generated!');
}

// =============================================================================
// DIFFERENT AUDIO FORMATS
// =============================================================================

/**
 * Requests the same sentence in each supported `response_format` and saves
 * every result as `speech.<extension>`, logging the byte size per format.
 *
 * The `pcm` format is stored with a `.raw` extension; every other format uses
 * its own name as the extension. Each request is followed by a 500ms pause.
 */
async function differentFormats() {
  const sentence = 'Testing different audio formats.';
  const pause = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  const formats = ['mp3', 'opus', 'aac', 'flac', 'wav', 'pcm'] as const;

  for (const format of formats) {
    console.log(`Generating ${format} format...`);

    const speech = await openai.audio.speech.create({
      model: 'tts-1',
      voice: 'alloy',
      input: sentence,
      response_format: format,
    });

    const audio = Buffer.from(await speech.arrayBuffer());

    // Pick the file extension: 'pcm' output is saved as '.raw'.
    let extension: string = format;
    if (format === 'pcm') {
      extension = 'raw';
    }
    fs.writeFileSync(`speech.${extension}`, audio);

    console.log(`  ${format}: ${audio.length} bytes`);

    await pause(500);
  }

  console.log('All format examples generated!');
}

// =============================================================================
// LONG TEXT HANDLING
// =============================================================================

/**
 * Narrates a multi-sentence passage with `tts-1-hd` and the `fable` voice,
 * saves it to `long-narration.mp3`, and reports the text length and the
 * resulting audio size.
 */
async function longText() {
  // Trimmed so the request does not start or end with template whitespace.
  const narration = `
    This is a longer piece of text that demonstrates how TTS handles extended content.
    The model can process up to 4096 characters in a single request.
    You can use this for narrating articles, generating audiobooks, or creating voice-overs.
    The speech will maintain natural pacing and intonation throughout.
  `.trim();

  const speech = await openai.audio.speech.create({
    model: 'tts-1-hd',
    voice: 'fable', // Good for narration
    input: narration,
  });

  const audio = Buffer.from(await speech.arrayBuffer());
  fs.writeFileSync('long-narration.mp3', audio);

  console.log('Long narration generated!');
  console.log('Text length:', narration.length, 'characters');
  console.log('Audio size:', audio.length, 'bytes');
}

// =============================================================================
// STREAMING TTS (Server-Sent Events)
// =============================================================================

/**
 * Extracts the audio bytes carried by one raw Server-Sent Events block.
 *
 * @param rawEvent Text of a single SSE event (the lines between blank-line separators).
 * @returns The base64-decoded audio of a `speech.audio.delta` event, or `null`
 *          for any other event (e.g. `speech.audio.done`, comments, `[DONE]`).
 * @throws SyntaxError if the event's `data:` payload is not valid JSON.
 */
function decodeSpeechAudioDelta(rawEvent: string): Buffer | null {
  // An SSE event may spread its payload over several `data:` lines.
  const data = rawEvent
    .split(/\r?\n/)
    .filter(line => line.startsWith('data:'))
    .map(line => line.slice('data:'.length).replace(/^ /, ''))
    .join('\n');

  if (data === '' || data === '[DONE]') return null;

  const payload: unknown = JSON.parse(data);
  if (typeof payload !== 'object' || payload === null) return null;

  const event = payload as { type?: unknown; audio?: unknown };
  if (event.type !== 'speech.audio.delta' || typeof event.audio !== 'string') return null;

  return Buffer.from(event.audio, 'base64');
}

/**
 * Streams speech from the REST endpoint using `stream_format: 'sse'` and saves
 * the audio to `streaming-output.mp3`.
 *
 * With SSE the response body is a text event stream, not raw audio: each
 * `speech.audio.delta` event carries a base64-encoded audio chunk. The events
 * are parsed and decoded as they arrive; writing the raw response bytes to disk
 * would produce an unplayable file.
 *
 * @throws Error if the HTTP request fails, the response has no body, or the
 *         stream finishes without delivering any audio.
 */
async function streamingTTS() {
  const response = await fetch('https://api.openai.com/v1/audio/speech', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: 'gpt-4o-mini-tts',
      voice: 'nova',
      input: 'This is a streaming audio example. The audio will be generated and delivered in chunks.',
      stream_format: 'sse', // Server-Sent Events
    }),
  });

  // Fail loudly instead of saving an API error payload as if it were audio.
  if (!response.ok) {
    throw new Error(`Streaming TTS request failed (${response.status}): ${await response.text()}`);
  }
  if (!response.body) {
    throw new Error('Streaming TTS response has no body');
  }

  console.log('Streaming TTS...');

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  const audioChunks: Buffer[] = [];
  let pending = '';

  // Decode every complete event in `pending`; unless flushing, keep the trailing
  // fragment because a network chunk can end in the middle of an event.
  const drainEvents = (flush: boolean) => {
    const rawEvents = pending.split(/\r?\n\r?\n/);
    pending = flush ? '' : rawEvents.pop() ?? '';

    for (const rawEvent of rawEvents) {
      const audio = decodeSpeechAudioDelta(rawEvent);
      if (audio === null) continue;

      audioChunks.push(audio);
      console.log('Received chunk:', audio.length, 'bytes');
    }
  };

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    // `stream: true` keeps multi-byte characters split across chunks intact.
    pending += decoder.decode(value, { stream: true });
    drainEvents(false);
  }

  pending += decoder.decode();
  drainEvents(true);

  if (audioChunks.length === 0) {
    throw new Error('Streaming TTS finished without any audio data');
  }

  fs.writeFileSync('streaming-output.mp3', Buffer.concat(audioChunks));
  console.log('Streaming TTS saved to: streaming-output.mp3');
}

// =============================================================================
// ERROR HANDLING
// =============================================================================

/**
 * Generates a short clip and shows how to classify common TTS failures.
 *
 * A hint is logged for over-long input, an unrecognized voice, or a rate limit
 * (HTTP 429); anything else is logged with its message. The original error is
 * always re-thrown so callers can still handle it.
 *
 * The caught value is treated as `unknown` and narrowed before use, so a thrown
 * value without a string `message` cannot crash the handler itself and hide
 * the real failure.
 *
 * @returns The path of the written file (`output.mp3`).
 * @throws Whatever the request or the file write threw, unchanged.
 */
async function withErrorHandling() {
  try {
    const mp3 = await openai.audio.speech.create({
      model: 'tts-1',
      voice: 'alloy',
      input: 'Hello world',
    });

    const buffer = Buffer.from(await mp3.arrayBuffer());
    fs.writeFileSync('output.mp3', buffer);

    return 'output.mp3';
  } catch (error: unknown) {
    // Read `message` / `status` defensively: not every thrown value is an Error.
    const details: { message?: unknown; status?: unknown } =
      typeof error === 'object' && error !== null ? error : {};
    const message = typeof details.message === 'string' ? details.message : String(error);

    if (message.includes('input too long')) {
      console.error('Text exceeds 4096 character limit');
    } else if (message.includes('invalid voice')) {
      console.error('Voice not recognized - use one of the 11 supported voices');
    } else if (details.status === 429) {
      console.error('Rate limit exceeded - wait and retry');
    } else {
      console.error('TTS error:', message);
    }

    throw error;
  }
}

// =============================================================================
// MAIN EXECUTION
// =============================================================================

/**
 * Runs the example functions in order, printing a heading before each one and
 * a blank line after it. Examples run sequentially; the first rejection stops
 * the sequence and propagates to the caller.
 */
async function main() {
  console.log('=== OpenAI Text-to-Speech Examples ===\n');

  // Heading / example pairs, executed top to bottom.
  const examples: Array<[string, () => Promise<void>]> = [
    ['1. Basic TTS:', basicTTS],
    // Example 2 is disabled by default (uncomment to generate all voices):
    // ['2. All 11 Voices:', allVoices],
    ['3. Model Comparison:', modelComparison],
    ['4. Voice Instructions (gpt-4o-mini-tts):', voiceInstructions],
    ['5. Speed Control:', speedControl],
    ['6. Different Audio Formats:', differentFormats],
    ['7. Long Text Narration:', longText],
  ];

  for (const [heading, runExample] of examples) {
    console.log(heading);
    await runExample();
    console.log();
  }
}

// Run the examples only when this file is the process entry point (CommonJS
// `require.main === module` check), not when it is imported by another module.
// A rejection from main() is logged with console.error.
if (require.main === module) {
  main().catch(console.error);
}

export {
  basicTTS,
  allVoices,
  modelComparison,
  voiceInstructions,
  speedControl,
  differentFormats,
  longText,
  streamingTTS,
  withErrorHandling,
};