OpenAI Text-to-Speech
Generate high-quality spoken audio from text using OpenAI's TTS API.
Authentication
The API key is available as environment variable:
OPENAI_API_KEY
Models
gpt-4o-mini-tts- Newest, most reliable. Supports tone/style instructions.tts-1- Lower latency, lower qualitytts-1-hd- Higher quality, higher latency
Voice Options
Built-in voices (English optimized):
alloy,ash,ballad,coral,echo,fablenova,onyx,sage,shimmer,versemarin,cedar- Recommended for best quality
Note: tts-1 and tts-1-hd only support: alloy, ash, coral, echo, fable, onyx, nova, sage, shimmer.
Python Example
from pathlib import Path
from openai import OpenAI
client = OpenAI() # Uses OPENAI_API_KEY env var
# Basic usage
with client.audio.speech.with_streaming_response.create(
model="gpt-4o-mini-tts",
voice="coral",
input="Hello, world!",
) as response:
response.stream_to_file("output.mp3")
# With tone instructions (gpt-4o-mini-tts only)
with client.audio.speech.with_streaming_response.create(
model="gpt-4o-mini-tts",
voice="coral",
input="Today is a wonderful day!",
instructions="Speak in a cheerful and positive tone.",
) as response:
response.stream_to_file("output.mp3")
Handling Long Text
For long documents, split into chunks and concatenate:
from openai import OpenAI
from pydub import AudioSegment
import tempfile
import re
import os
client = OpenAI()
def chunk_text(text, max_chars=4000):
"""Split text into chunks at sentence boundaries."""
sentences = re.split(r'(?<=[.!?])\s+', text)
chunks = []
current_chunk = ""
for sentence in sentences:
if len(current_chunk) + len(sentence) < max_chars:
current_chunk += sentence + " "
else:
if current_chunk:
chunks.append(current_chunk.strip())
current_chunk = sentence + " "
if current_chunk:
chunks.append(current_chunk.strip())
return chunks
def text_to_audiobook(text, output_path):
"""Convert long text to audio file."""
chunks = chunk_text(text)
audio_segments = []
for chunk in chunks:
with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
tmp_path = tmp.name
with client.audio.speech.with_streaming_response.create(
model="gpt-4o-mini-tts",
voice="coral",
input=chunk,
) as response:
response.stream_to_file(tmp_path)
segment = AudioSegment.from_mp3(tmp_path)
audio_segments.append(segment)
os.unlink(tmp_path)
# Concatenate all segments
combined = audio_segments[0]
for segment in audio_segments[1:]:
combined += segment
combined.export(output_path, format="mp3")
Output Formats
mp3- Default, general useopus- Low latency streamingaac- Digital compression (YouTube, iOS)flac- Lossless compressionwav- Uncompressed, low latencypcm- Raw samples (24kHz, 16-bit)
with client.audio.speech.with_streaming_response.create(
model="gpt-4o-mini-tts",
voice="coral",
input="Hello!",
response_format="wav", # Specify format
) as response:
response.stream_to_file("output.wav")
Best Practices
- Use
marinorcedarvoices for best quality - Split text at sentence boundaries for long content
- Use
wavorpcmfor lowest latency - Add
instructionsparameter to control tone/style (gpt-4o-mini-tts only)