Five TTS engines, five different tradeoffs. Here’s a quick rundown:
Quick Comparison Matrix
| Engine | Latency | Quality | Voice Clone | Cost/1K chars | API Key |
|---|---|---|---|---|---|
| OpenAI TTS | ~800ms | ⭐⭐⭐⭐ | ❌ | $0.015 | Required |
| ElevenLabs | ~400ms | ⭐⭐⭐⭐⭐ | ✅ | $0.30 (Starter) | Required |
| Coqui TTS | ~300ms (local) | ⭐⭐⭐⭐ | ✅ | Free | None |
| Google TTS | ~200ms | ⭐⭐⭐ | ❌ | $0.004 (Standard) | Required |
| Azure TTS | ~250ms | ⭐⭐⭐⭐ | ✅ | $0.016 | Required |
Setup
pip install openai elevenlabs TTS google-cloud-texttospeech azure-cognitiveservices-speech pydub
Engine 1: OpenAI TTS
OpenAI's API is simple to use and gives natural speech. Six voices included.
from openai import OpenAI
import time
# Module-level OpenAI client shared by tts_openai(). Replace the placeholder
# key below, or export OPENAI_API_KEY instead of hard-coding a secret.
client = OpenAI(api_key="your-key-here") # Or set OPENAI_API_KEY env var
def tts_openai(
    text: str,
    voice: str = "nova",  # alloy, echo, fable, onyx, nova, shimmer
    model: str = "tts-1",  # tts-1 (fast) or tts-1-hd (higher quality)
    output_path: str = "output.mp3",
    speed: float = 1.0,  # 0.25 to 4.0
) -> dict:
    """Synthesize ``text`` with OpenAI TTS and write the audio to ``output_path``.

    No voice cloning, but high quality out of the box.

    Args:
        text: Text to speak.
        voice: One of the built-in voice names listed on the parameter.
        model: ``"tts-1"`` (fast) or ``"tts-1-hd"`` (higher quality).
        output_path: Destination audio file.
        speed: Playback speed multiplier (0.25 to 4.0).

    Returns:
        A dict with the engine name, voice, model, wall-clock latency in
        seconds (request plus file write), character count and an estimated
        cost in USD.
    """
    # perf_counter is monotonic, so the measured latency cannot go negative
    # or jump if the system clock is adjusted mid-request.
    start = time.perf_counter()
    # Use the streaming-response API: calling stream_to_file() on the plain
    # (non-streaming) response is deprecated in the SDK because it buffers
    # the whole body first.
    with client.audio.speech.with_streaming_response.create(
        model=model,
        voice=voice,
        input=text,
        speed=speed,
    ) as response:
        response.stream_to_file(output_path)
    latency = time.perf_counter() - start

    # Any model other than "tts-1" is priced at the HD rate.
    rate_per_1k = 0.015 if model == "tts-1" else 0.030
    return {
        "engine": "openai",
        "voice": voice,
        "model": model,
        "latency_s": round(latency, 2),
        "chars": len(text),
        "cost_usd": len(text) / 1000 * rate_per_1k,
    }
# Example: narrate one sentence with the higher-quality model.
result = tts_openai(
    text="The quick brown fox jumped over the lazy dog.",
    model="tts-1-hd",
    voice="nova",
)
print(result)
# Sample output (latency varies from run to run):
# {'engine': 'openai', 'voice': 'nova', 'model': 'tts-1-hd', 'latency_s': 0.82, 'chars': 45, 'cost_usd': 0.00135}
Engine 2: ElevenLabs (Best Quality + Voice Cloning)
ElevenLabs is the best for voice quality and supports real-time voice cloning.
from elevenlabs.client import ElevenLabs
from elevenlabs import VoiceSettings
import time
# Module-level ElevenLabs client shared by tts_elevenlabs() and
# clone_voice_elevenlabs(). Replace the placeholder with a real API key.
client_el = ElevenLabs(api_key="your-elevenlabs-key")
def tts_elevenlabs(
    text: str,
    voice_id: str = "21m00Tcm4TlvDq8ikWAM",  # Rachel - default
    stability: float = 0.5,
    similarity_boost: float = 0.75,
    style: float = 0.0,
    output_path: str = "output.mp3",
    model: str = "eleven_multilingual_v2",  # or eleven_turbo_v2 for speed
) -> dict:
    """Synthesize ``text`` with ElevenLabs and write the audio to ``output_path``.

    Highest quality of the engines compared here; supports cloned voices via
    ``voice_id``.

    Args:
        text: Text to speak.
        voice_id: ElevenLabs voice identifier (stock or cloned).
        stability: Forwarded to ``VoiceSettings.stability``.
        similarity_boost: Forwarded to ``VoiceSettings.similarity_boost``.
        style: Forwarded to ``VoiceSettings.style``.
        output_path: Destination audio file.
        model: ElevenLabs model id. Defaults to the multilingual model that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        A dict with the engine name, voice id, model, wall-clock latency in
        seconds (request plus file write) and character count.
    """
    settings = VoiceSettings(
        stability=stability,
        similarity_boost=similarity_boost,
        style=style,
        use_speaker_boost=True,
    )

    # Monotonic clock: immune to wall-clock adjustments during the request.
    start = time.perf_counter()
    audio = client_el.generate(
        text=text,
        voice=voice_id,
        model=model,
        voice_settings=settings,
    )
    # The audio is consumed chunk by chunk, so keep the write inside the
    # timed region: the loop is what actually drains the response.
    with open(output_path, "wb") as f:
        for chunk in audio:
            f.write(chunk)
    latency = time.perf_counter() - start

    return {
        "engine": "elevenlabs",
        "voice": voice_id,
        "model": model,
        "latency_s": round(latency, 2),
        "chars": len(text),
    }
# Clone a voice from a sample audio file
def clone_voice_elevenlabs(
    name: str,
    sample_files: list,  # List of .mp3/.wav paths (30s+ recommended)
    description: str = "",
) -> str:
    """Clone a voice from audio samples and return its ``voice_id``.

    Args:
        name: Display name for the cloned voice.
        sample_files: Paths to the sample recordings (.mp3/.wav).
        description: Optional free-text description of the voice.

    Returns:
        The ``voice_id`` of the newly created voice.

    Raises:
        ValueError: If ``sample_files`` is empty.
        OSError: If a sample file cannot be opened.
    """
    if not sample_files:
        raise ValueError("sample_files must contain at least one audio file path")

    handles = []
    try:
        # Open one at a time so that, if a later path is bad, the handles
        # already opened are still closed by the finally block.
        for path in sample_files:
            handles.append(open(path, "rb"))
        voice = client_el.clone(
            name=name,
            description=description,
            files=handles,
        )
    finally:
        # Always release the file handles, including when the API call raises.
        for handle in handles:
            handle.close()

    print(f"Voice cloned: {voice.voice_id}")
    return voice.voice_id
# Clone your voice
# voice_id = clone_voice_elevenlabs("MyVoice", ["sample1.mp3", "sample2.mp3"])
# tts_elevenlabs("Hello world!", voice_id=voice_id)
Engine 3: Coqui TTS (Local, Free, Voice Cloning)
Coqui runs locally and supports voice cloning with a short reference audio.
from TTS.api import TTS
import torch
import time
def tts_coqui(
    text: str,
    model_name: str = "tts_models/en/ljspeech/tacotron2-DDC",
    speaker_wav: str = None,  # Path to reference audio for voice cloning
    output_path: str = "output.wav",
    language: str = "en",
) -> dict:
    """Synthesize ``text`` locally with Coqui TTS; no API key needed.

    When ``speaker_wav`` is given, ``model_name`` is replaced by the XTTS-v2
    model and the output is conditioned on that reference audio (voice
    cloning). ``language`` is only passed along in that cloning case.

    Returns:
        A dict with the engine name, the model actually used, synthesis
        latency in seconds (model loading is not included), a zero cost and
        the device the model ran on.
    """
    cloning = bool(speaker_wav)
    if cloning:
        # For voice cloning, use XTTS-v2
        model_name = "tts_models/multilingual/multi-dataset/xtts_v2"

    device = "cuda" if torch.cuda.is_available() else "cpu"
    synthesizer = TTS(model_name=model_name, progress_bar=False).to(device)

    synth_kwargs = {"text": text, "file_path": output_path}
    if cloning:
        # Voice cloning: condition on reference audio
        synth_kwargs.update(speaker_wav=speaker_wav, language=language)

    started = time.time()
    synthesizer.tts_to_file(**synth_kwargs)
    elapsed = time.time() - started

    return {
        "engine": "coqui",
        "model": model_name,
        "latency_s": round(elapsed, 2),
        "cost": 0.0,
        "device": device,
    }
# Plain synthesis with the default model
result = tts_coqui(text="This runs entirely on your GPU. No API key needed.")
print(result)
# Voice cloning: condition the output on a 30-second reference recording
result = tts_coqui(
    text="Hello, this is my cloned voice.",
    output_path="cloned_output.wav",
    speaker_wav="my_voice_sample.wav",
)
Engine 4: Google Cloud TTS
Google Cloud TTS is the cheapest hosted engine in this comparison at its Standard tier, and its WaveNet and Neural2 voices cover many languages.
from google.cloud import texttospeech
import time
def tts_google(
    text: str,
    language_code: str = "en-US",
    voice_name: str = "en-US-Neural2-F",  # Neural2 voices are best quality
    speaking_rate: float = 1.0,
    pitch: float = 0.0,
    output_path: str = "output.mp3",
) -> dict:
    """Synthesize ``text`` with Google Cloud TTS and write MP3 to ``output_path``.

    Set the GOOGLE_APPLICATION_CREDENTIALS env var before calling.
    Neural2 voices cost $0.016/1K chars; Standard is $0.004/1K.

    Args:
        text: Text to speak.
        language_code: Language/locale code of the voice.
        voice_name: Full voice name, e.g. ``"en-US-Neural2-F"``.
        speaking_rate: Speaking-rate multiplier passed to ``AudioConfig``.
        pitch: Pitch adjustment passed to ``AudioConfig``.
        output_path: Destination MP3 file.

    Returns:
        A dict with the engine name, voice, API latency in seconds (file
        write excluded) and an estimated cost in USD.
    """
    client = texttospeech.TextToSpeechClient()
    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice = texttospeech.VoiceSelectionParams(
        language_code=language_code,
        name=voice_name,
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
        speaking_rate=speaking_rate,
        pitch=pitch,
        effects_profile_id=["headphone-class-device"],
    )

    # Monotonic clock: immune to wall-clock adjustments during the request.
    start = time.perf_counter()
    response = client.synthesize_speech(
        input=synthesis_input, voice=voice, audio_config=audio_config
    )
    latency = time.perf_counter() - start

    with open(output_path, "wb") as f:
        f.write(response.audio_content)

    # Price by voice tier instead of always charging the Neural2 rate.
    # NOTE(review): every non-Standard voice is estimated at $0.016/1K;
    # confirm against current Google pricing for other tiers.
    rate_per_1k = 0.004 if "standard" in voice_name.lower() else 0.016
    return {
        "engine": "google",
        "voice": voice_name,
        "latency_s": round(latency, 2),
        "cost_usd": len(text) / 1000 * rate_per_1k,
    }
# List available voices
def list_google_voices(language_code: str = "en-US"):
    """Print one ``name | gender | language codes`` line per available voice.

    Queries Google Cloud TTS for the voices matching ``language_code``.
    """
    tts_client = texttospeech.TextToSpeechClient()
    listing = tts_client.list_voices(language_code=language_code)
    for entry in listing.voices:
        langs = ", ".join(entry.language_codes)
        print(f"{entry.name} | {entry.ssml_gender.name} | {langs}")
Engine 5: Azure Cognitive Services TTS
Azure's neural voices are high quality and support SSML.
import azure.cognitiveservices.speech as speechsdk
import time
def tts_azure(
    text: str,
    voice_name: str = "en-US-JennyNeural",
    output_path: str = "output.wav",
    subscription_key: str = None,
    region: str = None,
) -> dict:
    """Synthesize ``text`` with Azure Cognitive Services TTS into ``output_path``.

    Credentials are resolved as: explicit argument, then the AZURE_SPEECH_KEY /
    AZURE_SPEECH_REGION env vars, then (region only) ``"eastus"``.

    Args:
        text: Text to speak.
        voice_name: Azure neural voice name.
        output_path: Destination audio file.
        subscription_key: Speech resource key; falls back to AZURE_SPEECH_KEY.
        region: Speech resource region; falls back to AZURE_SPEECH_REGION,
            then ``"eastus"``.

    Returns:
        On success, a dict with engine, voice, latency in seconds and
        ``"success": True``. Otherwise a dict with engine and an ``"error"``
        message taken from the cancellation details.

    Raises:
        ValueError: If no subscription key is supplied or configured.
    """
    import os

    key = subscription_key or os.getenv("AZURE_SPEECH_KEY")
    if not key:
        raise ValueError(
            "Azure subscription key missing: pass subscription_key or set AZURE_SPEECH_KEY"
        )
    # The default must be None rather than "eastus": a truthy default would
    # short-circuit the `or` and AZURE_SPEECH_REGION would never be read.
    region = region or os.getenv("AZURE_SPEECH_REGION", "eastus")

    config = speechsdk.SpeechConfig(subscription=key, region=region)
    config.speech_synthesis_voice_name = voice_name
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_path)
    synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=config, audio_config=audio_config
    )

    # Monotonic clock: immune to wall-clock adjustments during the request.
    start = time.perf_counter()
    result = synthesizer.speak_text_async(text).get()
    latency = time.perf_counter() - start

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        return {
            "engine": "azure",
            "voice": voice_name,
            "latency_s": round(latency, 2),
            "success": True,
        }
    return {
        "engine": "azure",
        "error": str(result.cancellation_details.error_details),
    }
# SSML for fine-grained control
def tts_azure_ssml(
    ssml: str,
    output_path: str,
    subscription_key: str = None,
    region: str = None,
) -> dict:
    """Synthesize an SSML document with Azure TTS into ``output_path``.

    Use SSML for pronunciation, pauses, and emphasis. The voice is selected
    by the ``<voice>`` element inside the SSML, so no voice name is set on
    the speech config.

    Example SSML:

        <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
        <voice name="en-US-JennyNeural">
        Welcome to <emphasis level="strong">NEPA AI</emphasis>.
        <break time="500ms"/>
        Your <prosody rate="slow">intelligent workspace</prosody> awaits.
        </voice>
        </speak>

    Args:
        ssml: Complete SSML document to render.
        output_path: Destination audio file.
        subscription_key: Speech resource key; falls back to AZURE_SPEECH_KEY.
        region: Speech resource region; falls back to AZURE_SPEECH_REGION,
            then ``"eastus"``.

    Returns:
        On success, a dict with engine, latency in seconds and
        ``"success": True``. Otherwise a dict with engine and an ``"error"``
        message taken from the cancellation details.

    Raises:
        ValueError: If ``ssml`` is empty or no subscription key is available.
    """
    import os

    if not ssml or not ssml.strip():
        raise ValueError("ssml must be a non-empty SSML document")
    key = subscription_key or os.getenv("AZURE_SPEECH_KEY")
    if not key:
        raise ValueError(
            "Azure subscription key missing: pass subscription_key or set AZURE_SPEECH_KEY"
        )
    region = region or os.getenv("AZURE_SPEECH_REGION", "eastus")

    config = speechsdk.SpeechConfig(subscription=key, region=region)
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_path)
    synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=config, audio_config=audio_config
    )

    start = time.perf_counter()
    # speak_ssml_async renders the markup; speak_text_async would read the
    # tags aloud as literal text.
    result = synthesizer.speak_ssml_async(ssml).get()
    latency = time.perf_counter() - start

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        return {"engine": "azure", "latency_s": round(latency, 2), "success": True}
    return {
        "engine": "azure",
        "error": str(result.cancellation_details.error_details),
    }
Choosing the Right Engine
def choose_tts_engine(use_case: str) -> str:
    """Recommend a TTS engine for a free-text description of the use case.

    The description is lower-cased and scanned for keywords. Rules are tried
    in priority order and the first rule with any matching keyword wins; if
    nothing matches, a general-purpose default is returned.
    """
    description = use_case.lower()

    # (keywords, recommendation) pairs, highest priority first.
    rules = (
        (("clone", "custom voice"),
         "elevenlabs (best) or coqui (free local)"),
        (("free", "no api", "local"),
         "coqui - runs on your GPU, zero cost"),
        (("bulk", "cheap", "batch"),
         "google (cheapest at $0.004/1K) or openai tts-1"),
        (("podcast", "video", "quality"),
         "elevenlabs (best quality) or azure neural"),
        (("multilingual", "international"),
         "google (40+ languages) or azure (45+ languages)"),
    )
    for keywords, recommendation in rules:
        if any(keyword in description for keyword in keywords):
            return recommendation

    return "openai tts-1-hd (best balance of quality, cost, simplicity)"
The TTS Workspace comes with all 5 engines pre-configured, SSML support, voice cloning workflows, batch narration for long-form content, and audio post-processing (normalization, noise reduction, silence trimming) — 898 lines ready to drop into any project.



