Voice Cloning & TTS: 5 Engines Compared (OpenAI, ElevenLabs, Coqui, Google, Azure)
Hands-on comparison of OpenAI TTS, ElevenLabs, Coqui AI (local), Google Cloud TTS, and Azure Cognitive Services. Latency, quality, cost, and Python code for each.
Five TTS engines. Five completely different tradeoffs. This comparison covers real API calls, latency benchmarks, cost per word, and voice cloning capabilities — so you can pick the right engine for the right task.
Quick Comparison Matrix
| Engine | Latency | Quality | Voice Clone | Cost/1K chars | API Key | |---|---|---|---|---|---| | OpenAI TTS | ~800ms | ⭐⭐⭐⭐ | ❌ | $0.015 | Required | | ElevenLabs | ~400ms | ⭐⭐⭐⭐⭐ | ✅ | $0.30 (Starter) | Required | | Coqui TTS | ~300ms (local) | ⭐⭐⭐⭐ | ✅ | Free | None | | Google TTS | ~200ms | ⭐⭐⭐ | ❌ | $0.004 (Standard) | Required | | Azure TTS | ~250ms | ⭐⭐⭐⭐ | ✅ | $0.016 | Required |
Setup
pip install openai elevenlabs TTS google-cloud-texttospeech azure-cognitiveservices-speech pydub
Engine 1: OpenAI TTS
OpenAI's TTS API is the easiest to integrate and produces natural-sounding speech. Six voices: alloy, echo, fable, onyx, nova, shimmer.
from openai import OpenAI
import time
# Module-level client reused by tts_openai below.
# NOTE(review): never commit a real key — prefer the env-var route mentioned
# in the trailing comment (OpenAI() picks up OPENAI_API_KEY automatically).
client = OpenAI(api_key="your-key-here") # Or set OPENAI_API_KEY env var
def tts_openai(
    text: str,
    voice: str = "nova",  # alloy, echo, fable, onyx, nova, shimmer
    model: str = "tts-1",  # tts-1 (fast) or tts-1-hd (higher quality)
    output_path: str = "output.mp3",
    speed: float = 1.0  # 0.25 to 4.0
) -> dict:
    """Synthesize *text* with OpenAI TTS and write the audio to *output_path*.

    No voice cloning, but high quality out of the box. Returns a summary dict
    with engine name, voice, model, wall-clock latency, character count, and
    estimated cost in USD.
    """
    started_at = time.time()
    speech = client.audio.speech.create(
        model=model,
        voice=voice,
        input=text,
        speed=speed,
    )
    speech.stream_to_file(output_path)
    elapsed = time.time() - started_at
    # tts-1 bills at $0.015 per 1K chars; tts-1-hd (any non-tts-1 model) at $0.030.
    per_thousand_usd = 0.015 if model == "tts-1" else 0.030
    return {
        "engine": "openai",
        "voice": voice,
        "model": model,
        "latency_s": round(elapsed, 2),
        "chars": len(text),
        "cost_usd": len(text) / 1000 * per_thousand_usd,
    }
# Example: HD model with the default speaking speed.
result = tts_openai(
    "The quick brown fox jumped over the lazy dog.",
    voice="nova",
    model="tts-1-hd"
)
print(result)
# {'engine': 'openai', 'voice': 'nova', 'model': 'tts-1-hd', 'latency_s': 0.82, 'chars': 45, 'cost_usd': 0.00135}
Engine 2: ElevenLabs (Best Quality + Voice Cloning)
ElevenLabs is the gold standard for voice quality and the only API with reliable real-time voice cloning from a short audio sample.
from elevenlabs.client import ElevenLabs
from elevenlabs import VoiceSettings
import time
# Module-level client shared by tts_elevenlabs and clone_voice_elevenlabs.
# NOTE(review): placeholder key shown for the tutorial — load from an
# environment variable or secrets manager in real code.
client_el = ElevenLabs(api_key="your-elevenlabs-key")
def tts_elevenlabs(
    text: str,
    voice_id: str = "21m00Tcm4TlvDq8ikWAM",  # Rachel - default
    stability: float = 0.5,
    similarity_boost: float = 0.75,
    style: float = 0.0,
    output_path: str = "output.mp3"
) -> dict:
    """ElevenLabs TTS — highest quality, supports voice cloning.

    Streams the generated audio chunks straight to *output_path* and returns
    a dict with engine name, wall-clock latency, and character count.
    """
    t0 = time.time()
    chunk_stream = client_el.generate(
        text=text,
        voice=voice_id,
        model="eleven_multilingual_v2",  # or eleven_turbo_v2 for speed
        voice_settings=VoiceSettings(
            stability=stability,
            similarity_boost=similarity_boost,
            style=style,
            use_speaker_boost=True,
        ),
    )
    # writelines consumes the chunk iterator, writing each bytes chunk in turn.
    with open(output_path, "wb") as out:
        out.writelines(chunk_stream)
    took = time.time() - t0
    return {"engine": "elevenlabs", "latency_s": round(took, 2), "chars": len(text)}
# Clone a voice from a sample audio file
def clone_voice_elevenlabs(
    name: str,
    sample_files: list,  # List of .mp3/.wav paths (30s+ recommended)
    description: str = ""
) -> str:
    """Clone a voice from audio samples. Returns voice_id.

    Args:
        name: Display name for the cloned voice.
        sample_files: Paths to .mp3/.wav samples (30s+ of clean speech recommended).
        description: Optional description stored alongside the voice.

    Returns:
        The ElevenLabs voice_id of the newly created voice.
    """
    from contextlib import ExitStack

    # BUG FIX: the original only closed the sample files on the success path,
    # leaking file handles whenever clone() raised. ExitStack closes every
    # opened file no matter how we leave the block.
    with ExitStack() as stack:
        files = [stack.enter_context(open(f, "rb")) for f in sample_files]
        voice = client_el.clone(
            name=name,
            description=description,
            files=files
        )
    print(f"Voice cloned: {voice.voice_id}")
    return voice.voice_id
# Clone your voice
# voice_id = clone_voice_elevenlabs("MyVoice", ["sample1.mp3", "sample2.mp3"])
# tts_elevenlabs("Hello world!", voice_id=voice_id)
Engine 3: Coqui TTS (Local, Free, Voice Cloning)
Coqui runs entirely on your machine — no API key, no usage costs, and it supports voice cloning from a short reference audio.
from TTS.api import TTS
import torch
import time
def tts_coqui(
    text: str,
    model_name: str = "tts_models/en/ljspeech/tacotron2-DDC",
    speaker_wav: str = None,  # Path to reference audio for voice cloning
    output_path: str = "output.wav",
    language: str = "en"
) -> dict:
    """
    Coqui TTS - runs locally, no API key needed.
    Use speaker_wav for voice cloning (XTTS model required).
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Cloning requires the multilingual XTTS-v2 checkpoint, so a supplied
    # reference sample overrides whatever model_name was requested.
    if speaker_wav:
        model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    synthesizer = TTS(model_name=model_name, progress_bar=False).to(device)
    t0 = time.time()
    synth_kwargs = {"text": text, "file_path": output_path}
    if speaker_wav:
        # Condition generation on the reference audio (voice cloning path).
        synth_kwargs["speaker_wav"] = speaker_wav
        synth_kwargs["language"] = language
    synthesizer.tts_to_file(**synth_kwargs)
    elapsed = time.time() - t0
    return {
        "engine": "coqui",
        "model": model_name,
        "latency_s": round(elapsed, 2),
        "cost": 0.0,
        "device": device,
    }
# Standard TTS (downloads the Tacotron2 checkpoint on first run)
result = tts_coqui("This runs entirely on your GPU. No API key needed.")
print(result)
# Voice cloning from a 30-second sample — note this forces the XTTS-v2 model
result = tts_coqui(
    "Hello, this is my cloned voice.",
    speaker_wav="my_voice_sample.wav",
    output_path="cloned_output.wav"
)
Engine 4: Google Cloud TTS
Google's WaveNet and Neural2 voices are cheaper than OpenAI/ElevenLabs and support 40+ languages.
from google.cloud import texttospeech
import time
def tts_google(
    text: str,
    language_code: str = "en-US",
    voice_name: str = "en-US-Neural2-F",  # Neural2 voices are best quality
    speaking_rate: float = 1.0,
    pitch: float = 0.0,
    output_path: str = "output.mp3"
) -> dict:
    """
    Google Cloud TTS. Set GOOGLE_APPLICATION_CREDENTIALS env var.
    Neural2 voices cost $0.016/1K chars; Standard is $0.004/1K.

    Returns a dict with engine, voice, latency, and estimated cost — the cost
    now reflects the tier of the voice actually used (the original always
    billed at the premium rate, even for Standard voices).
    """
    client = texttospeech.TextToSpeechClient()
    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice = texttospeech.VoiceSelectionParams(
        language_code=language_code,
        name=voice_name
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
        speaking_rate=speaking_rate,
        pitch=pitch,
        effects_profile_id=["headphone-class-device"]
    )
    start = time.time()
    response = client.synthesize_speech(
        input=synthesis_input, voice=voice, audio_config=audio_config
    )
    latency = time.time() - start
    with open(output_path, "wb") as f:
        f.write(response.audio_content)
    # Standard voices are $0.004/1K chars; Neural2/WaveNet are premium $0.016/1K.
    rate = 0.004 if "Standard" in voice_name else 0.016
    return {"engine": "google", "voice": voice_name, "latency_s": round(latency, 2),
            "cost_usd": len(text) / 1000 * rate}
# List available voices
def list_google_voices(language_code: str = "en-US"):
    """Print every Google TTS voice available for *language_code*."""
    tts_client = texttospeech.TextToSpeechClient()
    catalog = tts_client.list_voices(language_code=language_code)
    for entry in catalog.voices:
        supported = ", ".join(entry.language_codes)
        print(f"{entry.name} | {entry.ssml_gender.name} | {supported}")
Engine 5: Azure Cognitive Services TTS
Azure's neural voices are competitive with ElevenLabs for quality and support SSML for fine-grained control.
import azure.cognitiveservices.speech as speechsdk
import time
def tts_azure(
    text: str,
    voice_name: str = "en-US-JennyNeural",
    output_path: str = "output.wav",
    subscription_key: str = None,
    region: str = None
) -> dict:
    """
    Azure Cognitive Services TTS.
    Set AZURE_SPEECH_KEY and AZURE_SPEECH_REGION env vars.

    Args:
        text: Text to synthesize.
        voice_name: Azure neural voice name.
        output_path: Destination .wav file.
        subscription_key: Azure Speech key; falls back to AZURE_SPEECH_KEY.
        region: Azure region; falls back to AZURE_SPEECH_REGION, then "eastus".
    """
    import os
    key = subscription_key or os.getenv("AZURE_SPEECH_KEY")
    # BUG FIX: region previously defaulted to the truthy string "eastus", so
    # `region or os.getenv(...)` never consulted AZURE_SPEECH_REGION despite
    # the docstring. Defaulting to None restores the documented fallback chain.
    region = region or os.getenv("AZURE_SPEECH_REGION", "eastus")
    config = speechsdk.SpeechConfig(subscription=key, region=region)
    config.speech_synthesis_voice_name = voice_name
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_path)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=config, audio_config=audio_config)
    start = time.time()
    result = synthesizer.speak_text_async(text).get()
    latency = time.time() - start
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        return {"engine": "azure", "voice": voice_name, "latency_s": round(latency, 2), "success": True}
    # cancellation_details is only populated for canceled requests; guard so
    # other failure reasons don't raise an AttributeError while reporting.
    details = getattr(result, "cancellation_details", None)
    error = details.error_details if details else result.reason
    return {"engine": "azure", "error": str(error)}
# SSML for fine-grained control
def tts_azure_ssml(ssml: str, output_path: str):
    """Synthesize an SSML document with Azure TTS for pronunciation, pauses, and emphasis.

    BUG FIX: the original body only defined an unused example string and never
    called the SDK — both parameters were ignored. This version actually
    renders *ssml* to *output_path* and returns the SDK result.

    Example SSML:
        <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
          <voice name="en-US-JennyNeural">
            Welcome to <emphasis level="strong">NEPA AI</emphasis>.
            <break time="500ms"/>
            Your <prosody rate="slow">intelligent workspace</prosody> awaits.
          </voice>
        </speak>
    """
    import os
    config = speechsdk.SpeechConfig(
        subscription=os.getenv("AZURE_SPEECH_KEY"),
        region=os.getenv("AZURE_SPEECH_REGION", "eastus"),
    )
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_path)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=config, audio_config=audio_config)
    # speak_ssml_async (not speak_text_async) interprets SSML markup.
    return synthesizer.speak_ssml_async(ssml).get()
Choosing the Right Engine
def choose_tts_engine(use_case: str) -> str:
    """Recommend a TTS engine for a free-text use-case description.

    Matches case-insensitive keywords in priority order; the first rule whose
    keyword appears in the description wins.
    """
    needle = use_case.lower()
    rules = (
        (("clone", "custom voice"), "elevenlabs (best) or coqui (free local)"),
        (("free", "no api", "local"), "coqui - runs on your GPU, zero cost"),
        (("bulk", "cheap", "batch"), "google (cheapest at $0.004/1K) or openai tts-1"),
        (("podcast", "video", "quality"), "elevenlabs (best quality) or azure neural"),
        (("multilingual", "international"), "google (40+ languages) or azure (45+ languages)"),
    )
    for keywords, recommendation in rules:
        if any(kw in needle for kw in keywords):
            return recommendation
    return "openai tts-1-hd (best balance of quality, cost, simplicity)"
The TTS Workspace ships with all 5 engines pre-configured, SSML support, voice cloning workflows, batch narration for long-form content, and audio post-processing (normalization, noise reduction, silence trimming) — 898 lines ready to drop into any project.