Back to Blog
2026-03-22

Voice Cloning & TTS: 5 Engines Compared (OpenAI, ElevenLabs, Coqui, Google, Azure)

Hands-on comparison of OpenAI TTS, ElevenLabs, Coqui AI (local), Google Cloud TTS, and Azure Cognitive Services. Latency, quality, cost, and Python code for each.

Five TTS engines. Five completely different tradeoffs. This comparison covers real API calls, latency benchmarks, cost per word, and voice cloning capabilities — so you can pick the right engine for the right task.

Quick Comparison Matrix

| Engine | Latency | Quality | Voice Clone | Cost/1K chars | API Key |
|---|---|---|---|---|---|
| OpenAI TTS | ~800ms | ⭐⭐⭐⭐ | ❌ | $0.015 | Required |
| ElevenLabs | ~400ms | ⭐⭐⭐⭐⭐ | ✅ | $0.30 (Starter) | Required |
| Coqui TTS | ~300ms (local) | ⭐⭐⭐⭐ | ✅ | Free | None |
| Google TTS | ~200ms | ⭐⭐⭐ | ❌ | $0.004 (Standard) | Required |
| Azure TTS | ~250ms | ⭐⭐⭐⭐ | ✅ | $0.016 | Required |

Setup

pip install openai elevenlabs TTS google-cloud-texttospeech azure-cognitiveservices-speech pydub

Engine 1: OpenAI TTS

OpenAI's TTS API is the easiest to integrate and produces natural-sounding speech. Six voices: alloy, echo, fable, onyx, nova, shimmer.

import os
import time

from openai import OpenAI

# Never hard-code secrets in source: prefer the OPENAI_API_KEY env var,
# keeping the placeholder only as a visible fallback for the snippet.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", "your-key-here"))

# Price per 1K characters in USD, per OpenAI's published TTS pricing.
_OPENAI_TTS_RATES = {"tts-1": 0.015, "tts-1-hd": 0.030}


def tts_openai(
    text: str,
    voice: str = "nova",       # alloy, echo, fable, onyx, nova, shimmer
    model: str = "tts-1",      # tts-1 (fast) or tts-1-hd (higher quality)
    output_path: str = "output.mp3",
    speed: float = 1.0         # 0.25 to 4.0
) -> dict:
    """OpenAI text-to-speech. No voice cloning, but high quality out of the box.

    Args:
        text: The text to synthesize.
        voice: One of the six built-in voice names.
        model: "tts-1" (low latency) or "tts-1-hd" (higher fidelity).
        output_path: Destination path for the MP3 audio.
        speed: Playback speed multiplier, 0.25 to 4.0.

    Returns:
        Dict with engine, voice, model, wall-clock latency in seconds,
        character count, and estimated cost in USD.
    """
    start = time.time()

    response = client.audio.speech.create(
        model=model,
        voice=voice,
        input=text,
        speed=speed
    )

    # NOTE(review): stream_to_file is deprecated in recent openai SDKs;
    # client.audio.speech.with_streaming_response.create(...) is the
    # replacement — confirm against the SDK version you pin.
    response.stream_to_file(output_path)
    latency = time.time() - start

    # Unknown model names are billed at the HD rate, matching the original
    # "anything that isn't tts-1 costs 0.030" behavior.
    rate = _OPENAI_TTS_RATES.get(model, 0.030)

    return {
        "engine": "openai",
        "voice": voice,
        "model": model,
        "latency_s": round(latency, 2),
        "chars": len(text),
        "cost_usd": len(text) / 1000 * rate
    }

# Demo: synthesize one sentence with the higher-quality HD model.
result = tts_openai(
    "The quick brown fox jumped over the lazy dog.",
    voice="nova",
    model="tts-1-hd"
)
print(result)
# Example output (the returned dict also includes 'model': 'tts-1-hd'):
# {'engine': 'openai', 'voice': 'nova', 'latency_s': 0.82, 'chars': 45, 'cost_usd': 0.00135}

Engine 2: ElevenLabs (Best Quality + Voice Cloning)

ElevenLabs is the gold standard for voice quality and the only API with reliable real-time voice cloning from a short audio sample.

import os
import time

from elevenlabs.client import ElevenLabs
from elevenlabs import VoiceSettings

# Read the key from ELEVENLABS_API_KEY rather than hard-coding a secret;
# the placeholder remains only so the snippet is self-describing.
client_el = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY", "your-elevenlabs-key"))

def tts_elevenlabs(
    text: str,
    voice_id: str = "21m00Tcm4TlvDq8ikWAM",  # Rachel - default
    stability: float = 0.5,
    similarity_boost: float = 0.75,
    style: float = 0.0,
    output_path: str = "output.mp3"
) -> dict:
    """Synthesize *text* with ElevenLabs — highest quality, supports voice cloning."""
    settings = VoiceSettings(
        stability=stability,
        similarity_boost=similarity_boost,
        style=style,
        use_speaker_boost=True
    )

    t0 = time.time()
    audio_chunks = client_el.generate(
        text=text,
        voice=voice_id,
        model="eleven_multilingual_v2",  # or eleven_turbo_v2 for speed
        voice_settings=settings
    )
    # generate() yields the audio in chunks; assemble them into one file.
    with open(output_path, "wb") as out:
        out.write(b"".join(audio_chunks))
    elapsed = time.time() - t0

    return {"engine": "elevenlabs", "latency_s": round(elapsed, 2), "chars": len(text)}

# Clone a voice from a sample audio file
def clone_voice_elevenlabs(
    name: str,
    sample_files: list,  # List of .mp3/.wav paths (30s+ recommended)
    description: str = ""
) -> str:
    """Clone a voice from audio samples. Returns voice_id.

    Args:
        name: Display name for the cloned voice.
        sample_files: Paths to reference recordings (30s+ recommended).
        description: Optional description stored with the voice.
    """
    from contextlib import ExitStack

    # ExitStack guarantees every sample file is closed even if clone()
    # raises — the original leaked all open handles on failure.
    with ExitStack() as stack:
        files = [stack.enter_context(open(path, "rb")) for path in sample_files]
        voice = client_el.clone(
            name=name,
            description=description,
            files=files
        )

    print(f"Voice cloned: {voice.voice_id}")
    return voice.voice_id

# Clone your voice
# voice_id = clone_voice_elevenlabs("MyVoice", ["sample1.mp3", "sample2.mp3"])
# tts_elevenlabs("Hello world!", voice_id=voice_id)

Engine 3: Coqui TTS (Local, Free, Voice Cloning)

Coqui runs entirely on your machine — no API key, no usage costs, and it supports voice cloning from a short reference audio.

from TTS.api import TTS
import torch
import time

def tts_coqui(
    text: str,
    model_name: str = "tts_models/en/ljspeech/tacotron2-DDC",
    speaker_wav: str | None = None,  # Path to reference audio for voice cloning
    output_path: str = "output.wav",
    language: str = "en"
) -> dict:
    """
    Coqui TTS - runs locally, no API key needed.

    Passing speaker_wav enables voice cloning: the model is automatically
    switched to multilingual XTTS-v2, which supports conditioning on a
    reference recording (model_name is ignored in that case).

    Returns a dict with engine, the resolved model name, synthesis latency
    in seconds (model load time excluded), cost (always 0.0), and device.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Voice cloning requires XTTS-v2; override any requested model.
    if speaker_wav:
        model_name = "tts_models/multilingual/multi-dataset/xtts_v2"

    tts = TTS(model_name=model_name, progress_bar=False).to(device)

    # Start timing here so the (potentially slow) model load is excluded.
    start = time.time()

    if speaker_wav:
        # Voice cloning: condition generation on the reference audio
        tts.tts_to_file(
            text=text,
            file_path=output_path,
            speaker_wav=speaker_wav,
            language=language
        )
    else:
        tts.tts_to_file(text=text, file_path=output_path)

    latency = time.time() - start
    return {
        "engine": "coqui",
        "model": model_name,
        "latency_s": round(latency, 2),
        "cost": 0.0,
        "device": device
    }

# Standard TTS with the default single-speaker English model
result = tts_coqui("This runs entirely on your GPU. No API key needed.")
print(result)

# Voice cloning from a 30-second sample (switches to XTTS-v2 internally)
result = tts_coqui(
    "Hello, this is my cloned voice.",
    speaker_wav="my_voice_sample.wav",
    output_path="cloned_output.wav"
)

Engine 4: Google Cloud TTS

Google's WaveNet and Neural2 voices are cheaper than OpenAI/ElevenLabs and support 40+ languages.

from google.cloud import texttospeech
import time

def tts_google(
    text: str,
    language_code: str = "en-US",
    voice_name: str = "en-US-Neural2-F",  # Neural2 voices are best quality
    speaking_rate: float = 1.0,
    pitch: float = 0.0,
    output_path: str = "output.mp3"
) -> dict:
    """
    Google Cloud TTS. Set GOOGLE_APPLICATION_CREDENTIALS env var.
    Neural2 voices cost $0.016/1K chars; Standard is $0.004/1K.

    Returns a dict with engine, voice, latency in seconds, and the
    estimated cost based on the voice tier.
    """
    client = texttospeech.TextToSpeechClient()

    synthesis_input = texttospeech.SynthesisInput(text=text)

    voice = texttospeech.VoiceSelectionParams(
        language_code=language_code,
        name=voice_name
    )

    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
        speaking_rate=speaking_rate,
        pitch=pitch,
        effects_profile_id=["headphone-class-device"]  # tuned for headphones
    )

    start = time.time()
    response = client.synthesize_speech(
        input=synthesis_input, voice=voice, audio_config=audio_config
    )
    latency = time.time() - start

    with open(output_path, "wb") as f:
        f.write(response.audio_content)

    # Pick the rate by voice tier (the original always billed the premium
    # $0.016 rate, overcharging Standard voices its own docstring prices
    # at $0.004/1K).
    rate = 0.004 if "Standard" in voice_name else 0.016
    return {"engine": "google", "voice": voice_name, "latency_s": round(latency, 2),
            "cost_usd": len(text) / 1000 * rate}

# List available voices
def list_google_voices(language_code: str = "en-US"):
    """Print every Google TTS voice available for *language_code*."""
    tts_client = texttospeech.TextToSpeechClient()
    response = tts_client.list_voices(language_code=language_code)
    for voice in response.voices:
        langs = ", ".join(voice.language_codes)
        print(f"{voice.name} | {voice.ssml_gender.name} | {langs}")

Engine 5: Azure Cognitive Services TTS

Azure's neural voices are competitive with ElevenLabs for quality and support SSML for fine-grained control.

import azure.cognitiveservices.speech as speechsdk
import time

def tts_azure(
    text: str,
    voice_name: str = "en-US-JennyNeural",
    output_path: str = "output.wav",
    subscription_key: str = None,
    region: str = None
) -> dict:
    """
    Azure Cognitive Services TTS.
    Set AZURE_SPEECH_KEY and AZURE_SPEECH_REGION env vars.

    Args:
        text: Text to synthesize.
        voice_name: Azure neural voice name.
        output_path: Destination WAV file.
        subscription_key: Overrides AZURE_SPEECH_KEY when given.
        region: Overrides AZURE_SPEECH_REGION when given; "eastus" is the
            final fallback when neither is set.
    """
    import os
    key = subscription_key or os.getenv("AZURE_SPEECH_KEY")
    # BUG FIX: the old default region="eastus" was always truthy, so
    # `region or os.getenv(...)` never consulted AZURE_SPEECH_REGION.
    # Defaulting to None lets the env var take effect, with "eastus"
    # preserved as the ultimate fallback.
    region = region or os.getenv("AZURE_SPEECH_REGION", "eastus")

    config = speechsdk.SpeechConfig(subscription=key, region=region)
    config.speech_synthesis_voice_name = voice_name

    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_path)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=config, audio_config=audio_config)

    start = time.time()
    result = synthesizer.speak_text_async(text).get()
    latency = time.time() - start

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        return {"engine": "azure", "voice": voice_name, "latency_s": round(latency, 2), "success": True}
    else:
        return {"engine": "azure", "error": str(result.cancellation_details.error_details)}

# SSML for fine-grained control
def tts_azure_ssml(
    ssml: str,
    output_path: str,
    subscription_key: str = None,
    region: str = None
) -> dict:
    """Synthesize raw SSML for pronunciation, pauses, and emphasis.

    The previous version was a stub that ignored both parameters; this one
    actually performs synthesis via speak_ssml_async. Example SSML:

        <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
          <voice name="en-US-JennyNeural">
            Welcome to <emphasis level="strong">NEPA AI</emphasis>.
            <break time="500ms"/>
            Your <prosody rate="slow">intelligent workspace</prosody> awaits.
          </voice>
        </speak>
    """
    import os
    key = subscription_key or os.getenv("AZURE_SPEECH_KEY")
    region = region or os.getenv("AZURE_SPEECH_REGION", "eastus")

    config = speechsdk.SpeechConfig(subscription=key, region=region)
    # The voice is chosen inside the SSML <voice> element, so no
    # speech_synthesis_voice_name is set here.
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_path)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=config, audio_config=audio_config)

    result = synthesizer.speak_ssml_async(ssml).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        return {"engine": "azure", "ssml": True, "success": True}
    return {"engine": "azure", "error": str(result.cancellation_details.error_details)}

Choosing the Right Engine

def choose_tts_engine(use_case: str) -> str:
    """Decision logic for engine selection."""
    # Ordered keyword -> recommendation table; the first match wins,
    # preserving the original if-chain priority (cloning > cost > quality).
    rules = [
        (("clone", "custom voice"),
         "elevenlabs (best) or coqui (free local)"),
        (("free", "no api", "local"),
         "coqui - runs on your GPU, zero cost"),
        (("bulk", "cheap", "batch"),
         "google (cheapest at $0.004/1K) or openai tts-1"),
        (("podcast", "video", "quality"),
         "elevenlabs (best quality) or azure neural"),
        (("multilingual", "international"),
         "google (40+ languages) or azure (45+ languages)"),
    ]

    normalized = use_case.lower()
    for keywords, recommendation in rules:
        if any(keyword in normalized for keyword in keywords):
            return recommendation

    return "openai tts-1-hd (best balance of quality, cost, simplicity)"

The TTS Workspace ships with all 5 engines pre-configured, SSML support, voice cloning workflows, batch narration for long-form content, and audio post-processing (normalization, noise reduction, silence trimming) — 898 lines ready to drop into any project.

→ Get TTS Workspace on the Shop