Voice Cloning & TTS: 5 Engines Compared (OpenAI, ElevenLabs, Coqui, Google, Azure)
Hands-on comparison of OpenAI TTS, ElevenLabs, Coqui AI (local), Google Cloud TTS, and Azure Cognitive Services. Latency, quality, cost, and Python code for each.
Five TTS engines. Five completely different tradeoffs. This comparison covers real API calls, latency benchmarks, cost per word, and voice cloning capabilities — so you can pick the right engine for the right task.
Quick Comparison Matrix
| Engine | Latency | Quality | Voice Clone | Cost/1K chars | API Key | |---|---|---|---|---|---| | OpenAI TTS | ~800ms | ⭐⭐⭐⭐ | ❌ | $0.015 | Required | | ElevenLabs | ~400ms | ⭐⭐⭐⭐⭐ | ✅ | $0.30 (Starter) | Required | | Coqui TTS | ~300ms (local) | ⭐⭐⭐⭐ | ✅ | Free | None | | Google TTS | ~200ms | ⭐⭐⭐ | ❌ | $0.004 (Standard) | Required | | Azure TTS | ~250ms | ⭐⭐⭐⭐ | ✅ | $0.016 | Required |
Setup
pip install openai elevenlabs TTS google-cloud-texttospeech azure-cognitiveservices-speech pydub
Engine 1: OpenAI TTS
OpenAI's TTS API is the easiest to integrate and produces natural-sounding speech. Six voices: alloy, echo, fable, onyx, nova, shimmer.
from openai import OpenAI
import time
# Module-level client reused by tts_openai below.
# NOTE(review): never commit a real key — prefer the env-var route mentioned
# in the trailing comment (OpenAI() picks up OPENAI_API_KEY automatically).
client = OpenAI(api_key="your-key-here") # Or set OPENAI_API_KEY env var
def tts_openai(
    text: str,
    voice: str = "nova",  # alloy, echo, fable, onyx, nova, shimmer
    model: str = "tts-1",  # tts-1 (fast) or tts-1-hd (higher quality)
    output_path: str = "output.mp3",
    speed: float = 1.0  # 0.25 to 4.0
) -> dict:
    """Synthesize *text* with OpenAI TTS and write the audio to *output_path*.

    No voice cloning, but high quality out of the box. Returns a summary dict
    with engine name, voice, model, wall-clock latency, character count, and
    estimated cost in USD.
    """
    started_at = time.time()
    speech = client.audio.speech.create(
        model=model,
        voice=voice,
        input=text,
        speed=speed,
    )
    speech.stream_to_file(output_path)
    elapsed = time.time() - started_at
    # tts-1 bills at $0.015 per 1K chars; tts-1-hd (any non-tts-1 model) at $0.030.
    per_thousand_usd = 0.015 if model == "tts-1" else 0.030
    return {
        "engine": "openai",
        "voice": voice,
        "model": model,
        "latency_s": round(elapsed, 2),
        "chars": len(text),
        "cost_usd": len(text) / 1000 * per_thousand_usd,
    }
# Example: HD model with the default speaking speed.
result = tts_openai(
    "The quick brown fox jumped over the lazy dog.",
    voice="nova",
    model="tts-1-hd"
)
print(result)
# {'engine': 'openai', 'voice': 'nova', 'model': 'tts-1-hd', 'latency_s': 0.82, 'chars': 45, 'cost_usd': 0.00135}
Engine 2: ElevenLabs (Best Quality + Voice Cloning)
ElevenLabs is the gold standard for voice quality and the only API with reliable real-time voice cloning from a short audio sample.
from elevenlabs.client import ElevenLabs
from elevenlabs import VoiceSettings
import time
# Module-level client shared by tts_elevenlabs and clone_voice_elevenlabs.
# NOTE(review): placeholder key shown for the tutorial — load from an
# environment variable or secrets manager in real code.
client_el = ElevenLabs(api_key="your-elevenlabs-key")
def tts_elevenlabs(
    text: str,
    voice_id: str = "21m00Tcm4TlvDq8ikWAM",  # Rachel - default
    stability: float = 0.5,
    similarity_boost: float = 0.75,
    style: float = 0.0,
    output_path: str = "output.mp3"
) -> dict:
    """ElevenLabs TTS — highest quality, supports voice cloning.

    Streams the generated audio chunks straight to *output_path* and returns
    a dict with engine name, wall-clock latency, and character count.
    """
    t0 = time.time()
    chunk_stream = client_el.generate(
        text=text,
        voice=voice_id,
        model="eleven_multilingual_v2",  # or eleven_turbo_v2 for speed
        voice_settings=VoiceSettings(
            stability=stability,
            similarity_boost=similarity_boost,
            style=style,
            use_speaker_boost=True,
        ),
    )
    # writelines consumes the chunk iterator, writing each bytes chunk in turn.
    with open(output_path, "wb") as out:
        out.writelines(chunk_stream)
    took = time.time() - t0
    return {"engine": "elevenlabs", "latency_s": round(took, 2), "chars": len(text)}
# Clone a voice from a sample audio file
def clone_voice_elevenlabs(
    name: str,
    sample_files: list,  # List of .mp3/.wav paths (30s+ recommended)
    description: str = ""
) -> str:
    """Clone a voice from audio samples. Returns voice_id.

    Args:
        name: Display name for the cloned voice.
        sample_files: Paths to .mp3/.wav samples (30s+ of clean speech recommended).
        description: Optional description stored alongside the voice.

    Returns:
        The ElevenLabs voice_id of the newly created voice.
    """
    from contextlib import ExitStack

    # BUG FIX: the original only closed the sample files on the success path,
    # leaking file handles whenever clone() raised. ExitStack closes every
    # opened file no matter how we leave the block.
    with ExitStack() as stack:
        files = [stack.enter_context(open(f, "rb")) for f in sample_files]
        voice = client_el.clone(
            name=name,
            description=description,
            files=files
        )
    print(f"Voice cloned: {voice.voice_id}")
    return voice.voice_id
# Clone your voice
# voice_id = clone_voice_elevenlabs("MyVoice", ["sample1.mp3", "sample2.mp3"])
# tts_elevenlabs("Hello world!", voice_id=voice_id)
Engine 3: Coqui TTS (Local, Free, Voice Cloning)
Coqui runs entirely on your machine — no API key, no usage costs, and it supports voice cloning from a short reference audio.
from TTS.api import TTS
import torch
import time
def tts_coqui(
    text: str,
    model_name: str = "tts_models/en/ljspeech/tacotron2-DDC",
    speaker_wav: str = None,  # Path to reference audio for voice cloning
    output_path: str = "output.wav",
    language: str = "en"
) -> dict:
    """
    Coqui TTS - runs locally, no API key needed.
    Use speaker_wav for voice cloning (XTTS model required).
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Cloning requires the multilingual XTTS-v2 checkpoint, so a supplied
    # reference sample overrides whatever model_name was requested.
    if speaker_wav:
        model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    synthesizer = TTS(model_name=model_name, progress_bar=False).to(device)
    t0 = time.time()
    synth_kwargs = {"text": text, "file_path": output_path}
    if speaker_wav:
        # Condition generation on the reference audio (voice cloning path).
        synth_kwargs["speaker_wav"] = speaker_wav
        synth_kwargs["language"] = language
    synthesizer.tts_to_file(**synth_kwargs)
    elapsed = time.time() - t0
    return {
        "engine": "coqui",
        "model": model_name,
        "latency_s": round(elapsed, 2),
        "cost": 0.0,
        "device": device,
    }
# Standard TTS (downloads the Tacotron2 checkpoint on first run)
result = tts_coqui("This runs entirely on your GPU. No API key needed.")
print(result)
# Voice cloning from a 30-second sample — note this forces the XTTS-v2 model
result = tts_coqui(
    "Hello, this is my cloned voice.",
    speaker_wav="my_voice_sample.wav",
    output_path="cloned_output.wav"
)
Engine 4: Google Cloud TTS
Google's WaveNet and Neural2 voices are cheaper than OpenAI/ElevenLabs and support 40+ languages.
from google.cloud import texttospeech
import time
def tts_google(
    text: str,
    language_code: str = "en-US",
    voice_name: str = "en-US-Neural2-F",  # Neural2 voices are best quality
    speaking_rate: float = 1.0,
    pitch: float = 0.0,
    output_path: str = "output.mp3"
) -> dict:
    """
    Google Cloud TTS. Set GOOGLE_APPLICATION_CREDENTIALS env var.
    Neural2 voices cost $0.016/1K chars; Standard is $0.004/1K.

    Returns a dict with engine, voice, latency, and estimated cost — the cost
    now reflects the tier of the voice actually used (the original always
    billed at the premium rate, even for Standard voices).
    """
    client = texttospeech.TextToSpeechClient()
    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice = texttospeech.VoiceSelectionParams(
        language_code=language_code,
        name=voice_name
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
        speaking_rate=speaking_rate,
        pitch=pitch,
        effects_profile_id=["headphone-class-device"]
    )
    start = time.time()
    response = client.synthesize_speech(
        input=synthesis_input, voice=voice, audio_config=audio_config
    )
    latency = time.time() - start
    with open(output_path, "wb") as f:
        f.write(response.audio_content)
    # Standard voices are $0.004/1K chars; Neural2/WaveNet are premium $0.016/1K.
    rate = 0.004 if "Standard" in voice_name else 0.016
    return {"engine": "google", "voice": voice_name, "latency_s": round(latency, 2),
            "cost_usd": len(text) / 1000 * rate}
# List available voices
def list_google_voices(language_code: str = "en-US"):
    """Print every Google TTS voice available for *language_code*."""
    tts_client = texttospeech.TextToSpeechClient()
    catalog = tts_client.list_voices(language_code=language_code)
    for entry in catalog.voices:
        supported = ", ".join(entry.language_codes)
        print(f"{entry.name} | {entry.ssml_gender.name} | {supported}")
Engine 5: Azure Cognitive Services TTS
Azure's neural voices are competitive with ElevenLabs for quality and support SSML for fine-grained control.
import azure.cognitiveservices.speech as speechsdk
import time
def tts_azure(
    text: str,
    voice_name: str = "en-US-JennyNeural",
    output_path: str = "output.wav",
    subscription_key: str = None,
    region: str = None
) -> dict:
    """
    Azure Cognitive Services TTS.
    Set AZURE_SPEECH_KEY and AZURE_SPEECH_REGION env vars.

    Args:
        text: Text to synthesize.
        voice_name: Azure neural voice name.
        output_path: Destination .wav file.
        subscription_key: Azure Speech key; falls back to AZURE_SPEECH_KEY.
        region: Azure region; falls back to AZURE_SPEECH_REGION, then "eastus".
    """
    import os
    key = subscription_key or os.getenv("AZURE_SPEECH_KEY")
    # BUG FIX: region previously defaulted to the truthy string "eastus", so
    # `region or os.getenv(...)` never consulted AZURE_SPEECH_REGION despite
    # the docstring. Defaulting to None restores the documented fallback chain.
    region = region or os.getenv("AZURE_SPEECH_REGION", "eastus")
    config = speechsdk.SpeechConfig(subscription=key, region=region)
    config.speech_synthesis_voice_name = voice_name
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_path)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=config, audio_config=audio_config)
    start = time.time()
    result = synthesizer.speak_text_async(text).get()
    latency = time.time() - start
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        return {"engine": "azure", "voice": voice_name, "latency_s": round(latency, 2), "success": True}
    # cancellation_details is only populated for canceled requests; guard so
    # other failure reasons don't raise an AttributeError while reporting.
    details = getattr(result, "cancellation_details", None)
    error = details.error_details if details else result.reason
    return {"engine": "azure", "error": str(error)}
# SSML for fine-grained control
def tts_azure_ssml(ssml: str, output_path: str):
    """Synthesize an SSML document with Azure TTS for pronunciation, pauses, and emphasis.

    BUG FIX: the original body only defined an unused example string and never
    called the SDK — both parameters were ignored. This version actually
    renders *ssml* to *output_path* and returns the SDK result.

    Example SSML:
        <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
          <voice name="en-US-JennyNeural">
            Welcome to <emphasis level="strong">NEPA AI</emphasis>.
            <break time="500ms"/>
            Your <prosody rate="slow">intelligent workspace</prosody> awaits.
          </voice>
        </speak>
    """
    import os
    config = speechsdk.SpeechConfig(
        subscription=os.getenv("AZURE_SPEECH_KEY"),
        region=os.getenv("AZURE_SPEECH_REGION", "eastus"),
    )
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_path)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=config, audio_config=audio_config)
    # speak_ssml_async (not speak_text_async) interprets SSML markup.
    return synthesizer.speak_ssml_async(ssml).get()
Choosing the Right Engine
def choose_tts_engine(use_case: str) -> str:
    """Recommend a TTS engine for a free-text use-case description.

    Matches case-insensitive keywords in priority order; the first rule whose
    keyword appears in the description wins.
    """
    needle = use_case.lower()
    rules = (
        (("clone", "custom voice"), "elevenlabs (best) or coqui (free local)"),
        (("free", "no api", "local"), "coqui - runs on your GPU, zero cost"),
        (("bulk", "cheap", "batch"), "google (cheapest at $0.004/1K) or openai tts-1"),
        (("podcast", "video", "quality"), "elevenlabs (best quality) or azure neural"),
        (("multilingual", "international"), "google (40+ languages) or azure (45+ languages)"),
    )
    for keywords, recommendation in rules:
        if any(kw in needle for kw in keywords):
            return recommendation
    return "openai tts-1-hd (best balance of quality, cost, simplicity)"
The TTS Workspace ships with all 5 engines pre-configured, SSML support, voice cloning workflows, batch narration for long-form content, and audio post-processing (normalization, noise reduction, silence trimming) — 898 lines ready to drop into any project.