name: livekit-voice-pipeline
description: Build voice AI agents with STT-LLM-TTS pipeline, turn detection, and interruption handling in LiveKit
argument-hint: ""
allowed-tools: Read, Write, Bash(pip install, npm install), Glob, Grep, WebSearch
LiveKit Voice Pipeline
Build voice AI agents with STT-LLM-TTS: $ARGUMENTS
Expert Knowledge
You are a LiveKit voice pipeline specialist with expertise in:
- Speech-to-text (STT) model selection
- LLM integration for conversation
- Text-to-speech (TTS) voice synthesis
- Turn detection and VAD
- Interruption handling
- Latency optimization
Voice Pipeline Architecture
┌──────────────────────────────────────────────────────────────┐
│                        Voice Pipeline                        │
│                                                              │
│  Audio In → VAD → STT → Turn Detector → LLM → TTS → Audio Out│
│              │                                 │             │
│              └── Interruption Handling ────────┘             │
└──────────────────────────────────────────────────────────────┘
Pipeline Configuration
Python
from livekit.agents import Agent, AgentSession, RtcSession
from livekit.plugins import silero, noise_cancellation
class VoiceAssistant(Agent):
    """Voice agent instructed to answer briefly and conversationally."""

    def __init__(self):
        """Pass the assistant's fixed instructions to the base Agent."""
        # Built line by line so the code can be indented without leaking
        # indentation into the instruction text.
        prompt = (
            "You are a helpful voice assistant.\n"
            "Keep responses concise (2-3 sentences).\n"
            "Speak naturally and conversationally."
        )
        super().__init__(instructions=prompt)
@RtcSession.factory
async def create_session(session: AgentSession):
    """Start `session` with a VoiceAssistant and every pipeline stage named.

    The stages passed to `session.start` are speech-to-text, language model,
    text-to-speech, voice activity detection, noise cancellation and the
    turn-detection mode.
    """
    assistant = VoiceAssistant()
    pipeline = {
        "stt": "deepgram/nova-3",  # Speech-to-Text
        "llm": "openai/gpt-4o-mini",  # Language Model
        "tts": "cartesia/sonic",  # Text-to-Speech
        "vad": silero.VAD.load(),  # Voice Activity Detection
        "noise_cancellation": noise_cancellation.BVC(),  # Noise Cancellation
        "turn_detection": "server_vad",  # Turn Detection
    }
    await session.start(agent=assistant, **pipeline)
STT Model Options
LiveKit Inference Models
# Each assignment below is an alternative value for the `stt` argument of
# `session.start`; use one, not all of them. The strings appear to follow a
# "provider/model[:language]" pattern -- confirm against the provider list.
# AssemblyAI
stt="assemblyai/universal-streaming:en" # English
stt="assemblyai/universal-streaming-multilingual" # 6 languages
# Deepgram
stt="deepgram/nova-3" # Multilingual, 8 languages
stt="deepgram/nova-3:en" # English only
stt="deepgram/nova-2" # 33 languages
stt="deepgram/flux" # English, fast
# Cartesia
stt="cartesia/ink-whisper" # 100 languages
# ElevenLabs
stt="elevenlabs/scribe-v2-realtime" # 41 languages
# Auto-select best for language ("auto:" followed by a language code)
stt="auto:es" # Spanish
stt="auto:ja" # Japanese
Plugin-Based STT
from livekit.plugins.deepgram import STT as DeepgramSTT
# Plugin form of the STT stage: a configured Deepgram instance instead of a
# model string.
stt = DeepgramSTT(
    endpointing=True,
    keywords=["LiveKit", "WebRTC"],  # Boost recognition
    language="en",
    model="nova-3",
)
TTS Model Options
LiveKit Inference Models
# Each assignment below is an alternative value for the `tts` argument of
# `session.start`; use one. A suffix after ":" selects a voice; "voice-id"
# is a placeholder to replace with a real ID.
# Cartesia Sonic
tts="cartesia/sonic"
tts="cartesia/sonic:voice-id"
# ElevenLabs
tts="elevenlabs/flash-v2.5"
tts="elevenlabs/flash-v2.5:voice-id"
# OpenAI
tts="openai/tts-1"
tts="openai/tts-1:alloy"
tts="openai/tts-1:nova"
Plugin-Based TTS
from livekit.plugins.cartesia import TTS as CartesiaTTS
# Plugin form of the TTS stage: Cartesia Sonic with an explicit voice,
# speaking speed and emotion control.
tts = CartesiaTTS(
    emotion=["positivity:high"],  # Emotion control
    speed=1.0,
    voice="a0e99841-438c-4a64-b679-ae501e7d6091",  # Voice ID
    model="sonic",
)
# ElevenLabs with voice cloning
from livekit.plugins.elevenlabs import TTS as ElevenLabsTTS
# ElevenLabs TTS; replace the placeholder voice value with the ID of your
# cloned voice.
tts = ElevenLabsTTS(
    similarity_boost=0.75,
    stability=0.5,
    model="eleven_flash_v2_5",
    voice="your-cloned-voice-id",
)
LLM Configuration
from livekit.plugins.openai import LLM as OpenAILLM
# Plugin form: an explicitly configured OpenAI LLM.
llm = OpenAILLM(
    max_tokens=150,  # Keep responses short for voice
    temperature=0.7,
    model="gpt-4o-mini",
)

# Inference shorthand: a model string instead of a plugin instance.
# Each assignment below is an alternative; use one.
llm = "openai/gpt-4o-mini"
llm = "anthropic/claude-3-5-sonnet"
llm = "google/gemini-2.5-flash"
Turn Detection Modes
Server VAD (Recommended)
await session.start(
agent=VoiceAssistant(),
stt="deepgram/nova-3",
llm="openai/gpt-4o-mini",
tts="cartesia/sonic",
vad=silero.VAD.load(),
turn_detection="server_vad", # Uses VAD for turn detection
)
Turn Detector Model
from livekit.plugins import turn_detector
await session.start(
agent=VoiceAssistant(),
stt="deepgram/nova-3",
llm="openai/gpt-4o-mini",
tts="cartesia/sonic",
vad=silero.VAD.load(),
turn_detection=turn_detector.EOUModel.load(), # ML-based
)
STT Endpointing
await session.start(
agent=VoiceAssistant(),
stt="deepgram/nova-3",
llm="openai/gpt-4o-mini",
tts="cartesia/sonic",
vad=silero.VAD.load(), # Still needed for interruptions
turn_detection="stt", # Uses STT's endpointing
)
VAD Configuration
from livekit.plugins.silero import VAD
# Silero VAD with every tuning knob spelled out.
vad = VAD.load(
    # Detection sensitivity and input format
    activation_threshold=0.5,  # Speech detection sensitivity
    sample_rate=16000,
    # Timing
    min_speech_duration=0.1,  # Minimum speech length
    min_silence_duration=0.3,  # Silence before end of speech
    padding_duration=0.1,  # Padding around speech
)
Interruption Handling
from livekit.agents import AgentSession
@RtcSession.factory
async def create_session(session: AgentSession):
    """Start the session with explicit interruption settings."""
    assistant = VoiceAssistant()
    voice_activity = silero.VAD.load()
    # Interruption settings, kept apart from the model selection.
    interruption = {
        "min_endpointing_delay": 0.5,  # 500ms silence = end of turn
        "allow_interruptions": True,  # Let user interrupt agent
        "interrupt_min_words": 3,  # Min words before allowing interrupt
    }
    await session.start(
        agent=assistant,
        stt="deepgram/nova-3",
        llm="openai/gpt-4o-mini",
        tts="cartesia/sonic",
        vad=voice_activity,
        **interruption,
    )
Manual Interruption Control
class InterruptibleAssistant(Agent):
    """Agent that interrupts its own session whenever the user starts talking."""

    async def on_user_speech_started(self):
        """Interrupt the session when user speech begins."""
        active_session = self.session
        # Intended to stop the TTS playback currently in progress.
        active_session.interrupt()

    async def on_interrupted(self):
        """Report that the agent's speech was interrupted."""
        print("User interrupted me")
        # Extension point: resume or acknowledge the interruption here.
Greeting on Connect
class GreetingAssistant(Agent):
    """Agent that asks the session for a greeting reply in `on_enter`."""

    def __init__(self):
        """Configure the base Agent with a one-line instruction."""
        super().__init__(instructions="You are a helpful assistant.")

    async def on_enter(self):
        """Greet the user by requesting a reply with the greeting text."""
        greeting = "Hello! I'm your AI assistant. How can I help you today?"
        await self.session.generate_reply(greeting)
Streaming Response
class StreamingAssistant(Agent):
    """Agent whose user turns are answered by the default streaming path."""

    async def on_user_turn(self, message: str):
        """Handle one user message by delegating to the default behavior.

        The earlier version first iterated `self.session.llm.stream(message)`
        and discarded every chunk, then called the default handler as well.
        That ran the LLM twice per turn and delayed the reply by a full extra
        generation, while the discarded stream never reached TTS. The default
        handler is described as already covering the LLM and TTS steps, so it
        is the only call needed.

        Args:
            message: Text of the user's turn.
        """
        # NOTE(review): if custom per-chunk handling is ever needed, consume
        # the LLM stream here *instead of* calling super(), not in addition.
        await super().on_user_turn(message)
Multi-Language Support
@RtcSession.factory
async def create_session(session: AgentSession):
    """Start a session whose STT model string requests multilingual mode."""
    await session.start(
        agent=MultilingualAssistant(),
        # Multilingual mode; intended to detect language from the first utterance
        stt="deepgram/nova-3:multi",
        llm="openai/gpt-4o",  # Multilingual LLM
        tts="cartesia/sonic",  # Will speak detected language
    )
class MultilingualAssistant(Agent):
    """Agent instructed to reply in the language the user speaks."""

    def __init__(self):
        """Pass the multilingual instructions to the base Agent."""
        prompt = (
            "You are a multilingual assistant.\n"
            "Respond in the same language the user speaks.\n"
            "Support English, Spanish, French, German, Japanese."
        )
        super().__init__(instructions=prompt)
Latency Optimization
@RtcSession.factory
async def create_session(session: AgentSession):
    """Start a session tuned for low response latency."""
    assistant = FastAssistant()
    low_latency = {
        "stt": "deepgram/nova-3",  # Fast STT: low latency
        "llm": "openai/gpt-4o-mini",  # Fast LLM: fastest OpenAI
        "tts": "cartesia/sonic",  # Fast TTS: real-time streaming
        "min_endpointing_delay": 0.3,  # Aggressive turn detection: 300ms
        "noise_cancellation": noise_cancellation.BVC(),
    }
    await session.start(agent=assistant, **low_latency)
class FastAssistant(Agent):
    """Agent instructed to give one-sentence answers without filler."""

    def __init__(self):
        """Pass the brevity instructions to the base Agent."""
        prompt = (
            "Be extremely concise.\n"
            "One sentence responses only.\n"
            "Never use filler words."
        )
        super().__init__(instructions=prompt)
Metrics and Monitoring
@RtcSession.factory
async def create_session(session: AgentSession):
    """Register a latency printer for "metrics" events, then start the session."""

    @session.on("metrics")
    def on_metrics(metrics):
        """Print the per-stage and total latency fields of one metrics event."""
        print(f"STT latency: {metrics.stt_latency_ms}ms")
        print(f"LLM latency: {metrics.llm_latency_ms}ms")
        print(f"TTS latency: {metrics.tts_latency_ms}ms")
        print(f"Total latency: {metrics.total_latency_ms}ms")

    # The handler is registered before start().
    assistant = VoiceAssistant()
    await session.start(
        agent=assistant,
        stt="deepgram/nova-3",
        llm="openai/gpt-4o-mini",
        tts="cartesia/sonic",
    )
Complete Voice Agent
import logging
from dotenv import load_dotenv
from livekit.agents import Agent, AgentSession, RtcSession, AgentServer
from livekit.plugins import silero, noise_cancellation
# Load environment variables from the local env file.
load_dotenv(dotenv_path=".env.local")
# Configure root logging at INFO so the logging.info calls below are emitted.
logging.basicConfig(level=logging.INFO)
class VoiceAssistant(Agent):
    """Friendly voice agent that greets on entry and logs each user turn."""

    def __init__(self):
        """Pass the assistant persona to the base Agent."""
        persona = (
            "You are a helpful, friendly voice assistant.\n"
            "- Keep responses concise (1-2 sentences)\n"
            "- Speak naturally and conversationally\n"
            "- Ask clarifying questions when needed\n"
            "- Be patient and supportive"
        )
        super().__init__(instructions=persona)

    async def on_enter(self):
        """Greet the user by requesting a reply with the greeting text."""
        greeting = "Hi there! I'm ready to help. What can I do for you?"
        await self.session.generate_reply(greeting)

    async def on_user_turn(self, message: str):
        """Log the user's message, then defer to the default handling."""
        logging.info(f"User: {message}")
        # The default behavior takes care of the LLM and TTS steps.
        await super().on_user_turn(message)

    async def on_exit(self):
        """Log that the user disconnected."""
        logging.info("User disconnected")
@RtcSession.factory
async def create_session(session: AgentSession):
    """Attach speech-event loggers, then start the complete voice pipeline."""

    @session.on("user_speech_started")
    def on_speech_start():
        """Log the start of user speech."""
        logging.info("User started speaking")

    @session.on("agent_speech_ended")
    def on_agent_end():
        """Log the end of an agent response."""
        logging.info("Agent finished response")

    assistant = VoiceAssistant()
    settings = {
        # Models
        "stt": "deepgram/nova-3",
        "llm": "openai/gpt-4o-mini",
        "tts": "cartesia/sonic",
        # Audio front end
        "vad": silero.VAD.load(),
        "noise_cancellation": noise_cancellation.BVC(),
        # Turn taking
        "turn_detection": "server_vad",
        "min_endpointing_delay": 0.5,
        "allow_interruptions": True,
    }
    await session.start(agent=assistant, **settings)
if __name__ == "__main__":
    # Script entry point: build the server around the session factory and run it.
    server = AgentServer(create_session)
    server.run()
Best Practices
- Optimize for latency: Use fast models (Nova-3, GPT-4o-mini, Sonic)
- Keep instructions short: Faster LLM responses
- Use noise cancellation: Improves STT accuracy
- Test turn detection: Adjust timing for your use case
- Handle interruptions: Natural conversation flow
Deliverables
For: $ARGUMENTS
Provide:
- STT model selection
- TTS voice configuration
- LLM setup with instructions
- Turn detection configuration
- Interruption handling
- Latency optimization