Real-Time TTS with WebSockets

Why Use WebSockets for TTS?

WebSockets provide a continuous audio stream flowing directly to the playback device without saving files to disk. This approach is essential for voice agents and conversational AI that require minimal latency and natural-sounding speech.

Key benefits include low latency (playback can begin as soon as the first audio chunk arrives), continuous streaming (a persistent connection keeps audio flowing without per-request overhead), and efficient processing (audio is streamed directly to the playback device).

WebSocket Implementation Examples

The following Python example demonstrates how to implement real-time TTS using Deepgram’s WebSocket API:

import time

import numpy as np
import sounddevice as sd
from deepgram import (
    DeepgramClient,
    SpeakOptions,
    SpeakWebSocketEvents,
)

TTS_TEXT = "Hello, this is a text to speech example using Deepgram."

# Single source of truth for the sample rate: the value requested from
# Deepgram and the value the playback stream is opened with must agree,
# otherwise the audio plays at the wrong speed/pitch.
SAMPLE_RATE = 48000

# Fixed time (seconds) to keep the connection open so audio can arrive and
# play. This is a simple example-level wait, not a completion signal.
PLAYBACK_WAIT_SECONDS = 5


def main():
    """Send TTS_TEXT to Deepgram over a WebSocket and play the audio live.

    Audio chunks are written to one persistent output stream as they arrive,
    so nothing is saved to disk. Any exception is caught and reported on
    stdout; the function returns None in every case.
    """
    try:
        # Create a Deepgram client using the API key from environment variables
        deepgram = DeepgramClient()

        # Create a websocket connection to Deepgram
        dg_connection = deepgram.speak.websocket.v("1")

        # Open ONE output stream for the whole session. Starting a separate
        # playback per chunk (sd.play + sd.wait) blocks the receive callback
        # for each chunk and can leave audible gaps between chunks.
        # channels=1: the chunks are treated as mono, one int16 per frame.
        # The context manager starts the stream on entry and stops/closes it
        # on exit, including when an exception is raised.
        with sd.OutputStream(
            samplerate=SAMPLE_RATE, channels=1, dtype="int16"
        ) as speaker:

            # NOTE: handler parameter names (open, data, close) are left
            # exactly as in the SDK examples in case the SDK passes these
            # arguments by keyword.
            def on_open(self, open, **kwargs):
                print(f"Connection opened: {open}")

            def on_binary_data(self, data, **kwargs):
                print("Received audio chunk")
                # linear16 is raw signed 16-bit PCM, so the bytes can be
                # viewed directly as int16 samples without copying.
                samples = np.frombuffer(data, dtype=np.int16)
                # Queue the chunk on the already-open stream.
                speaker.write(samples)

            def on_close(self, close, **kwargs):
                print(f"Connection closed: {close}")

            # Set up event handlers
            dg_connection.on(SpeakWebSocketEvents.Open, on_open)
            dg_connection.on(SpeakWebSocketEvents.AudioData, on_binary_data)
            dg_connection.on(SpeakWebSocketEvents.Close, on_close)

            # Configure audio options - linear16 is recommended for
            # real-time applications
            options = SpeakOptions(
                model="aura-2-thalia-en",
                encoding="linear16",
                sample_rate=SAMPLE_RATE,
            )

            # Start the connection
            if dg_connection.start(options) is False:
                print("Failed to start connection")
                return

            try:
                # Send text to be converted to speech
                dg_connection.send_text(TTS_TEXT)
                dg_connection.flush()

                # Allow time for the audio to arrive and play
                time.sleep(PLAYBACK_WAIT_SECONDS)
            finally:
                # Clean up: always close the connection once it has been
                # started, even if sending or waiting raised.
                dg_connection.finish()

            print("TTS stream completed")

    except Exception as e:
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()

For optimal text handling, see our guide on Text Chunking for TTS.