Text To Speech Streaming
An overview of the Deepgram Python SDK and Deepgram streaming text-to-speech.
The Deepgram Speak Clients
allow you to generate human-like audio from text.
This SDK supports both the Threaded and Async/Await Clients as described in the Threaded and Async IO Task Support section. The code blocks contain a tab for Threaded and Async to show examples for `websocket` versus `asyncwebsocket`, respectively. The difference between Threaded and Async is subtle.
Installing the SDK
# Install the Deepgram Python SDK
# https://github.com/deepgram/deepgram-python-sdk
pip install deepgram-sdk==3.*
Make a Deepgram Text-to-Speech Request
The Deepgram Speak Clients allow you to create audio generated from provided text and stream the audio bytes through your `AudioData` callback function. You can also subscribe to other important events, such as:
- `Metadata` - obtain information about the audio stream.
- `Flushed` - receive an acknowledgment when all the text has been received by Deepgram to convert to audio. This is the result of sending the `Flush` message via the `flush()` function.
- `Cleared` - receive an acknowledgment that the text buffer has been cleared from the Deepgram server. This is the result of sending the `Clear` message via the `clear()` function.
from deepgram.utils import verboselogs
from deepgram import (
DeepgramClient,
DeepgramClientOptions,
SpeakWebSocketEvents,
SpeakWSOptions,
)
# Text that the example sends to Deepgram to be converted to speech.
TTS_TEXT = "Hello, this is a text to speech example using Deepgram."

# One-shot flag: the binary-data handler prints its hint only while this is
# True and then resets it. (A `global` statement is not needed at module
# scope; names assigned here are already module globals.)
warning_notice = True
def main():
    """Stream one text-to-speech request using the threaded Speak WebSocket client.

    Creates a Deepgram client, registers a handler for every Speak WebSocket
    event, starts the connection, sends ``TTS_TEXT``, flushes it, waits for
    completion and then waits for the user to press Enter before closing.

    Once the connection has started, it is always closed via ``finish()``,
    even if sending, flushing, waiting or reading stdin raises.

    Returns:
        None. Any ``Exception`` is reported on stdout rather than propagated.
    """
    try:
        # create a deepgram client
        config: DeepgramClientOptions = DeepgramClientOptions(
            options={"speaker_playback": "true"},
        )
        # NOTE(review): an empty API key presumably makes the SDK fall back to
        # the DEEPGRAM_API_KEY environment variable - confirm against the SDK.
        deepgram: DeepgramClient = DeepgramClient("", config)

        # Create a websocket connection to Deepgram
        dg_connection = deepgram.speak.websocket.v("1")

        # NOTE(review): handler parameter names (including `open`, which
        # shadows the builtin) are kept as-is because the SDK appears to pass
        # the event payload by keyword - verify before renaming.
        def on_open(self, open, **kwargs):
            print(f"\n\n{open}\n\n")

        def on_binary_data(self, data, **kwargs):
            # Print the usage hint only for the first audio chunk.
            global warning_notice
            if warning_notice:
                print("Received binary data")
                print("You can do something with the binary data here")
                print("OR")
                print(
                    "If you want to simply play the audio, set speaker_playback to true in the options for DeepgramClientOptions"
                )
                warning_notice = False

        def on_metadata(self, metadata, **kwargs):
            print(f"\n\n{metadata}\n\n")

        def on_flush(self, flushed, **kwargs):
            print(f"\n\n{flushed}\n\n")

        def on_clear(self, clear, **kwargs):
            print(f"\n\n{clear}\n\n")

        def on_close(self, close, **kwargs):
            print(f"\n\n{close}\n\n")

        def on_warning(self, warning, **kwargs):
            print(f"\n\n{warning}\n\n")

        def on_error(self, error, **kwargs):
            print(f"\n\n{error}\n\n")

        def on_unhandled(self, unhandled, **kwargs):
            print(f"\n\n{unhandled}\n\n")

        dg_connection.on(SpeakWebSocketEvents.Open, on_open)
        dg_connection.on(SpeakWebSocketEvents.AudioData, on_binary_data)
        dg_connection.on(SpeakWebSocketEvents.Metadata, on_metadata)
        dg_connection.on(SpeakWebSocketEvents.Flushed, on_flush)
        dg_connection.on(SpeakWebSocketEvents.Cleared, on_clear)
        dg_connection.on(SpeakWebSocketEvents.Close, on_close)
        dg_connection.on(SpeakWebSocketEvents.Error, on_error)
        dg_connection.on(SpeakWebSocketEvents.Warning, on_warning)
        dg_connection.on(SpeakWebSocketEvents.Unhandled, on_unhandled)

        # connect to websocket
        options = SpeakWSOptions(
            model="aura-asteria-en",
            encoding="linear16",
            sample_rate=16000,
        )

        print("\n\nPress Enter to stop...\n\n")
        if dg_connection.start(options) is False:
            print("Failed to start connection")
            return

        try:
            # send the text to Deepgram
            dg_connection.send_text(TTS_TEXT)

            # if auto_flush_speak_delta is not used, you must flush the connection by calling flush()
            dg_connection.flush()

            # Wait for the SDK to report that the request has completed
            dg_connection.wait_for_complete()

            print("\n\nPress Enter to stop...\n\n")
            input()
        finally:
            # Close the connection even when one of the calls above raises
            # (including Ctrl+C while waiting on input()).
            dg_connection.finish()

        print("Finished")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")


if __name__ == "__main__":
    main()
import asyncio
from deepgram import (
DeepgramClient,
DeepgramClientOptions,
SpeakWebSocketEvents,
SpeakWSOptions,
)
# Text that the example sends to Deepgram to be converted to speech.
TTS_TEXT = "Hello, this is a text to speech example using Deepgram."

# One-shot flag: the binary-data handler prints its hint only while this is
# True and then resets it. (A `global` statement is not needed at module
# scope; names assigned here are already module globals.)
warning_notice = True
async def main():
    """Stream one text-to-speech request using the async Speak WebSocket client.

    Creates a Deepgram client, registers a coroutine handler for every Speak
    WebSocket event, starts the connection, sends ``TTS_TEXT``, flushes it and
    waits for completion before closing.

    Once the connection has started, it is always closed via ``finish()``,
    even if sending, flushing or waiting raises. No "Press Enter" prompt is
    printed because this example never reads from stdin.

    Returns:
        None. Any ``Exception`` is reported on stdout rather than propagated.
    """
    try:
        # create a deepgram client
        config: DeepgramClientOptions = DeepgramClientOptions(
            options={"speaker_playback": "true"}
        )
        # NOTE(review): an empty API key presumably makes the SDK fall back to
        # the DEEPGRAM_API_KEY environment variable - confirm against the SDK.
        deepgram: DeepgramClient = DeepgramClient("", config)

        # Create a websocket connection to Deepgram
        dg_connection = deepgram.speak.asyncwebsocket.v("1")

        # NOTE(review): handler parameter names (including `open`, which
        # shadows the builtin) are kept as-is because the SDK appears to pass
        # the event payload by keyword - verify before renaming.
        async def on_open(self, open, **kwargs):
            print(f"\n\n{open}\n\n")

        async def on_binary_data(self, data, **kwargs):
            # Print the usage hint only for the first audio chunk.
            global warning_notice
            if warning_notice:
                print("Received binary data")
                print("You can do something with the binary data here")
                print("OR")
                print(
                    "If you want to simply play the audio, set speaker_playback to true in the options for DeepgramClientOptions"
                )
                warning_notice = False

        async def on_metadata(self, metadata, **kwargs):
            print(f"\n\n{metadata}\n\n")

        async def on_flush(self, flushed, **kwargs):
            print(f"\n\n{flushed}\n\n")

        async def on_clear(self, clear, **kwargs):
            print(f"\n\n{clear}\n\n")

        async def on_close(self, close, **kwargs):
            print(f"\n\n{close}\n\n")

        async def on_warning(self, warning, **kwargs):
            print(f"\n\n{warning}\n\n")

        async def on_error(self, error, **kwargs):
            print(f"\n\n{error}\n\n")

        async def on_unhandled(self, unhandled, **kwargs):
            print(f"\n\n{unhandled}\n\n")

        dg_connection.on(SpeakWebSocketEvents.Open, on_open)
        dg_connection.on(SpeakWebSocketEvents.AudioData, on_binary_data)
        dg_connection.on(SpeakWebSocketEvents.Metadata, on_metadata)
        dg_connection.on(SpeakWebSocketEvents.Flushed, on_flush)
        dg_connection.on(SpeakWebSocketEvents.Cleared, on_clear)
        dg_connection.on(SpeakWebSocketEvents.Close, on_close)
        dg_connection.on(SpeakWebSocketEvents.Error, on_error)
        dg_connection.on(SpeakWebSocketEvents.Warning, on_warning)
        dg_connection.on(SpeakWebSocketEvents.Unhandled, on_unhandled)

        # connect to websocket
        options = SpeakWSOptions(
            model="aura-asteria-en",
            encoding="linear16",
            sample_rate=16000,
        )

        if await dg_connection.start(options) is False:
            print("Failed to start connection")
            return

        try:
            # send the text to Deepgram
            await dg_connection.send_text(TTS_TEXT)

            # if auto_flush_speak_delta is not used, you must flush the connection by calling flush()
            await dg_connection.flush()

            # Wait for the SDK to report that the request has completed
            await dg_connection.wait_for_complete()
        finally:
            # Close the connection even when one of the calls above raises
            # or the task is cancelled.
            await dg_connection.finish()

        print("Finished")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")


# Guarded so that importing this module does not open a connection.
if __name__ == "__main__":
    asyncio.run(main())
Audio Output Streaming
The audio bytes representing the converted text will stream or be passed to the client via the above AudioData
event using the callback function.
It should be noted that these audio bytes are:
- Container-less audio. Depending on the `encoding` value chosen, only the raw audio data is sent. As an example, if you choose `linear16` as your `encoding` for audio, a WAV header will not be sent. Please see the Tips and Tricks for more information.
- Not of standard size/length when received by the client. This is because the text is broken down into sounds representing the speech. Certain sounds chained together to form fragments of spoken words are different in length and content.
Depending on what the use case is for the generated audio bytes, please visit one of these guides to better help utilize these audio bytes for your use case:
- Sending LLM Outputs to a WebSocket
- Text Chunking for Streaming TTS Optimization
- Handling Audio Issues in Text To Speech
Where to Find Additional Examples
The SDK repository has a good collection of text-to-speech examples. The README contains links to them. Each example below attempts to provide different options for generating audio from text.
Some Examples:
- Threaded Client speaking "Hello World" - examples/text-to-speech/websocket/complete
If the Async Client suits your use case better:
- Async Client speaking "Hello World" - examples/text-to-speech/websocket/async_complete
Updated 4 months ago