Text To Speech Streaming

An overview of the Deepgram Python SDK and Deepgram streaming text-to-speech.

The Deepgram Speak Clients allows you to generate human-like audio from text.

📘

This SDK supports both the Threaded and Async/Await Clients as described in the Threaded and Async IO Task Support section. The code blocks contain a tab for Threaded and Async to show examples for websocket versus asyncwebsocket, respectively. The difference between Threaded and Async is subtle.

Installing the SDK

# Install the Deepgram Python SDK
# https://github.com/deepgram/deepgram-python-sdk

pip install deepgram-sdk==3.*

Make a Deepgram Text-to-Speech Request

The Deepgram Speak Clients allows you to create audio generated from provided text and stream the audio bytes through your AudioData callback function. You can also subscribe to other important events, such as:

  • Metadata - obtain information about the audio stream.
  • Flushed - receive an acknowledgment when all the text has been received by Deepgram to convert to audio. This is the result of sending the Flush message via the flush() function.
  • Cleared - that the text buffer has been cleared from Deepgram server. This is the result of sending the Clear message via the clear() function.
from deepgram.utils import verboselogs

from deepgram import (
    DeepgramClient,
    DeepgramClientOptions,
    SpeakWebSocketEvents,
    SpeakWSOptions,
)

TTS_TEXT = "Hello, this is a text to speech example using Deepgram."

global warning_notice
warning_notice = True

def main():
    try:
        # create a deepgram client
        config: DeepgramClientOptions = DeepgramClientOptions(
            options={"speaker_playback": "true"},
        )
        deepgram: DeepgramClient = DeepgramClient("", config)

        # Create a websocket connection to Deepgram
        dg_connection = deepgram.speak.websocket.v("1")

        def on_open(self, open, **kwargs):
            print(f"\n\n{open}\n\n")

        def on_binary_data(self, data, **kwargs):
            global warning_notice
            if warning_notice:
                print("Received binary data")
                print("You can do something with the binary data here")
                print("OR")
                print(
                    "If you want to simply play the audio, set speaker_playback to true in the options for DeepgramClientOptions"
                )
                warning_notice = False

        def on_metadata(self, metadata, **kwargs):
            print(f"\n\n{metadata}\n\n")

        def on_flush(self, flushed, **kwargs):
            print(f"\n\n{flushed}\n\n")

        def on_clear(self, clear, **kwargs):
            print(f"\n\n{clear}\n\n")

        def on_close(self, close, **kwargs):
            print(f"\n\n{close}\n\n")

        def on_warning(self, warning, **kwargs):
            print(f"\n\n{warning}\n\n")

        def on_error(self, error, **kwargs):
            print(f"\n\n{error}\n\n")

        def on_unhandled(self, unhandled, **kwargs):
            print(f"\n\n{unhandled}\n\n")

        dg_connection.on(SpeakWebSocketEvents.Open, on_open)
        dg_connection.on(SpeakWebSocketEvents.AudioData, on_binary_data)
        dg_connection.on(SpeakWebSocketEvents.Metadata, on_metadata)
        dg_connection.on(SpeakWebSocketEvents.Flushed, on_flush)
        dg_connection.on(SpeakWebSocketEvents.Cleared, on_clear)
        dg_connection.on(SpeakWebSocketEvents.Close, on_close)
        dg_connection.on(SpeakWebSocketEvents.Error, on_error)
        dg_connection.on(SpeakWebSocketEvents.Warning, on_warning)
        dg_connection.on(SpeakWebSocketEvents.Unhandled, on_unhandled)

        # connect to websocket
        options = SpeakWSOptions(
            model="aura-asteria-en",
            encoding="linear16",
            sample_rate=16000,
        )

        print("\n\nPress Enter to stop...\n\n")
        if dg_connection.start(options) is False:
            print("Failed to start connection")
            return

        # send the text to Deepgram
        dg_connection.send_text(TTS_TEXT)

        # if auto_flush_speak_delta is not used, you must flush the connection by calling flush()
        dg_connection.flush()

        # Indicate that we've finished
        dg_connection.wait_for_complete()

        print("\n\nPress Enter to stop...\n\n")
        input()

        # Close the connection
        dg_connection.finish()

        print("Finished")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")

if __name__ == "__main__":
    main()
import asyncio

from deepgram import (
    DeepgramClient,
    DeepgramClientOptions,
    SpeakWebSocketEvents,
    SpeakWSOptions,
)

TTS_TEXT = "Hello, this is a text to speech example using Deepgram."

global warning_notice
warning_notice = True

async def main():
    try:
        # create a deepgram client
        config: DeepgramClientOptions = DeepgramClientOptions(
            options={"speaker_playback": "true"}
        )
        deepgram: DeepgramClient = DeepgramClient("", config)

        # Create a websocket connection to Deepgram
        dg_connection = deepgram.speak.asyncwebsocket.v("1")

        async def on_open(self, open, **kwargs):
            print(f"\n\n{open}\n\n")

        async def on_binary_data(self, data, **kwargs):
            global warning_notice
            if warning_notice:
                print("Received binary data")
                print("You can do something with the binary data here")
                print("OR")
                print(
                    "If you want to simply play the audio, set speaker_playback to true in the options for DeepgramClientOptions"
                )
                warning_notice = False

        async def on_metadata(self, metadata, **kwargs):
            print(f"\n\n{metadata}\n\n")

        async def on_flush(self, flushed, **kwargs):
            print(f"\n\n{flushed}\n\n")

        async def on_clear(self, clear, **kwargs):
            print(f"\n\n{clear}\n\n")

        async def on_close(self, close, **kwargs):
            print(f"\n\n{close}\n\n")

        async def on_warning(self, warning, **kwargs):
            print(f"\n\n{warning}\n\n")

        async def on_error(self, error, **kwargs):
            print(f"\n\n{error}\n\n")

        async def on_unhandled(self, unhandled, **kwargs):
            print(f"\n\n{unhandled}\n\n")

        dg_connection.on(SpeakWebSocketEvents.Open, on_open)
        dg_connection.on(SpeakWebSocketEvents.AudioData, on_binary_data)
        dg_connection.on(SpeakWebSocketEvents.Metadata, on_metadata)
        dg_connection.on(SpeakWebSocketEvents.Flushed, on_flush)
        dg_connection.on(SpeakWebSocketEvents.Cleared, on_clear)
        dg_connection.on(SpeakWebSocketEvents.Close, on_close)
        dg_connection.on(SpeakWebSocketEvents.Error, on_error)
        dg_connection.on(SpeakWebSocketEvents.Warning, on_warning)
        dg_connection.on(SpeakWebSocketEvents.Unhandled, on_unhandled)

        # connect to websocket
        options = SpeakWSOptions(
            model="aura-asteria-en",
            encoding="linear16",
            sample_rate=16000,
        )

        print("\n\nPress Enter to stop...\n\n")
        if await dg_connection.start(options) is False:
            print("Failed to start connection")
            return

        # send the text to Deepgram
        await dg_connection.send_text(TTS_TEXT)

        # if auto_flush_speak_delta is not used, you must flush the connection by calling flush()
        await dg_connection.flush()

        # Indicate that we've finished
        await dg_connection.wait_for_complete()

        # Close the connection
        await dg_connection.finish()

        print("Finished")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")

asyncio.run(main())

Audio Output Streaming

The audio bytes representing the converted text will stream or be passed to the client via the above AudioData event using the callback function.

It should be noted that these audio bytes are:

  • Container-less audio. Meaning depending on the encoding value chosen, only the raw audio data is sent. As an example, if you choose linear16 as your encoding for audio, a WAV header will not be sent. Please see the Tips and Tricks for more information.
  • Not of standard size/length when received by the client. This is because the text is broken down into sounds representing the speech. Certain sounds chained together to form fragments of spoken words are different in length and content.

Depending on what the use case is for the generated audio bytes, please visit one of these guides to better help utilize these audio bytes for your use case:

Where to Find Additional Examples

The SDK repository has a good collection of text-to-speech examples. The README contains links to them. Each example below attempts to provide different options for transcribing an audio source.

Some Examples:

If the Async Client suits your use case better: