Getting Started

An introduction to using Deepgram's Aura Streaming Text-to-Speech WebSocket API to convert streaming text into audio.

This guide will walk you through how to turn streaming text into speech with Deepgram's text-to-speech WebSocket API.

📘

Before you start, you'll need to follow the steps in the Make Your First API Request guide to obtain a Deepgram API key, and configure your environment if you choose to use a Deepgram SDK.
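For example, after obtaining a key you can expose it to the examples below through the DEEPGRAM_API_KEY environment variable. A minimal Python check (the variable name matches the examples in this guide):

import os

# The examples in this guide read the key from the DEEPGRAM_API_KEY
# environment variable, e.g. set with: export DEEPGRAM_API_KEY="your-key"
api_key = os.environ.get("DEEPGRAM_API_KEY")
if not api_key:
    raise RuntimeError("DEEPGRAM_API_KEY is not set; see Make Your First API Request")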

Text-to-Speech Implementations

Deepgram has several SDKs that can make the API easier to use. Follow these steps to use the SDK of your choice to make a Deepgram TTS request.

Add Dependencies

# Install the dependencies
pip install pyaudio==0.2.14
# Install dependencies required by the examples
pip install -r examples/requirements-examples.txt
# Install the SDK
pip install deepgram-sdk==3.*

# Add the dependencies
npm install dotenv
# Install the SDK
npm install @deepgram/sdk

# Install the SDK
go get github.com/deepgram/deepgram-go-sdk
# Importing the Deepgram Go SDK should pull in all dependencies required

# Install the SDK
dotnet add package Deepgram
# Importing the Deepgram .NET SDK should pull in all dependencies required

Make the Request with the SDK

import json
import os
import threading
import asyncio
import queue

import websockets
from websockets.sync.client import connect

import pyaudio

TIMEOUT = 0.050
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 48000
CHUNK = 8000

DEFAULT_URL = f"wss://api.deepgram.com/v1/speak?encoding=linear16&sample_rate={RATE}"
DEFAULT_TOKEN = os.environ.get("DEEPGRAM_API_KEY", None)

def main():
    print(f"Connecting to {DEFAULT_URL}")

    _socket = connect(
        DEFAULT_URL, additional_headers={"Authorization": f"Token {DEFAULT_TOKEN}"}
    )
    _exit = threading.Event()

    _story = [
        "The sun had just begun to rise over the sleepy town of Millfield.",
        "Emily a young woman in her mid-twenties was already awake and bustling about.",
    ]

    async def receiver():
        speaker = Speaker()
        speaker.start()
        try:
            while True:
                if _socket is None or _exit.is_set():
                    break

                message = _socket.recv()
                if message is None:
                    continue

                if type(message) is str:
                    print(message)
                elif type(message) is bytes:
                    speaker.play(message)
        except Exception as e:
            print(f"receiver: {e}")
        finally:
            speaker.stop()

    _receiver_thread = threading.Thread(target=asyncio.run, args=(receiver(),))
    _receiver_thread.start()

    for text_input in _story:
        print(f"Sending: {text_input}")
        _socket.send(json.dumps({"type": "Speak", "text": text_input}))

    print("Flushing...")
    _socket.send(json.dumps({"type": "Flush"}))

    input("Press Enter to exit...")
    _exit.set()
    _socket.send(json.dumps({"type": "Close"}))
    _socket.close()

    _receiver_thread.join()


class Speaker:
    _audio: pyaudio.PyAudio
    _chunk: int
    _rate: int
    _format: int
    _channels: int
    _output_device_index: int

    _stream: pyaudio.Stream
    _thread: threading.Thread
    _asyncio_loop: asyncio.AbstractEventLoop
    _asyncio_thread: threading.Thread
    _queue: queue.Queue
    _exit: threading.Event

    def __init__(
        self,
        rate: int = RATE,
        chunk: int = CHUNK,
        channels: int = CHANNELS,
        output_device_index: int = None,
    ):
        self._exit = threading.Event()
        self._queue = queue.Queue()

        self._audio = pyaudio.PyAudio()
        self._chunk = chunk
        self._rate = rate
        self._format = FORMAT
        self._channels = channels
        self._output_device_index = output_device_index

    def _start_asyncio_loop(self) -> None:
        self._asyncio_loop = asyncio.new_event_loop()
        self._asyncio_loop.run_forever()

    def start(self) -> bool:
        self._stream = self._audio.open(
            format=self._format,
            channels=self._channels,
            rate=self._rate,
            input=False,
            output=True,
            frames_per_buffer=self._chunk,
            output_device_index=self._output_device_index,
        )

        self._exit.clear()

        self._thread = threading.Thread(
            target=_play, args=(self._queue, self._stream, self._exit), daemon=True
        )
        self._thread.start()

        self._stream.start_stream()

        return True

    def stop(self):
        self._exit.set()

        if self._stream is not None:
            self._stream.stop_stream()
            self._stream.close()
            self._stream = None

        self._thread.join()
        self._thread = None

        self._queue = None

    def play(self, data):
        self._queue.put(data)


def _play(audio_out: queue.Queue, stream, stop):
    # Drain the audio queue and write each chunk to the output stream until stopped.
    while not stop.is_set():
        try:
            data = audio_out.get(True, TIMEOUT)
            stream.write(data)
        except queue.Empty:
            # No audio buffered yet; poll again.
            pass
        except Exception as e:
            print(f"_play: {e}")

if __name__ == "__main__":
    main()
const fs = require("fs");
const { createClient, LiveTTSEvents } = require("@deepgram/sdk");

const live = async () => {
  const text = "Hello, how can I help you today?";

  const deepgram = createClient(process.env.DEEPGRAM_API_KEY);

  const dgConnection = deepgram.speak.live({ model: "aura-asteria-en" });

  let audioBuffer = Buffer.alloc(0);

  dgConnection.on(LiveTTSEvents.Open, () => {
    console.log("Connection opened");

    // Send text data for TTS synthesis
    dgConnection.sendText(text);

    // Send Flush message to the server after sending the text
    dgConnection.flush();

    dgConnection.on(LiveTTSEvents.Close, () => {
      console.log("Connection closed");
    });

    dgConnection.on(LiveTTSEvents.Metadata, (data) => {
      console.dir(data, { depth: null });
    });

    dgConnection.on(LiveTTSEvents.Audio, (data) => {
      console.log("Deepgram audio data received");
      // Concatenate the audio chunks into a single buffer
      const buffer = Buffer.from(data);
      audioBuffer = Buffer.concat([audioBuffer, buffer]);
    });

    dgConnection.on(LiveTTSEvents.Flushed, () => {
      console.log("Deepgram Flushed");
      // Write the buffered audio data to a file when the flush event is received
      writeFile();
    });

    dgConnection.on(LiveTTSEvents.Error, (err) => {
      console.error(err);
    });
  });

  const writeFile = () => {
    if (audioBuffer.length > 0) {
      fs.writeFile("output.mp3", audioBuffer, (err) => {
        if (err) {
          console.error("Error writing audio file:", err);
        } else {
          console.log("Audio file saved as output.mp3");
        }
      });
      audioBuffer = Buffer.alloc(0); // Reset buffer after writing
    }
  };
};

live();

import time
from deepgram.utils import verboselogs
import wave

from deepgram import (
    DeepgramClient,
    SpeakWebSocketEvents,
    SpeakWSOptions,
)

AUDIO_FILE = "output.wav"
TTS_TEXT = "Hello, this is a text to speech example using Deepgram. How are you doing today? I am fine thanks for asking."

def main():
    try:
        # use default config
        deepgram: DeepgramClient = DeepgramClient()

        # Create a websocket connection to Deepgram
        dg_connection = deepgram.speak.websocket.v("1")

        def on_binary_data(self, data, **kwargs):
            print("Received binary data")
            with open(AUDIO_FILE, "ab") as f:
                f.write(data)
                f.flush()

        dg_connection.on(SpeakWebSocketEvents.AudioData, on_binary_data)

        # Generate a generic WAV container header:
        # the websocket returns raw (uncontainerized) audio, so we write the header ourselves
        header = wave.open(AUDIO_FILE, "wb")
        header.setnchannels(1)  # Mono audio
        header.setsampwidth(2)  # 16-bit audio
        header.setframerate(16000)  # Sample rate of 16000 Hz
        header.close()

        # connect to websocket
        options = SpeakWSOptions(
            model="aura-asteria-en",
            encoding="linear16",
            sample_rate=16000,
        )

        print("\n\nPress Enter to stop...\n\n")
        if dg_connection.start(options) is False:
            print("Failed to start connection")
            return

        # send the text to Deepgram
        dg_connection.send_text(TTS_TEXT)
        
        # if auto_flush_speak_delta is not used, you must flush the connection by calling flush()
        dg_connection.flush()

        # Indicate that we've finished
        time.sleep(7)
        print("\n\nPress Enter to stop...\n\n")
        input()

        # Close the connection
        dg_connection.finish()

        print("Finished")

    except ValueError as e:
        print(f"Invalid value encountered: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

if __name__ == "__main__":
    main()
package main

import (
	"bufio"
	"context"
	"fmt"
	"os"
	"strings"

	msginterfaces "github.com/deepgram/deepgram-go-sdk/pkg/api/speak/v1/websocket/interfaces"
	interfaces "github.com/deepgram/deepgram-go-sdk/pkg/client/interfaces/v1"
	speak "github.com/deepgram/deepgram-go-sdk/pkg/client/speak"
)

const (
	TTS_TEXT   = "Hello, this is a text to speech example using Deepgram."
	AUDIO_FILE = "output.mp3"
)

// Implement your own callback
type MyCallback struct{}

func (c MyCallback) Metadata(md *msginterfaces.MetadataResponse) error {
	fmt.Printf("\n[Metadata] Received\n")
	fmt.Printf("Metadata.RequestID: %s\n", strings.TrimSpace(md.RequestID))
	return nil
}

func (c MyCallback) Binary(byMsg []byte) error {
	fmt.Printf("\n[Binary] Received\n")

	file, err := os.OpenFile(AUDIO_FILE, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o666)
	if err != nil {
		fmt.Printf("Error creating file %s: %v\n", AUDIO_FILE, err)
		return err
	}
	defer file.Close()

	_, err = file.Write(byMsg)
	if err != nil {
		fmt.Printf("Error writing audio data to file: %v\n", err)
		return err
	}

	fmt.Printf("Audio data saved to %s\n", AUDIO_FILE)
	return nil
}

func (c MyCallback) Flush(fl *msginterfaces.FlushedResponse) error {
	fmt.Printf("\n[Flushed] Received\n")
	return nil
}

func (c MyCallback) Warning(wr *msginterfaces.WarningResponse) error {
	fmt.Printf("\n[Warning] Received\n")
	fmt.Printf("Warning.Code: %s\n", wr.WarnCode)
	fmt.Printf("Warning.Description: %s\n\n", wr.WarnMsg)
	return nil
}

func (c MyCallback) Error(er *msginterfaces.ErrorResponse) error {
	fmt.Printf("\n[Error] Received\n")
	fmt.Printf("Error.Code: %s\n", er.ErrCode)
	fmt.Printf("Error.Description: %s\n\n", er.ErrMsg)
	return nil
}

func (c MyCallback) Close(cr *msginterfaces.CloseResponse) error {
	fmt.Printf("\n[Close] Received\n")
	return nil
}

func (c MyCallback) Open(or *msginterfaces.OpenResponse) error {
	fmt.Printf("\n[Open] Received\n")
	return nil
}

func main() {
	// print instructions
	fmt.Print("\n\nPress ENTER to exit!\n\n")
  
	// init library
	speak.InitWithDefault()

	// Go context
	ctx := context.Background()

	// set the TTS options
	ttsOptions := &interfaces.SpeakOptions{
		Model: "aura-asteria-en",
	}

	// create the callback
	callback := MyCallback{}

	// create a new stream using the NewStream function
	dgClient, err := speak.NewWebSocketWithDefaults(ctx, ttsOptions, callback)
	if err != nil {
		fmt.Println("ERROR creating TTS connection:", err)
		return
	}

	// connect the websocket to Deepgram
	bConnected := dgClient.Connect()
	if !bConnected {
		fmt.Println("Client.Connect failed")
		os.Exit(1)
	}

	// Send the text input
	err = dgClient.SpeakWithText(TTS_TEXT)
	if err != nil {
		fmt.Printf("Error sending text input: %v\n", err)
		return
	}

	// Flush the text input
	err = dgClient.Flush()
	if err != nil {
		fmt.Printf("Error sending text input: %v\n", err)
		return
	}

	// wait for user input to exit
	input := bufio.NewScanner(os.Stdin)
	input.Scan()

	// close the connection
	dgClient.Stop()

	fmt.Printf("Program exiting...\n")
}
using Deepgram.Models.Authenticate.v1;
using Deepgram.Models.Speak.v2.WebSocket;
using Deepgram.Logger;

namespace SampleApp
{
    class Program
    {
        static async Task Main(string[] args)
        {
            try
            {
                // Initialize Library with default logging
                // Normal logging is "Info" level
                Library.Initialize();

                // Use the client factory with an API key set via the "DEEPGRAM_API_KEY" environment variable
                var speakClient = ClientFactory.CreateSpeakWebSocketClient();

                // append wav header only once
                bool appendWavHeader = true;

                // Subscribe to the EventResponseReceived event
                await speakClient.Subscribe(new EventHandler<AudioResponse>((sender, e) =>
                {
                    Console.WriteLine($"----> {e.Type} received");

                    // add a wav header
                    if (appendWavHeader)
                    {
                        using (BinaryWriter writer = new BinaryWriter(File.Open("output.wav", FileMode.Append)))
                        {
                            Console.WriteLine("Adding WAV header to output.wav");
                            byte[] wavHeader = new byte[44];
                            int sampleRate = 48000;
                            short bitsPerSample = 16;
                            short channels = 1;
                            int byteRate = sampleRate * channels * (bitsPerSample / 8);
                            short blockAlign = (short)(channels * (bitsPerSample / 8));

                            wavHeader[0] = 0x52; // R
                            wavHeader[1] = 0x49; // I
                            wavHeader[2] = 0x46; // F
                            wavHeader[3] = 0x46; // F
                            wavHeader[4] = 0x00; // Placeholder for file size (will be updated later)
                            wavHeader[5] = 0x00; // Placeholder for file size (will be updated later)
                            wavHeader[6] = 0x00; // Placeholder for file size (will be updated later)
                            wavHeader[7] = 0x00; // Placeholder for file size (will be updated later)
                            wavHeader[8] = 0x57; // W
                            wavHeader[9] = 0x41; // A
                            wavHeader[10] = 0x56; // V
                            wavHeader[11] = 0x45; // E
                            wavHeader[12] = 0x66; // f
                            wavHeader[13] = 0x6D; // m
                            wavHeader[14] = 0x74; // t
                            wavHeader[15] = 0x20; // Space
                            wavHeader[16] = 0x10; // Subchunk1Size (16 for PCM)
                            wavHeader[17] = 0x00; // Subchunk1Size
                            wavHeader[18] = 0x00; // Subchunk1Size
                            wavHeader[19] = 0x00; // Subchunk1Size
                            wavHeader[20] = 0x01; // AudioFormat (1 for PCM)
                            wavHeader[21] = 0x00; // AudioFormat
                            wavHeader[22] = (byte)channels; // NumChannels
                            wavHeader[23] = 0x00; // NumChannels
                            wavHeader[24] = (byte)(sampleRate & 0xFF); // SampleRate
                            wavHeader[25] = (byte)((sampleRate >> 8) & 0xFF); // SampleRate
                            wavHeader[26] = (byte)((sampleRate >> 16) & 0xFF); // SampleRate
                            wavHeader[27] = (byte)((sampleRate >> 24) & 0xFF); // SampleRate
                            wavHeader[28] = (byte)(byteRate & 0xFF); // ByteRate
                            wavHeader[29] = (byte)((byteRate >> 8) & 0xFF); // ByteRate
                            wavHeader[30] = (byte)((byteRate >> 16) & 0xFF); // ByteRate
                            wavHeader[31] = (byte)((byteRate >> 24) & 0xFF); // ByteRate
                            wavHeader[32] = (byte)blockAlign; // BlockAlign
                            wavHeader[33] = 0x00; // BlockAlign
                            wavHeader[34] = (byte)bitsPerSample; // BitsPerSample
                            wavHeader[35] = 0x00; // BitsPerSample
                            wavHeader[36] = 0x64; // d
                            wavHeader[37] = 0x61; // a
                            wavHeader[38] = 0x74; // t
                            wavHeader[39] = 0x61; // a
                            wavHeader[40] = 0x00; // Placeholder for data chunk size (will be updated later)
                            wavHeader[41] = 0x00; // Placeholder for data chunk size (will be updated later)
                            wavHeader[42] = 0x00; // Placeholder for data chunk size (will be updated later)
                            wavHeader[43] = 0x00; // Placeholder for data chunk size (will be updated later)

                            writer.Write(wavHeader);
                            appendWavHeader = false;
                        }
                    }

                    if (e.Stream != null)
                    {
                        using (BinaryWriter writer = new BinaryWriter(File.Open("output.wav", FileMode.Append)))
                        {
                            writer.Write(e.Stream.ToArray());
                        }
                    }
                }));

                // Start the connection
                var speakSchema = new SpeakSchema()
                {
                    Encoding = "linear16",
                    SampleRate = 48000,
                };
                bool bConnected = await speakClient.Connect(speakSchema);
                if (!bConnected)
                {
                    Console.WriteLine("Failed to connect to the server");
                    return;
                }

                // Send some Text to convert to audio
                speakClient.SpeakWithText("Hello World!");

                // Flush the audio
                speakClient.Flush();

                // Wait for the user to press a key
                Console.WriteLine("\n\nPress any key to stop and exit...\n\n\n");
                Console.ReadKey();

                // Stop the connection
                await speakClient.Stop();

                // Terminate Libraries
                Library.Terminate();
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Exception: {ex.Message}");
            }
        }
    }
}

📘

To learn more, check out the WebSocket audio format tips in our TTS Chunking for Optimization guide, as well as the Audio Format Combinations we offer.

Text-to-Speech Workflow

Below is a high-level workflow for obtaining an audio stream from user-provided text.

Establish a WebSocket Connection

To establish a connection, you must provide a few query parameters on the URL that describe the audio you want. See the API Ref (TODO: Link) for setting the audio model (which controls the voice), the encoding, and the sample rate of the audio.
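As a minimal sketch using the same websockets package as the Python example above (the parameter names here mirror that example; consult the API reference for the full list):

import os
from websockets.sync.client import connect

# Query parameters select the voice model, the encoding, and the sample rate.
url = "wss://api.deepgram.com/v1/speak?model=aura-asteria-en&encoding=linear16&sample_rate=48000"

socket = connect(
    url,
    additional_headers={"Authorization": f"Token {os.environ['DEEPGRAM_API_KEY']}"},
)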

Sending Text and Retrieving Audio

Send the desired text to transform to audio using the WebSocket message below:

{
  "type": "Speak",
  "text": "Your text to transform to speech"
}

When you have queued enough text, you can obtain the corresponding audio by sending a Flush command.

{
  "type": "Flush"
}

Upon successfully sending the Flush, you will receive an audio byte stream from the WebSocket connection containing the synthesized speech. The format is determined by the encoding and sample rate you provided when establishing the connection.
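Putting these messages together, here is a minimal send-and-receive sketch continuing the socket from the connection sketch above. It assumes, as in the examples earlier in this guide, that text frames carry JSON control events (such as Flushed) and binary frames carry raw audio:

import json

# Queue text, then ask the server to synthesize everything queued so far.
socket.send(json.dumps({"type": "Speak", "text": "Hello, world!"}))
socket.send(json.dumps({"type": "Flush"}))

# str frames are JSON control events; bytes frames are audio.
with open("output.raw", "ab") as f:
    while True:
        message = socket.recv()
        if isinstance(message, bytes):
            f.write(message)  # raw linear16 audio at the requested sample rate
        else:
            event = json.loads(message)
            if event.get("type") == "Flushed":
                break  # all queued text has been synthesized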

Closing the Connection

When you are finished with the WebSocket, you can close the connection by sending the following Close command.

{
  "type": "Close"
}

Limits

Keep these limits in mind when making a Deepgram text-to-speech request.

Use One WebSocket per Conversation

If you are building for conversational AI use cases where a human is talking to a TTS agent, a single WebSocket per conversation is required. After you establish a connection, you cannot change the voice or media output settings.

Character Limits

The input limit is currently 2000 characters for the text payload of each Speak message. If the text payload is 2001 characters or more, you will receive an error, and no audio will be created.
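If your input can exceed this limit, one approach is to split it client-side into chunks of at most 2,000 characters, preferably at sentence boundaries, and send each chunk as its own Speak message. A hedged sketch:

MAX_SPEAK_CHARS = 2000  # documented per-message limit

def chunk_text(text: str, limit: int = MAX_SPEAK_CHARS) -> list:
    """Split text into Speak-sized chunks, preferring sentence boundaries."""
    chunks = []
    while len(text) > limit:
        window = text[:limit]
        # Break after the last sentence terminator in the window, if any.
        cut = max(window.rfind(". "), window.rfind("! "), window.rfind("? "))
        cut = cut + 1 if cut != -1 else limit
        chunks.append(text[:cut].strip())
        text = text[cut:]
    if text.strip():
        chunks.append(text.strip())
    return chunks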

Character Throughput Limits

The throughput limit is 12,000 characters per 2 minutes, measured by the number of characters sent to the WebSocket.

Timeout Limits

An active WebSocket has a 60-minute timeout period from the initial connection; this timeout applies even to connections that are actively being used. If you need a session longer than 60 minutes, create a new WebSocket connection to Deepgram.
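For long-lived sessions, one option is to track the connection's age and reconnect shortly before the cutoff. A small illustrative helper (the one-minute safety margin is an arbitrary choice):

import time

MAX_CONNECTION_SECONDS = 60 * 60  # documented 60-minute WebSocket lifetime

def needs_reconnect(opened_at: float, margin_seconds: float = 60.0) -> bool:
    """Return True when the connection is near the 60-minute limit and a
    new WebSocket should be opened."""
    return time.monotonic() - opened_at > MAX_CONNECTION_SECONDS - margin_seconds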

Flush Message Limits

You can send the Flush message at most 20 times in any 60-second window. Beyond that, you will receive a warning stating that no further Flush messages can be processed until the 60-second window has passed.
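To stay under this limit, you can pace Flush messages on the client side. A hypothetical sliding-window guard (the constants mirror the documented 20 flushes per 60 seconds):

import time
from collections import deque

class FlushLimiter:
    """Client-side guard mirroring the 20-Flush / 60-second limit."""

    def __init__(self, max_flushes: int = 20, window_seconds: float = 60.0):
        self.max_flushes = max_flushes
        self.window = window_seconds
        self.timestamps = deque()

    def may_flush(self) -> bool:
        now = time.monotonic()
        # Drop timestamps that have aged out of the sliding window.
        while self.timestamps and now - self.timestamps[0] > self.window:
            self.timestamps.popleft()
        if len(self.timestamps) >= self.max_flushes:
            return False
        self.timestamps.append(now)
        return True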

Rate Limits

📘

The current rate limit per project is 40 concurrent connections for Pay As You Go plans and 80 concurrent connections for Growth plans. Learn more about API Rate Limits.

What's Next?

Now that you've transformed text into speech with Deepgram's API, enhance your knowledge by exploring the following areas.

Read the Feature Guides

Deepgram's features help you customize your request to produce the best output for your use case. Here are a few guides that can help:

Build your own End-to-End Deepgram Conversational Demo with Twilio

You can get started building a simple conversational demo using Twilio and Deepgram streaming transcription and text-to-speech WebSockets by checking out our Twilio Example with STT + TTS Streaming WS.

🌈

We'd love to get your feedback on Deepgram's Aura text-to-speech. You will receive $50 in additional console credits within two weeks after filling out the form, and you may be invited to join a group of users with access to the latest private releases. To fill out the form, Click Here.