Text-to-Speech Streaming

An overview of the Deepgram Go SDK and Deepgram streaming text-to-speech.

Installing the SDK

To begin using Deepgram's Text-to-Speech functionality, you need to install the Deepgram Go SDK in your existing project. You can do this using the following command:

# Install the Deepgram Go SDK
# https://github.com/deepgram/deepgram-go-sdk

go get github.com/deepgram/deepgram-go-sdk

Make a Deepgram Text-to-Speech Request

package main

import (
	"context"
	"fmt"
	"os"
	"strings"
	"sync"
	"time"

	msginterfaces "github.com/deepgram/deepgram-go-sdk/pkg/api/speak/v1/websocket/interfaces"
	interfaces "github.com/deepgram/deepgram-go-sdk/pkg/client/interfaces/v1"
	speak "github.com/deepgram/deepgram-go-sdk/pkg/client/speak"
)

// TTS_TEXT is the sentence that main sends to Deepgram to be converted to speech.
const TTS_TEXT = "Hello, this is a text to speech example using Deepgram."

// AUDIO_FILE is the path of the file that receives the WAV header written by
// main and the binary audio appended by the handler.
const AUDIO_FILE = "output.wav"

// MyHandler holds one channel for each kind of event handled by this example.
// The channels are exposed through the Get* methods and drained by Run.
type MyHandler struct {
	// binaryChan carries binary payloads; Run appends them to AUDIO_FILE.
	binaryChan chan *[]byte
	// openChan carries OpenResponse events.
	openChan chan *msginterfaces.OpenResponse
	// metadataChan carries MetadataResponse events.
	metadataChan chan *msginterfaces.MetadataResponse
	// flushChan carries FlushedResponse events.
	flushChan chan *msginterfaces.FlushedResponse
	// clearChan carries ClearedResponse events.
	clearChan chan *msginterfaces.ClearedResponse
	// closeChan carries CloseResponse events.
	closeChan chan *msginterfaces.CloseResponse
	// warningChan carries WarningResponse events.
	warningChan chan *msginterfaces.WarningResponse
	// errorChan carries ErrorResponse events.
	errorChan chan *msginterfaces.ErrorResponse
	// unhandledChan carries the raw bytes of events that Run only dumps as text.
	unhandledChan chan *[]byte
}

// NewMyHandler creates a handler with an unbuffered channel for every event
// type, starts Run on a background goroutine so those channels are being
// received from, and returns the handler by value.
func NewMyHandler() MyHandler {
	var handler MyHandler

	handler.binaryChan = make(chan *[]byte)
	handler.openChan = make(chan *msginterfaces.OpenResponse)
	handler.metadataChan = make(chan *msginterfaces.MetadataResponse)
	handler.flushChan = make(chan *msginterfaces.FlushedResponse)
	handler.clearChan = make(chan *msginterfaces.ClearedResponse)
	handler.closeChan = make(chan *msginterfaces.CloseResponse)
	handler.warningChan = make(chan *msginterfaces.WarningResponse)
	handler.errorChan = make(chan *msginterfaces.ErrorResponse)
	handler.unhandledChan = make(chan *[]byte)

	// Run blocks while it waits on its receivers, so it gets its own
	// goroutine; its result is intentionally discarded.
	go func() {
		_ = handler.Run()
	}()

	return handler
}

// GetBinary returns a single-element slice pointing at the binary data channel.
func (dch MyHandler) GetBinary() []*chan *[]byte {
	binary := &dch.binaryChan
	return []*chan *[]byte{binary}
}

// GetOpen returns a single-element slice pointing at the OpenResponse channel.
func (dch MyHandler) GetOpen() []*chan *msginterfaces.OpenResponse {
	open := &dch.openChan
	return []*chan *msginterfaces.OpenResponse{open}
}

// GetMetadata returns a single-element slice pointing at the MetadataResponse channel.
func (dch MyHandler) GetMetadata() []*chan *msginterfaces.MetadataResponse {
	metadata := &dch.metadataChan
	return []*chan *msginterfaces.MetadataResponse{metadata}
}

// GetFlush returns a single-element slice pointing at the FlushedResponse channel.
func (dch MyHandler) GetFlush() []*chan *msginterfaces.FlushedResponse {
	flushed := &dch.flushChan
	return []*chan *msginterfaces.FlushedResponse{flushed}
}

// GetClear returns a single-element slice pointing at the ClearedResponse channel.
func (dch MyHandler) GetClear() []*chan *msginterfaces.ClearedResponse {
	cleared := &dch.clearChan
	return []*chan *msginterfaces.ClearedResponse{cleared}
}

// GetClose returns a single-element slice pointing at the CloseResponse channel.
func (dch MyHandler) GetClose() []*chan *msginterfaces.CloseResponse {
	closed := &dch.closeChan
	return []*chan *msginterfaces.CloseResponse{closed}
}

// GetWarning returns a single-element slice pointing at the WarningResponse channel.
func (dch MyHandler) GetWarning() []*chan *msginterfaces.WarningResponse {
	warning := &dch.warningChan
	return []*chan *msginterfaces.WarningResponse{warning}
}

// GetError returns a single-element slice pointing at the ErrorResponse channel.
func (dch MyHandler) GetError() []*chan *msginterfaces.ErrorResponse {
	failure := &dch.errorChan
	return []*chan *msginterfaces.ErrorResponse{failure}
}

// GetUnhandled returns a single-element slice pointing at the channel that
// carries the raw bytes of unhandled events.
func (dch MyHandler) GetUnhandled() []*chan *[]byte {
	unhandled := &dch.unhandledChan
	return []*chan *[]byte{unhandled}
}

// Run consumes every event channel owned by the handler.
//
// A separate goroutine ranges over each channel: binary messages are appended
// to AUDIO_FILE and every other event is printed to standard output. Run
// blocks until all of those loops have ended, which only happens once every
// channel has been closed, and then returns nil. It never returns a non-nil
// error.
//
// golintci: funlen
func (dch MyHandler) Run() error {
	var wgReceivers sync.WaitGroup

	// receive runs loop on its own goroutine and registers it with
	// wgReceivers so that Run can wait for it at the end.
	receive := func(loop func()) {
		wgReceivers.Add(1)
		go func() {
			defer wgReceivers.Done()
			loop()
		}()
	}

	// open channel
	receive(func() {
		for range dch.openChan {
			fmt.Printf("\n\n[OpenResponse]\n\n")
		}
	})

	// binary channel
	receive(func() {
		for br := range dch.binaryChan {
			fmt.Printf("\n\n[Binary Data]\n")

			// The file is reopened in append mode for every chunk, so each
			// chunk lands after whatever the file already contains.
			file, err := os.OpenFile(AUDIO_FILE, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o666)
			if err != nil {
				fmt.Printf("Failed to open file. Err: %v\n", err)
				continue
			}

			// Close before inspecting the write error so the handle is
			// released on both the success and the failure path.
			_, err = file.Write(*br)
			file.Close()

			if err != nil {
				fmt.Printf("Failed to write to file. Err: %v\n", err)
			}
		}
	})

	// metadata channel
	receive(func() {
		for mr := range dch.metadataChan {
			// Labelled as metadata so it cannot be confused with the
			// flushed events printed by the next receiver.
			fmt.Printf("\n[MetadataResponse]\n")
			fmt.Printf("RequestID: %s\n", strings.TrimSpace(mr.RequestID))
		}
	})

	// flushed channel
	receive(func() {
		for range dch.flushChan {
			fmt.Printf("\n[FlushedResponse]\n")
		}
	})

	// cleared channel
	receive(func() {
		for range dch.clearChan {
			fmt.Printf("\n[ClearedResponse]\n")
		}
	})

	// close channel
	receive(func() {
		for range dch.closeChan {
			fmt.Printf("\n\n[CloseResponse]\n\n")
		}
	})

	// warning channel
	receive(func() {
		for wr := range dch.warningChan {
			fmt.Printf("\n[WarningResponse]\n")
			fmt.Printf("\nWarning.Type: %s\n", wr.WarnCode)
			fmt.Printf("Warning.Message: %s\n", wr.WarnMsg)
			fmt.Printf("Warning.Description: %s\n\n", wr.Description)
			fmt.Printf("Warning.Variant: %s\n\n", wr.Variant)
		}
	})

	// error channel
	receive(func() {
		for er := range dch.errorChan {
			fmt.Printf("\n[ErrorResponse]\n")
			fmt.Printf("\nError.Type: %s\n", er.ErrCode)
			fmt.Printf("Error.Message: %s\n", er.ErrMsg)
			fmt.Printf("Error.Description: %s\n\n", er.Description)
			fmt.Printf("Error.Variant: %s\n\n", er.Variant)
		}
	})

	// unhandled event channel
	receive(func() {
		for byData := range dch.unhandledChan {
			fmt.Printf("\n[UnhandledEvent]")
			fmt.Printf("Dump:\n%s\n\n", string(*byData))
		}
	})

	// wait for all receivers to finish
	wgReceivers.Wait()

	return nil
}

// main runs the streaming text-to-speech example end to end: it initialises
// the SDK, opens a websocket connection, writes a WAV header to AUDIO_FILE,
// sends TTS_TEXT, flushes it, waits five seconds for audio to arrive and then
// closes the connection. The audio itself is appended to AUDIO_FILE by the
// handler's binary receiver.
func main() {
	// init library
	speak.Init(speak.InitLib{
		LogLevel: speak.LogLevelDefault, // LogLevelDefault, LogLevelFull, LogLevelDebug, LogLevelTrace
	})

	// Go context
	ctx := context.Background()

	// set the Client options
	cOptions := &interfaces.ClientOptions{
		// AutoFlushSpeakDelta: 1000,
	}

	// set the TTS options; the WAV header written below describes this same
	// format (16-bit PCM at 48000 Hz)
	ttsOptions := &interfaces.WSSpeakOptions{
		Model:      "aura-asteria-en",
		Encoding:   "linear16",
		SampleRate: 48000,
	}

	// create the handler whose channels receive the websocket events
	callback := NewMyHandler()

	// create the websocket client with the channel-based constructor
	// NOTE(review): the empty API key presumably makes the SDK fall back to
	// its default key source — confirm against the SDK documentation.
	dgClient, err := speak.NewWSUsingChan(ctx, "", cOptions, ttsOptions, callback)
	if err != nil {
		fmt.Println("ERROR creating TTS connection:", err)
		return
	}

	// connect the websocket to Deepgram
	bConnected := dgClient.Connect()
	if !bConnected {
		fmt.Println("Client.Connect failed")
		os.Exit(1)
	}

	// The connection is open from here on, so close it on every exit path,
	// including the early returns below.
	defer func() {
		dgClient.Stop()
		fmt.Printf("Program exiting...\n")
	}()

	// Add a wav audio container header to the file if you want to play the audio
	// using a media player like VLC, Media Player, or Apple Music.
	// All multi-byte fields are little-endian. The two size fields are left
	// at zero because the amount of audio is not known in advance.
	header := []byte{
		0x52, 0x49, 0x46, 0x46, // "RIFF"
		0x00, 0x00, 0x00, 0x00, // Placeholder for file size
		0x57, 0x41, 0x56, 0x45, // "WAVE"
		0x66, 0x6d, 0x74, 0x20, // "fmt "
		0x10, 0x00, 0x00, 0x00, // Chunk size (16)
		0x01, 0x00, // Audio format (1 for PCM)
		0x01, 0x00, // Number of channels (1)
		0x80, 0xbb, 0x00, 0x00, // Sample rate (48000)
		0x00, 0x77, 0x01, 0x00, // Byte rate (48000 * 1 channel * 2 bytes = 96000)
		0x02, 0x00, // Block align (2)
		0x10, 0x00, // Bits per sample (16)
		0x64, 0x61, 0x74, 0x61, // "data"
		0x00, 0x00, 0x00, 0x00, // Placeholder for data size
	}

	// Truncate rather than append: a file left over from an earlier run would
	// otherwise end up with this header in the middle of old audio data.
	file, err := os.OpenFile(AUDIO_FILE, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o666)
	if err != nil {
		fmt.Printf("Failed to open file. Err: %v\n", err)
		return
	}

	// Always close the file, and report a close failure if the write itself
	// succeeded, so the handle is never leaked.
	_, err = file.Write(header)
	if closeErr := file.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		fmt.Printf("Failed to write header to file. Err: %v\n", err)
		return
	}

	// Send the text input
	err = dgClient.SpeakWithText(TTS_TEXT)
	if err != nil {
		fmt.Printf("Error sending text input: %v\n", err)
		return
	}

	// If AutoFlushSpeakDelta is not set, you Flush the text input manually
	err = dgClient.Flush()
	if err != nil {
		fmt.Printf("Error flushing text input: %v\n", err)
		return
	}

	// wait a fixed five seconds for the audio to arrive before the deferred
	// Stop closes the connection
	time.Sleep(5 * time.Second)
}

Audio Output Streaming

The audio bytes representing the converted text are streamed to the client as binary messages, which the handler above receives on its binary channel and appends to the output file.

It should be noted that these audio bytes are:

  • Container-less audio. Depending on the encoding value chosen, only the raw audio data is sent. For example, if you choose linear16 as your encoding, a WAV header will not be sent. Please see the Tips and Tricks for more information.
  • Not of standard size/length when received by the client. This is because the text is broken down into sounds representing the speech. Certain sounds chained together to form fragments of spoken words are different in length and content.

Depending on what the use case is for the generated audio bytes, please visit one of these guides to better help utilize these audio bytes for your use case:

Where to Find Additional Examples

The SDK repository contains a good collection of text-to-speech examples, and the README contains links to them.

Go Channel-based Examples:

Legacy Go Callback-based Examples