Text-to-Speech Streaming
An overview of the Deepgram Go SDK and Deepgram streaming text-to-speech.
Installing the SDK
To begin using Deepgram's Text-to-Speech functionality, you need to install the Deepgram Go SDK in your existing project. You can do this using the following command:
# Install the Deepgram Go SDK
# https://github.com/deepgram/deepgram-go-sdk
go get github.com/deepgram/deepgram-go-sdk
Make a Deepgram Text-to-Speech Request
package main
import (
"context"
"fmt"
"os"
"strings"
"sync"
"time"
msginterfaces "github.com/deepgram/deepgram-go-sdk/pkg/api/speak/v1/websocket/interfaces"
interfaces "github.com/deepgram/deepgram-go-sdk/pkg/client/interfaces/v1"
speak "github.com/deepgram/deepgram-go-sdk/pkg/client/speak"
)
// Fixed inputs and outputs used by this example program.
const (
// TTS_TEXT is the text main sends to Deepgram to be synthesized.
TTS_TEXT = "Hello, this is a text to speech example using Deepgram."
// AUDIO_FILE is the path main writes the WAV header to and Run appends
// the received binary audio to.
AUDIO_FILE = "output.wav"
)
// MyHandler groups one channel per kind of event this example receives from
// the text-to-speech websocket. The Get* methods expose the channels and Run
// drains them.
type MyHandler struct {
// binaryChan carries binary payloads; Run appends each one to AUDIO_FILE.
binaryChan chan *[]byte
// openChan carries open notifications; Run only logs a marker for them.
openChan chan *msginterfaces.OpenResponse
// metadataChan carries metadata messages; Run logs their RequestID.
metadataChan chan *msginterfaces.MetadataResponse
// flushChan carries flushed notifications; Run only logs a marker for them.
flushChan chan *msginterfaces.FlushedResponse
// clearChan carries cleared notifications; Run only logs a marker for them.
clearChan chan *msginterfaces.ClearedResponse
// closeChan carries close notifications; Run only logs a marker for them.
closeChan chan *msginterfaces.CloseResponse
// warningChan carries warnings; Run logs their code, message, description and variant.
warningChan chan *msginterfaces.WarningResponse
// errorChan carries errors; Run logs their code, message, description and variant.
errorChan chan *msginterfaces.ErrorResponse
// unhandledChan carries raw bytes that Run dumps to stdout as a string.
unhandledChan chan *[]byte
}
// NewMyHandler builds a MyHandler with one unbuffered channel per event type
// and starts Run in a background goroutine so the channels are drained as soon
// as events arrive. The handler is returned by value; the copy shares the same
// underlying channels as the one Run is reading from.
//
// The error returned by Run is discarded.
func NewMyHandler() MyHandler {
	var handler MyHandler

	handler.binaryChan = make(chan *[]byte)
	handler.openChan = make(chan *msginterfaces.OpenResponse)
	handler.metadataChan = make(chan *msginterfaces.MetadataResponse)
	handler.flushChan = make(chan *msginterfaces.FlushedResponse)
	handler.clearChan = make(chan *msginterfaces.ClearedResponse)
	handler.closeChan = make(chan *msginterfaces.CloseResponse)
	handler.warningChan = make(chan *msginterfaces.WarningResponse)
	handler.errorChan = make(chan *msginterfaces.ErrorResponse)
	handler.unhandledChan = make(chan *[]byte)

	// Start the receivers before handing the handler to the caller.
	go handler.Run()

	return handler
}
// GetBinary returns a single-element slice holding a pointer to the handler's
// binary-data channel. The receiver is a value, so the pointer addresses this
// call's copy of the field; the channel it refers to is the shared one.
func (dch MyHandler) GetBinary() []*chan *[]byte {
	binary := &dch.binaryChan
	return []*chan *[]byte{binary}
}
// GetOpen returns a single-element slice holding a pointer to the handler's
// open-event channel.
func (dch MyHandler) GetOpen() []*chan *msginterfaces.OpenResponse {
	opened := &dch.openChan
	return []*chan *msginterfaces.OpenResponse{opened}
}
// GetMetadata returns a single-element slice holding a pointer to the
// handler's metadata channel.
func (dch MyHandler) GetMetadata() []*chan *msginterfaces.MetadataResponse {
	metadata := &dch.metadataChan
	return []*chan *msginterfaces.MetadataResponse{metadata}
}
// GetFlush returns a single-element slice holding a pointer to the handler's
// flushed-event channel.
func (dch MyHandler) GetFlush() []*chan *msginterfaces.FlushedResponse {
	flushed := &dch.flushChan
	return []*chan *msginterfaces.FlushedResponse{flushed}
}
// GetClear returns a single-element slice holding a pointer to the handler's
// cleared-event channel.
func (dch MyHandler) GetClear() []*chan *msginterfaces.ClearedResponse {
	cleared := &dch.clearChan
	return []*chan *msginterfaces.ClearedResponse{cleared}
}
// GetClose returns a single-element slice holding a pointer to the handler's
// close-event channel.
func (dch MyHandler) GetClose() []*chan *msginterfaces.CloseResponse {
	closed := &dch.closeChan
	return []*chan *msginterfaces.CloseResponse{closed}
}
// GetWarning returns a single-element slice holding a pointer to the handler's
// warning channel.
func (dch MyHandler) GetWarning() []*chan *msginterfaces.WarningResponse {
	warnings := &dch.warningChan
	return []*chan *msginterfaces.WarningResponse{warnings}
}
// GetError returns a single-element slice holding a pointer to the handler's
// error channel.
func (dch MyHandler) GetError() []*chan *msginterfaces.ErrorResponse {
	errs := &dch.errorChan
	return []*chan *msginterfaces.ErrorResponse{errs}
}
// GetUnhandled returns a single-element slice holding a pointer to the
// handler's channel for events that have no dedicated channel.
func (dch MyHandler) GetUnhandled() []*chan *[]byte {
	unhandled := &dch.unhandledChan
	return []*chan *[]byte{unhandled}
}
// Run starts one receiver goroutine per handler channel and blocks until every
// receiver has returned. A receiver returns only when its channel is closed;
// nothing in this file closes them (presumably the SDK does when the
// connection stops; verify against the SDK).
//
// Each receiver logs a marker for the event to stdout. In addition:
//   - binary payloads are appended to AUDIO_FILE (the file is opened and closed
//     per payload; a failed open or write is logged and the payload dropped);
//   - metadata events log their RequestID;
//   - warning and error events log their code, message, description and variant;
//   - unhandled events are dumped as a string.
//
// Run always returns nil.
// golintci: funlen
func (dch MyHandler) Run() error {
	wgReceivers := sync.WaitGroup{}

	// open channel
	wgReceivers.Add(1)
	go func() {
		defer wgReceivers.Done()

		for range dch.openChan {
			fmt.Printf("\n\n[OpenResponse]\n\n")
		}
	}()

	// binary channel
	wgReceivers.Add(1)
	go func() {
		defer wgReceivers.Done()

		for br := range dch.binaryChan {
			fmt.Printf("\n\n[Binary Data]\n")

			file, err := os.OpenFile(AUDIO_FILE, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o666)
			if err != nil {
				fmt.Printf("Failed to open file. Err: %v\n", err)
				continue
			}

			// Close before inspecting the write error so the handle is
			// released on both the success and the failure path.
			_, err = file.Write(*br)
			file.Close()

			if err != nil {
				fmt.Printf("Failed to write to file. Err: %v\n", err)
			}
		}
	}()

	// metadata channel
	wgReceivers.Add(1)
	go func() {
		defer wgReceivers.Done()

		for mr := range dch.metadataChan {
			// Label metadata events as such; they were previously logged
			// under the flushed-event marker.
			fmt.Printf("\n[MetadataResponse]\n")
			fmt.Printf("RequestID: %s\n", strings.TrimSpace(mr.RequestID))
		}
	}()

	// flushed channel
	wgReceivers.Add(1)
	go func() {
		defer wgReceivers.Done()

		for range dch.flushChan {
			fmt.Printf("\n[FlushedResponse]\n")
		}
	}()

	// cleared channel
	wgReceivers.Add(1)
	go func() {
		defer wgReceivers.Done()

		for range dch.clearChan {
			fmt.Printf("\n[ClearedResponse]\n")
		}
	}()

	// close channel
	wgReceivers.Add(1)
	go func() {
		defer wgReceivers.Done()

		for range dch.closeChan {
			fmt.Printf("\n\n[CloseResponse]\n\n")
		}
	}()

	// warning channel
	wgReceivers.Add(1)
	go func() {
		defer wgReceivers.Done()

		for er := range dch.warningChan {
			fmt.Printf("\n[WarningResponse]\n")
			fmt.Printf("\nWarning.Type: %s\n", er.WarnCode)
			fmt.Printf("Warning.Message: %s\n", er.WarnMsg)
			fmt.Printf("Warning.Description: %s\n\n", er.Description)
			fmt.Printf("Warning.Variant: %s\n\n", er.Variant)
		}
	}()

	// error channel
	wgReceivers.Add(1)
	go func() {
		defer wgReceivers.Done()

		for er := range dch.errorChan {
			fmt.Printf("\n[ErrorResponse]\n")
			fmt.Printf("\nError.Type: %s\n", er.ErrCode)
			fmt.Printf("Error.Message: %s\n", er.ErrMsg)
			fmt.Printf("Error.Description: %s\n\n", er.Description)
			fmt.Printf("Error.Variant: %s\n\n", er.Variant)
		}
	}()

	// unhandled event channel
	wgReceivers.Add(1)
	go func() {
		defer wgReceivers.Done()

		for byData := range dch.unhandledChan {
			fmt.Printf("\n[UnhandledEvent]")
			fmt.Printf("Dump:\n%s\n\n", string(*byData))
		}
	}()

	// wait for all receivers to finish
	wgReceivers.Wait()

	return nil
}
// main runs the streaming text-to-speech example end to end: it initialises
// the SDK, connects a channel-based websocket client, writes a WAV header to
// AUDIO_FILE, sends TTS_TEXT, flushes it, waits five seconds for the audio to
// arrive (the handler appends it to AUDIO_FILE), and then stops the client.
func main() {
	// init library
	speak.Init(speak.InitLib{
		LogLevel: speak.LogLevelDefault, // LogLevelDefault, LogLevelFull, LogLevelDebug, LogLevelTrace
	})

	// Go context
	ctx := context.Background()

	// set the Client options
	cOptions := &interfaces.ClientOptions{
		// AutoFlushSpeakDelta: 1000,
	}

	// set the TTS options
	ttsOptions := &interfaces.WSSpeakOptions{
		Model:      "aura-asteria-en",
		Encoding:   "linear16",
		SampleRate: 48000,
	}

	// create the handler whose channels receive the websocket events
	callback := NewMyHandler()

	// create a channel-based websocket client.
	// NOTE(review): the empty API key presumably makes the SDK fall back to
	// its default key lookup (e.g. an environment variable); confirm
	// against the SDK documentation.
	dgClient, err := speak.NewWSUsingChan(ctx, "", cOptions, ttsOptions, callback)
	if err != nil {
		fmt.Println("ERROR creating TTS connection:", err)
		return
	}

	// connect the websocket to Deepgram
	bConnected := dgClient.Connect()
	if !bConnected {
		fmt.Println("Client.Connect failed")
		os.Exit(1)
	}

	// Start the output file from scratch. Truncate instead of appending:
	// otherwise a second run would place a new WAV header after the previous
	// run's audio and corrupt the file.
	file, err := os.OpenFile(AUDIO_FILE, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o666)
	if err != nil {
		fmt.Printf("Failed to open file. Err: %v\n", err)
		return
	}

	// Add a wav audio container header to the file if you want to play the audio
	// using a media player like VLC, Media Player, or Apple Music.
	// The fields describe mono, 16-bit PCM at 48000 Hz, matching ttsOptions.
	// The two size fields are left at zero because the stream length is not
	// known up front.
	header := []byte{
		0x52, 0x49, 0x46, 0x46, // "RIFF"
		0x00, 0x00, 0x00, 0x00, // Placeholder for file size
		0x57, 0x41, 0x56, 0x45, // "WAVE"
		0x66, 0x6d, 0x74, 0x20, // "fmt "
		0x10, 0x00, 0x00, 0x00, // Chunk size (16)
		0x01, 0x00, // Audio format (1 for PCM)
		0x01, 0x00, // Number of channels (1)
		0x80, 0xbb, 0x00, 0x00, // Sample rate (48000)
		0x00, 0x77, 0x01, 0x00, // Byte rate (48000 * 1 channel * 2 bytes = 96000)
		0x02, 0x00, // Block align (2)
		0x10, 0x00, // Bits per sample (16)
		0x64, 0x61, 0x74, 0x61, // "data"
		0x00, 0x00, 0x00, 0x00, // Placeholder for data size
	}

	// Close unconditionally so the handle is released even when the write
	// fails, then report whichever step went wrong.
	_, err = file.Write(header)
	closeErr := file.Close()
	if err != nil {
		fmt.Printf("Failed to write header to file. Err: %v\n", err)
		return
	}
	if closeErr != nil {
		fmt.Printf("Failed to close file. Err: %v\n", closeErr)
		return
	}

	// Send the text input
	err = dgClient.SpeakWithText(TTS_TEXT)
	if err != nil {
		fmt.Printf("Error sending text input: %v\n", err)
		return
	}

	// If AutoFlushSpeakDelta is not set, you must flush the text input manually
	err = dgClient.Flush()
	if err != nil {
		fmt.Printf("Error flushing text input: %v\n", err)
		return
	}

	// give the audio five seconds to arrive before shutting down
	time.Sleep(5 * time.Second)

	// close the connection
	dgClient.Stop()

	fmt.Printf("Program exiting...\n")
}
Audio Output Streaming
The audio bytes representing the converted text will stream or be passed to the client via the above AudioData
event using the callback function.
It should be noted that these audio bytes are:
- Container-less audio. Depending on the `encoding` value chosen, only the raw audio data is sent. For example, if you choose `linear16` as your `encoding` for audio, a WAV header will not be sent. Please see the Tips and Tricks for more information.
- Not of standard size/length when received by the client. This is because the text is broken down into sounds representing the speech. Certain sounds chained together to form fragments of spoken words are different in length and content.
Depending on what the use case is for the generated audio bytes, please visit one of these guides to better help utilize these audio bytes for your use case:
- Sending LLM Outputs to a WebSocket
- Text Chunking for Streaming TTS Optimization
- Handling Audio Issues in Text To Speech
Where to Find Additional Examples
The SDK repository contains a good collection of text-to-speech examples, and the README contains links to them.
Go Channel-based Examples:
- Hello World - examples/text-to-speech/websocket/simple_channel
- Interactive - examples/text-to-speech/websocket/interactive_channel
Legacy Go Callback-based Examples
- Hello World - examples/text-to-speech/websocket/simple_callback
- Interactive - examples/text-to-speech/websocket/interactive_callback
Updated about 2 months ago