Text-to-Speech Streaming

An overview of the Deepgram Go SDK and Deepgram streaming text-to-speech.

Installing the SDK

To begin using Deepgram’s Text-to-Speech functionality, you need to install the Deepgram Go SDK in your existing project. You can do this using the following command:

Bash
# Install the Deepgram Go SDK
# https://github.com/deepgram/deepgram-go-sdk

go get github.com/deepgram/deepgram-go-sdk

Make a Deepgram Text-to-Speech Request

Go
1package main
2
3import (
4 "context"
5 "fmt"
6 "os"
7 "strings"
8 "sync"
9 "time"
10
11 msginterfaces "github.com/deepgram/deepgram-go-sdk/pkg/api/speak/v1/websocket/interfaces"
12 interfaces "github.com/deepgram/deepgram-go-sdk/pkg/client/interfaces/v1"
13 speak "github.com/deepgram/deepgram-go-sdk/pkg/client/speak"
14)
15
// TTS_TEXT is the sentence sent to Deepgram to be converted to speech.
const TTS_TEXT = "Hello, this is a text to speech example using Deepgram."

// AUDIO_FILE is the path of the file that receives the WAV header and the
// streamed audio bytes.
const AUDIO_FILE = "output.wav"
20
21type MyHandler struct {
22 binaryChan chan *[]byte
23 openChan chan *msginterfaces.OpenResponse
24 metadataChan chan *msginterfaces.MetadataResponse
25 flushChan chan *msginterfaces.FlushedResponse
26 clearChan chan *msginterfaces.ClearedResponse
27 closeChan chan *msginterfaces.CloseResponse
28 warningChan chan *msginterfaces.WarningResponse
29 errorChan chan *msginterfaces.ErrorResponse
30 unhandledChan chan *[]byte
31}
32
33func NewMyHandler() MyHandler {
34 handler := MyHandler{
35 binaryChan: make(chan *[]byte),
36 openChan: make(chan *msginterfaces.OpenResponse),
37 metadataChan: make(chan *msginterfaces.MetadataResponse),
38 flushChan: make(chan *msginterfaces.FlushedResponse),
39 clearChan: make(chan *msginterfaces.ClearedResponse),
40 closeChan: make(chan *msginterfaces.CloseResponse),
41 warningChan: make(chan *msginterfaces.WarningResponse),
42 errorChan: make(chan *msginterfaces.ErrorResponse),
43 unhandledChan: make(chan *[]byte),
44 }
45
46 go func() {
47 handler.Run()
48 }()
49
50 return handler
51}
52
53// GetUnhandled returns the binary event channels
54func (dch MyHandler) GetBinary() []*chan *[]byte {
55 return []*chan *[]byte{&dch.binaryChan}
56}
57
58// GetOpen returns the open channels
59func (dch MyHandler) GetOpen() []*chan *msginterfaces.OpenResponse {
60 return []*chan *msginterfaces.OpenResponse{&dch.openChan}
61}
62
63// GetMetadata returns the metadata channels
64func (dch MyHandler) GetMetadata() []*chan *msginterfaces.MetadataResponse {
65 return []*chan *msginterfaces.MetadataResponse{&dch.metadataChan}
66}
67
68// GetFlushed returns the flush channels
69func (dch MyHandler) GetFlush() []*chan *msginterfaces.FlushedResponse {
70 return []*chan *msginterfaces.FlushedResponse{&dch.flushChan}
71}
72
73// GetCleared returns the clear channels
74func (dch MyHandler) GetClear() []*chan *msginterfaces.ClearedResponse {
75 return []*chan *msginterfaces.ClearedResponse{&dch.clearChan}
76}
77
78// GetClose returns the close channels
79func (dch MyHandler) GetClose() []*chan *msginterfaces.CloseResponse {
80 return []*chan *msginterfaces.CloseResponse{&dch.closeChan}
81}
82
83// GetWarning returns the warning channels
84func (dch MyHandler) GetWarning() []*chan *msginterfaces.WarningResponse {
85 return []*chan *msginterfaces.WarningResponse{&dch.warningChan}
86}
87
88// GetError returns the error channels
89func (dch MyHandler) GetError() []*chan *msginterfaces.ErrorResponse {
90 return []*chan *msginterfaces.ErrorResponse{&dch.errorChan}
91}
92
93// GetUnhandled returns the unhandled event channels
94func (dch MyHandler) GetUnhandled() []*chan *[]byte {
95 return []*chan *[]byte{&dch.unhandledChan}
96}
97
98// Open is the callback for when the connection opens
99// golintci: funlen
100func (dch MyHandler) Run() error {
101 wgReceivers := sync.WaitGroup{}
102
103 // open channel
104 wgReceivers.Add(1)
105 go func() {
106 defer wgReceivers.Done()
107
108 for _ = range dch.openChan {
109 fmt.Printf("\n\n[OpenResponse]\n\n")
110 }
111 }()
112
113 // binary channel
114 wgReceivers.Add(1)
115 go func() {
116 defer wgReceivers.Done()
117
118 for br := range dch.binaryChan {
119 fmt.Printf("\n\n[Binary Data]\n")
120
121 file, err := os.OpenFile(AUDIO_FILE, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o666)
122 if err != nil {
123 fmt.Printf("Failed to open file. Err: %v\n", err)
124 continue
125 }
126
127 _, err = file.Write(*br)
128 file.Close()
129
130 if err != nil {
131 fmt.Printf("Failed to write to file. Err: %v\n", err)
132 continue
133 }
134 }
135 }()
136
137 // metadata channel
138 wgReceivers.Add(1)
139 go func() {
140 defer wgReceivers.Done()
141
142 for mr := range dch.metadataChan {
143 fmt.Printf("\n[FlushedResponse]\n")
144 fmt.Printf("RequestID: %s\n", strings.TrimSpace(mr.RequestID))
145 }
146 }()
147
148 // flushed channel
149 wgReceivers.Add(1)
150 go func() {
151 defer wgReceivers.Done()
152
153 for _ = range dch.flushChan {
154 fmt.Printf("\n[FlushedResponse]\n")
155 }
156 }()
157
158 // cleared channel
159 wgReceivers.Add(1)
160 go func() {
161 defer wgReceivers.Done()
162
163 for _ = range dch.clearChan {
164 fmt.Printf("\n[ClearedResponse]\n")
165 }
166 }()
167
168 // close channel
169 wgReceivers.Add(1)
170 go func() {
171 defer wgReceivers.Done()
172
173 for _ = range dch.closeChan {
174 fmt.Printf("\n\n[CloseResponse]\n\n")
175 }
176 }()
177
178 // warning channel
179 wgReceivers.Add(1)
180 go func() {
181 defer wgReceivers.Done()
182
183 for er := range dch.warningChan {
184 fmt.Printf("\n[WarningResponse]\n")
185 fmt.Printf("\nWarning.Type: %s\n", er.WarnCode)
186 fmt.Printf("Warning.Message: %s\n", er.WarnMsg)
187 fmt.Printf("Warning.Description: %s\n\n", er.Description)
188 fmt.Printf("Warning.Variant: %s\n\n", er.Variant)
189 }
190 }()
191
192 // error channel
193 wgReceivers.Add(1)
194 go func() {
195 defer wgReceivers.Done()
196
197 for er := range dch.errorChan {
198 fmt.Printf("\n[ErrorResponse]\n")
199 fmt.Printf("\nError.Type: %s\n", er.ErrCode)
200 fmt.Printf("Error.Message: %s\n", er.ErrMsg)
201 fmt.Printf("Error.Description: %s\n\n", er.Description)
202 fmt.Printf("Error.Variant: %s\n\n", er.Variant)
203 }
204 }()
205
206 // unhandled event channel
207 wgReceivers.Add(1)
208 go func() {
209 defer wgReceivers.Done()
210
211 for byData := range dch.unhandledChan {
212 fmt.Printf("\n[UnhandledEvent]")
213 fmt.Printf("Dump:\n%s\n\n", string(*byData))
214 }
215 }()
216
217 // wait for all receivers to finish
218 wgReceivers.Wait()
219
220 return nil
221}
222
223func main() {
224 // init library
225 speak.Init(speak.InitLib{
226 LogLevel: speak.LogLevelDefault, // LogLevelDefault, LogLevelFull, LogLevelDebug, LogLevelTrace
227 })
228
229 // Go context
230 ctx := context.Background()
231
232 // set the Client options
233 cOptions := &interfaces.ClientOptions{
234 // AutoFlushSpeakDelta: 1000,
235 }
236
237 // set the TTS options
238 ttsOptions := &interfaces.WSSpeakOptions{
239 Model: "aura-asteria-en",
240 Encoding: "linear16",
241 SampleRate: 48000,
242 }
243
244 // create the callback
245 callback := NewMyHandler()
246
247 // create a new stream using the NewStream function
248 dgClient, err := speak.NewWSUsingChan(ctx, "", cOptions, ttsOptions, callback)
249 if err != nil {
250 fmt.Println("ERROR creating TTS connection:", err)
251 return
252 }
253
254 // connect the websocket to Deepgram
255 bConnected := dgClient.Connect()
256 if !bConnected {
257 fmt.Println("Client.Connect failed")
258 os.Exit(1)
259 }
260
261 file, err := os.OpenFile(AUDIO_FILE, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o666)
262 if err != nil {
263 fmt.Printf("Failed to open file. Err: %v\n", err)
264 return
265 }
266 // Add a wav audio container header to the file if you want to play the audio
267 // using a media player like VLC, Media Player, or Apple Music
268 header := []byte{
269 0x52, 0x49, 0x46, 0x46, // "RIFF"
270 0x00, 0x00, 0x00, 0x00, // Placeholder for file size
271 0x57, 0x41, 0x56, 0x45, // "WAVE"
272 0x66, 0x6d, 0x74, 0x20, // "fmt "
273 0x10, 0x00, 0x00, 0x00, // Chunk size (16)
274 0x01, 0x00, // Audio format (1 for PCM)
275 0x01, 0x00, // Number of channels (1)
276 0x80, 0xbb, 0x00, 0x00, // Sample rate (48000)
277 0x00, 0xee, 0x02, 0x00, // Byte rate (48000 * 2)
278 0x02, 0x00, // Block align (2)
279 0x10, 0x00, // Bits per sample (16)
280 0x64, 0x61, 0x74, 0x61, // "data"
281 0x00, 0x00, 0x00, 0x00, // Placeholder for data size
282 }
283
284 _, err = file.Write(header)
285 if err != nil {
286 fmt.Printf("Failed to write header to file. Err: %v\n", err)
287 return
288 }
289 file.Close()
290
291 // Send the text input
292 err = dgClient.SpeakWithText(TTS_TEXT)
293 if err != nil {
294 fmt.Printf("Error sending text input: %v\n", err)
295 return
296 }
297
298 // If AutoFlushSpeakDelta is not set, you Flush the text input manually
299 err = dgClient.Flush()
300 if err != nil {
301 fmt.Printf("Error sending text input: %v\n", err)
302 return
303 }
304
305 // wait for user input to exit
306 time.Sleep(5 * time.Second)
307
308 // close the connection
309 dgClient.Stop()
310
311 fmt.Printf("Program exiting...\n")
312}

Audio Output Streaming

The audio bytes representing the converted text are streamed to the client as binary messages. In the example above, they arrive on the handler's binary channel (the one returned by GetBinary) and are appended to the output file.

It should be noted that these audio bytes are:

  • Container-less audio. Only the raw audio data is sent, in the encoding you chose. For example, if you choose linear16 as your encoding, no WAV header is sent. Please see the Tips and Tricks for more information.
  • Not of standard size/length when received by the client. This is because the text is broken down into sounds representing the speech. Certain sounds chained together to form fragments of spoken words are different in length and content.

Depending on your use case for the generated audio bytes, one of these guides can help you make better use of them:

Where to Find Additional Examples

The SDK repository contains a good collection of text-to-speech examples, and the README contains links to them.

Go Channel-based Examples:

Legacy Go Callback-based Examples

Built with