Text-to-Speech Streaming — Deepgram

Aura-2 is currently available for the TTS REST API only. Websocket support is coming soon.

Installing the SDK

To begin using Deepgram’s Text-to-Speech functionality, you need to install the Deepgram Go SDK in your existing project. You can do this using the following command:

Bash

$ # Install the Deepgram Go SDK
> # https://github.com/deepgram/deepgram-go-sdk
> 
> go get github.com/deepgram/deepgram-go-sdk

Make a Deepgram Text-to-Speech Request

1 package main
2 
3 import (
4 	"context"
5 	"fmt"
6 	"os"
7 	"strings"
8 	"sync"
9 	"time"
10 
11 	msginterfaces "github.com/deepgram/deepgram-go-sdk/pkg/api/speak/v1/websocket/interfaces"
12 	interfaces "github.com/deepgram/deepgram-go-sdk/pkg/client/interfaces/v1"
13 	speak "github.com/deepgram/deepgram-go-sdk/pkg/client/speak"
14 )
15 
// TTS_TEXT is the text sent to Deepgram to be converted to speech.
const TTS_TEXT = "Hello, this is a text to speech example using Deepgram."

// AUDIO_FILE is the path of the file that receives the generated audio.
const AUDIO_FILE = "output.wav"
20 
21 type MyHandler struct {
22 	binaryChan    chan *[]byte
23 	openChan      chan *msginterfaces.OpenResponse
24 	metadataChan  chan *msginterfaces.MetadataResponse
25 	flushChan     chan *msginterfaces.FlushedResponse
26 	clearChan     chan *msginterfaces.ClearedResponse
27 	closeChan     chan *msginterfaces.CloseResponse
28 	warningChan   chan *msginterfaces.WarningResponse
29 	errorChan     chan *msginterfaces.ErrorResponse
30 	unhandledChan chan *[]byte
31 }
32 
33 func NewMyHandler() MyHandler {
34 	handler := MyHandler{
35 		binaryChan:    make(chan *[]byte),
36 		openChan:      make(chan *msginterfaces.OpenResponse),
37 		metadataChan:  make(chan *msginterfaces.MetadataResponse),
38 		flushChan:     make(chan *msginterfaces.FlushedResponse),
39 		clearChan:     make(chan *msginterfaces.ClearedResponse),
40 		closeChan:     make(chan *msginterfaces.CloseResponse),
41 		warningChan:   make(chan *msginterfaces.WarningResponse),
42 		errorChan:     make(chan *msginterfaces.ErrorResponse),
43 		unhandledChan: make(chan *[]byte),
44 	}
45 
46 	go func() {
47 		handler.Run()
48 	}()
49 
50 	return handler
51 }
52 
53 // GetUnhandled returns the binary event channels
54 func (dch MyHandler) GetBinary() []*chan *[]byte {
55 	return []*chan *[]byte{&dch.binaryChan}
56 }
57 
58 // GetOpen returns the open channels
59 func (dch MyHandler) GetOpen() []*chan *msginterfaces.OpenResponse {
60 	return []*chan *msginterfaces.OpenResponse{&dch.openChan}
61 }
62 
63 // GetMetadata returns the metadata channels
64 func (dch MyHandler) GetMetadata() []*chan *msginterfaces.MetadataResponse {
65 	return []*chan *msginterfaces.MetadataResponse{&dch.metadataChan}
66 }
67 
68 // GetFlushed returns the flush channels
69 func (dch MyHandler) GetFlush() []*chan *msginterfaces.FlushedResponse {
70 	return []*chan *msginterfaces.FlushedResponse{&dch.flushChan}
71 }
72 
73 // GetCleared returns the clear channels
74 func (dch MyHandler) GetClear() []*chan *msginterfaces.ClearedResponse {
75 	return []*chan *msginterfaces.ClearedResponse{&dch.clearChan}
76 }
77 
78 // GetClose returns the close channels
79 func (dch MyHandler) GetClose() []*chan *msginterfaces.CloseResponse {
80 	return []*chan *msginterfaces.CloseResponse{&dch.closeChan}
81 }
82 
83 // GetWarning returns the warning channels
84 func (dch MyHandler) GetWarning() []*chan *msginterfaces.WarningResponse {
85 	return []*chan *msginterfaces.WarningResponse{&dch.warningChan}
86 }
87 
88 // GetError returns the error channels
89 func (dch MyHandler) GetError() []*chan *msginterfaces.ErrorResponse {
90 	return []*chan *msginterfaces.ErrorResponse{&dch.errorChan}
91 }
92 
93 // GetUnhandled returns the unhandled event channels
94 func (dch MyHandler) GetUnhandled() []*chan *[]byte {
95 	return []*chan *[]byte{&dch.unhandledChan}
96 }
97 
98 // Open is the callback for when the connection opens
99 // golintci: funlen
100 func (dch MyHandler) Run() error {
101 	wgReceivers := sync.WaitGroup{}
102 
103 	// open channel
104 	wgReceivers.Add(1)
105 	go func() {
106 		defer wgReceivers.Done()
107 
108 		for _ = range dch.openChan {
109 			fmt.Printf("\n\n[OpenResponse]\n\n")
110 		}
111 	}()
112 
113 	// binary channel
114 	wgReceivers.Add(1)
115 	go func() {
116 		defer wgReceivers.Done()
117 
118 		for br := range dch.binaryChan {
119 			fmt.Printf("\n\n[Binary Data]\n")
120 
121 			file, err := os.OpenFile(AUDIO_FILE, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o666)
122 			if err != nil {
123 				fmt.Printf("Failed to open file. Err: %v\n", err)
124 				continue
125 			}
126 
127 			_, err = file.Write(*br)
128 			file.Close()
129 
130 			if err != nil {
131 				fmt.Printf("Failed to write to file. Err: %v\n", err)
132 				continue
133 			}
134 		}
135 	}()
136 
137 	// metadata channel
138 	wgReceivers.Add(1)
139 	go func() {
140 		defer wgReceivers.Done()
141 
142 		for mr := range dch.metadataChan {
143 			fmt.Printf("\n[FlushedResponse]\n")
144 			fmt.Printf("RequestID: %s\n", strings.TrimSpace(mr.RequestID))
145 		}
146 	}()
147 
148 	// flushed channel
149 	wgReceivers.Add(1)
150 	go func() {
151 		defer wgReceivers.Done()
152 
153 		for _ = range dch.flushChan {
154 			fmt.Printf("\n[FlushedResponse]\n")
155 		}
156 	}()
157 
158 	// cleared channel
159 	wgReceivers.Add(1)
160 	go func() {
161 		defer wgReceivers.Done()
162 
163 		for _ = range dch.clearChan {
164 			fmt.Printf("\n[ClearedResponse]\n")
165 		}
166 	}()
167 
168 	// close channel
169 	wgReceivers.Add(1)
170 	go func() {
171 		defer wgReceivers.Done()
172 
173 		for _ = range dch.closeChan {
174 			fmt.Printf("\n\n[CloseResponse]\n\n")
175 		}
176 	}()
177 
178 	// warning channel
179 	wgReceivers.Add(1)
180 	go func() {
181 		defer wgReceivers.Done()
182 
183 		for er := range dch.warningChan {
184 			fmt.Printf("\n[WarningResponse]\n")
185 			fmt.Printf("\nWarning.Type: %s\n", er.WarnCode)
186 			fmt.Printf("Warning.Message: %s\n", er.WarnMsg)
187 			fmt.Printf("Warning.Description: %s\n\n", er.Description)
188 			fmt.Printf("Warning.Variant: %s\n\n", er.Variant)
189 		}
190 	}()
191 
192 	// error channel
193 	wgReceivers.Add(1)
194 	go func() {
195 		defer wgReceivers.Done()
196 
197 		for er := range dch.errorChan {
198 			fmt.Printf("\n[ErrorResponse]\n")
199 			fmt.Printf("\nError.Type: %s\n", er.ErrCode)
200 			fmt.Printf("Error.Message: %s\n", er.ErrMsg)
201 			fmt.Printf("Error.Description: %s\n\n", er.Description)
202 			fmt.Printf("Error.Variant: %s\n\n", er.Variant)
203 		}
204 	}()
205 
206 	// unhandled event channel
207 	wgReceivers.Add(1)
208 	go func() {
209 		defer wgReceivers.Done()
210 
211 		for byData := range dch.unhandledChan {
212 			fmt.Printf("\n[UnhandledEvent]")
213 			fmt.Printf("Dump:\n%s\n\n", string(*byData))
214 		}
215 	}()
216 
217 	// wait for all receivers to finish
218 	wgReceivers.Wait()
219 
220 	return nil
221 }
222 
223 func main() {
224 	// init library
225 	speak.Init(speak.InitLib{
226 		LogLevel: speak.LogLevelDefault, // LogLevelDefault, LogLevelFull, LogLevelDebug, LogLevelTrace
227 	})
228 
229 	// Go context
230 	ctx := context.Background()
231 
232 	// set the Client options
233 	cOptions := &interfaces.ClientOptions{
234 		// AutoFlushSpeakDelta: 1000,
235 	}
236 
237 	// set the TTS options
238 	ttsOptions := &interfaces.WSSpeakOptions{
239 		Model:      "aura-asteria-en",
240 		Encoding:   "linear16",
241 		SampleRate: 48000,
242 	}
243 
244 	// create the callback
245 	callback := NewMyHandler()
246 
247 	// create a new stream using the NewStream function
248 	dgClient, err := speak.NewWSUsingChan(ctx, "", cOptions, ttsOptions, callback)
249 	if err != nil {
250 		fmt.Println("ERROR creating TTS connection:", err)
251 		return
252 	}
253 
254 	// connect the websocket to Deepgram
255 	bConnected := dgClient.Connect()
256 	if !bConnected {
257 		fmt.Println("Client.Connect failed")
258 		os.Exit(1)
259 	}
260 
261 	file, err := os.OpenFile(AUDIO_FILE, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o666)
262 	if err != nil {
263 		fmt.Printf("Failed to open file. Err: %v\n", err)
264 		return
265 	}
266 	// Add a wav audio container header to the file if you want to play the audio
267 	// using a media player like VLC, Media Player, or Apple Music
268 	header := []byte{
269 		0x52, 0x49, 0x46, 0x46, // "RIFF"
270 		0x00, 0x00, 0x00, 0x00, // Placeholder for file size
271 		0x57, 0x41, 0x56, 0x45, // "WAVE"
272 		0x66, 0x6d, 0x74, 0x20, // "fmt "
273 		0x10, 0x00, 0x00, 0x00, // Chunk size (16)
274 		0x01, 0x00, // Audio format (1 for PCM)
275 		0x01, 0x00, // Number of channels (1)
276 		0x80, 0xbb, 0x00, 0x00, // Sample rate (48000)
277 		0x00, 0xee, 0x02, 0x00, // Byte rate (48000 * 2)
278 		0x02, 0x00, // Block align (2)
279 		0x10, 0x00, // Bits per sample (16)
280 		0x64, 0x61, 0x74, 0x61, // "data"
281 		0x00, 0x00, 0x00, 0x00, // Placeholder for data size
282 	}
283 
284 	_, err = file.Write(header)
285 	if err != nil {
286 		fmt.Printf("Failed to write header to file. Err: %v\n", err)
287 		return
288 	}
289 	file.Close()
290 
291 	// Send the text input
292 	err = dgClient.SpeakWithText(TTS_TEXT)
293 	if err != nil {
294 		fmt.Printf("Error sending text input: %v\n", err)
295 		return
296 	}
297 
298 	// If AutoFlushSpeakDelta is not set, you Flush the text input manually
299 	err = dgClient.Flush()
300 	if err != nil {
301 		fmt.Printf("Error sending text input: %v\n", err)
302 		return
303 	}
304 
305 	// wait for user input to exit
306 	time.Sleep(5 * time.Second)
307 
308 	// close the connection
309 	dgClient.Stop()
310 
311 	fmt.Printf("Program exiting...\n")
312 }

Audio Output Streaming

The audio bytes representing the converted text will stream or be passed to the client via the above AudioData event using the callback function.

It should be noted that these audio bytes are:

Container-less audio. Only the raw audio data for the chosen encoding is sent, with no container around it. For example, if you choose linear16 as your encoding, a WAV header will not be sent. Please see the Tips and Tricks for more information.
Not of standard size/length when received by the client. This is because the text is broken down into sounds representing the speech. Certain sounds chained together to form fragments of spoken words are different in length and content.

Depending on what the use case is for the generated audio bytes, please visit one of these guides to better help utilize these audio bytes for your use case:

Where to Find Additional Examples

The SDK repository contains a good collection of text-to-speech examples, and the README contains links to them.

Go Channel-based Examples:

Hello World - examples/text-to-speech/websocket/simple_channel
Interactive - examples/text-to-speech/websocket/interactive_channel

Legacy Go Callback-based Examples

Hello World - examples/text-to-speech/websocket/simple_callback
Interactive - examples/text-to-speech/websocket/interactive_callback

$	# Install the Deepgram Go SDK
>	# https://github.com/deepgram/deepgram-go-sdk
>
>	go get github.com/deepgram/deepgram-go-sdk

1	package main
2
3	import (
4	"context"
5	"fmt"
6	"os"
7	"strings"
8	"sync"
9	"time"
10
11	msginterfaces "github.com/deepgram/deepgram-go-sdk/pkg/api/speak/v1/websocket/interfaces"
12	interfaces "github.com/deepgram/deepgram-go-sdk/pkg/client/interfaces/v1"
13	speak "github.com/deepgram/deepgram-go-sdk/pkg/client/speak"
14	)
15
// TTS_TEXT is the text sent to Deepgram to be converted to speech.
const TTS_TEXT = "Hello, this is a text to speech example using Deepgram."

// AUDIO_FILE is the path of the file that receives the generated audio.
const AUDIO_FILE = "output.wav"
20
21	type MyHandler struct {
22	binaryChan chan *[]byte
23	openChan chan *msginterfaces.OpenResponse
24	metadataChan chan *msginterfaces.MetadataResponse
25	flushChan chan *msginterfaces.FlushedResponse
26	clearChan chan *msginterfaces.ClearedResponse
27	closeChan chan *msginterfaces.CloseResponse
28	warningChan chan *msginterfaces.WarningResponse
29	errorChan chan *msginterfaces.ErrorResponse
30	unhandledChan chan *[]byte
31	}
32
33	func NewMyHandler() MyHandler {
34	handler := MyHandler{
35	binaryChan: make(chan *[]byte),
36	openChan: make(chan *msginterfaces.OpenResponse),
37	metadataChan: make(chan *msginterfaces.MetadataResponse),
38	flushChan: make(chan *msginterfaces.FlushedResponse),
39	clearChan: make(chan *msginterfaces.ClearedResponse),
40	closeChan: make(chan *msginterfaces.CloseResponse),
41	warningChan: make(chan *msginterfaces.WarningResponse),
42	errorChan: make(chan *msginterfaces.ErrorResponse),
43	unhandledChan: make(chan *[]byte),
44	}
45
46	go func() {
47	handler.Run()
48	}()
49
50	return handler
51	}
52
53	// GetUnhandled returns the binary event channels
54	func (dch MyHandler) GetBinary() []chan []byte {
55	return []chan []byte{&dch.binaryChan}
56	}
57
58	// GetOpen returns the open channels
59	func (dch MyHandler) GetOpen() []chan msginterfaces.OpenResponse {
60	return []chan msginterfaces.OpenResponse{&dch.openChan}
61	}
62
63	// GetMetadata returns the metadata channels
64	func (dch MyHandler) GetMetadata() []chan msginterfaces.MetadataResponse {
65	return []chan msginterfaces.MetadataResponse{&dch.metadataChan}
66	}
67
68	// GetFlushed returns the flush channels
69	func (dch MyHandler) GetFlush() []chan msginterfaces.FlushedResponse {
70	return []chan msginterfaces.FlushedResponse{&dch.flushChan}
71	}
72
73	// GetCleared returns the clear channels
74	func (dch MyHandler) GetClear() []chan msginterfaces.ClearedResponse {
75	return []chan msginterfaces.ClearedResponse{&dch.clearChan}
76	}
77
78	// GetClose returns the close channels
79	func (dch MyHandler) GetClose() []chan msginterfaces.CloseResponse {
80	return []chan msginterfaces.CloseResponse{&dch.closeChan}
81	}
82
83	// GetWarning returns the warning channels
84	func (dch MyHandler) GetWarning() []chan msginterfaces.WarningResponse {
85	return []chan msginterfaces.WarningResponse{&dch.warningChan}
86	}
87
88	// GetError returns the error channels
89	func (dch MyHandler) GetError() []chan msginterfaces.ErrorResponse {
90	return []chan msginterfaces.ErrorResponse{&dch.errorChan}
91	}
92
93	// GetUnhandled returns the unhandled event channels
94	func (dch MyHandler) GetUnhandled() []chan []byte {
95	return []chan []byte{&dch.unhandledChan}
96	}
97
98	// Open is the callback for when the connection opens
99	// golintci: funlen
100	func (dch MyHandler) Run() error {
101	wgReceivers := sync.WaitGroup{}
102
103	// open channel
104	wgReceivers.Add(1)
105	go func() {
106	defer wgReceivers.Done()
107
108	for _ = range dch.openChan {
109	fmt.Printf("\n\n[OpenResponse]\n\n")
110	}
111	}()
112
113	// binary channel
114	wgReceivers.Add(1)
115	go func() {
116	defer wgReceivers.Done()
117
118	for br := range dch.binaryChan {
119	fmt.Printf("\n\n[Binary Data]\n")
120
121	file, err := os.OpenFile(AUDIO_FILE, os.O_APPEND\|os.O_CREATE\|os.O_WRONLY, 0o666)
122	if err != nil {
123	fmt.Printf("Failed to open file. Err: %v\n", err)
124	continue
125	}
126
127	_, err = file.Write(*br)
128	file.Close()
129
130	if err != nil {
131	fmt.Printf("Failed to write to file. Err: %v\n", err)
132	continue
133	}
134	}
135	}()
136
137	// metadata channel
138	wgReceivers.Add(1)
139	go func() {
140	defer wgReceivers.Done()
141
142	for mr := range dch.metadataChan {
143	fmt.Printf("\n[FlushedResponse]\n")
144	fmt.Printf("RequestID: %s\n", strings.TrimSpace(mr.RequestID))
145	}
146	}()
147
148	// flushed channel
149	wgReceivers.Add(1)
150	go func() {
151	defer wgReceivers.Done()
152
153	for _ = range dch.flushChan {
154	fmt.Printf("\n[FlushedResponse]\n")
155	}
156	}()
157
158	// cleared channel
159	wgReceivers.Add(1)
160	go func() {
161	defer wgReceivers.Done()
162
163	for _ = range dch.clearChan {
164	fmt.Printf("\n[ClearedResponse]\n")
165	}
166	}()
167
168	// close channel
169	wgReceivers.Add(1)
170	go func() {
171	defer wgReceivers.Done()
172
173	for _ = range dch.closeChan {
174	fmt.Printf("\n\n[CloseResponse]\n\n")
175	}
176	}()
177
178	// warning channel
179	wgReceivers.Add(1)
180	go func() {
181	defer wgReceivers.Done()
182
183	for er := range dch.warningChan {
184	fmt.Printf("\n[WarningResponse]\n")
185	fmt.Printf("\nWarning.Type: %s\n", er.WarnCode)
186	fmt.Printf("Warning.Message: %s\n", er.WarnMsg)
187	fmt.Printf("Warning.Description: %s\n\n", er.Description)
188	fmt.Printf("Warning.Variant: %s\n\n", er.Variant)
189	}
190	}()
191
192	// error channel
193	wgReceivers.Add(1)
194	go func() {
195	defer wgReceivers.Done()
196
197	for er := range dch.errorChan {
198	fmt.Printf("\n[ErrorResponse]\n")
199	fmt.Printf("\nError.Type: %s\n", er.ErrCode)
200	fmt.Printf("Error.Message: %s\n", er.ErrMsg)
201	fmt.Printf("Error.Description: %s\n\n", er.Description)
202	fmt.Printf("Error.Variant: %s\n\n", er.Variant)
203	}
204	}()
205
206	// unhandled event channel
207	wgReceivers.Add(1)
208	go func() {
209	defer wgReceivers.Done()
210
211	for byData := range dch.unhandledChan {
212	fmt.Printf("\n[UnhandledEvent]")
213	fmt.Printf("Dump:\n%s\n\n", string(*byData))
214	}
215	}()
216
217	// wait for all receivers to finish
218	wgReceivers.Wait()
219
220	return nil
221	}
222
223	func main() {
224	// init library
225	speak.Init(speak.InitLib{
226	LogLevel: speak.LogLevelDefault, // LogLevelDefault, LogLevelFull, LogLevelDebug, LogLevelTrace
227	})
228
229	// Go context
230	ctx := context.Background()
231
232	// set the Client options
233	cOptions := &interfaces.ClientOptions{
234	// AutoFlushSpeakDelta: 1000,
235	}
236
237	// set the TTS options
238	ttsOptions := &interfaces.WSSpeakOptions{
239	Model: "aura-asteria-en",
240	Encoding: "linear16",
241	SampleRate: 48000,
242	}
243
244	// create the callback
245	callback := NewMyHandler()
246
247	// create a new stream using the NewStream function
248	dgClient, err := speak.NewWSUsingChan(ctx, "", cOptions, ttsOptions, callback)
249	if err != nil {
250	fmt.Println("ERROR creating TTS connection:", err)
251	return
252	}
253
254	// connect the websocket to Deepgram
255	bConnected := dgClient.Connect()
256	if !bConnected {
257	fmt.Println("Client.Connect failed")
258	os.Exit(1)
259	}
260
261	file, err := os.OpenFile(AUDIO_FILE, os.O_APPEND\|os.O_CREATE\|os.O_WRONLY, 0o666)
262	if err != nil {
263	fmt.Printf("Failed to open file. Err: %v\n", err)
264	return
265	}
266	// Add a wav audio container header to the file if you want to play the audio
267	// using a media player like VLC, Media Player, or Apple Music
268	header := []byte{
269	0x52, 0x49, 0x46, 0x46, // "RIFF"
270	0x00, 0x00, 0x00, 0x00, // Placeholder for file size
271	0x57, 0x41, 0x56, 0x45, // "WAVE"
272	0x66, 0x6d, 0x74, 0x20, // "fmt "
273	0x10, 0x00, 0x00, 0x00, // Chunk size (16)
274	0x01, 0x00, // Audio format (1 for PCM)
275	0x01, 0x00, // Number of channels (1)
276	0x80, 0xbb, 0x00, 0x00, // Sample rate (48000)
277	0x00, 0xee, 0x02, 0x00, // Byte rate (48000 * 2)
278	0x02, 0x00, // Block align (2)
279	0x10, 0x00, // Bits per sample (16)
280	0x64, 0x61, 0x74, 0x61, // "data"
281	0x00, 0x00, 0x00, 0x00, // Placeholder for data size
282	}
283
284	_, err = file.Write(header)
285	if err != nil {
286	fmt.Printf("Failed to write header to file. Err: %v\n", err)
287	return
288	}
289	file.Close()
290
291	// Send the text input
292	err = dgClient.SpeakWithText(TTS_TEXT)
293	if err != nil {
294	fmt.Printf("Error sending text input: %v\n", err)
295	return
296	}
297
298	// If AutoFlushSpeakDelta is not set, you Flush the text input manually
299	err = dgClient.Flush()
300	if err != nil {
301	fmt.Printf("Error sending text input: %v\n", err)
302	return
303	}
304
305	// wait for user input to exit
306	time.Sleep(5 * time.Second)
307
308	// close the connection
309	dgClient.Stop()
310
311	fmt.Printf("Program exiting...\n")
312	}