Text to Speech Streaming — Deepgram

Aura-2 is currently available for the TTS REST API only. WebSocket support is coming soon.

Installing the SDK

To begin using Deepgram’s Text-to-Speech functionality, you need to install the Deepgram .NET SDK in your existing project. You can do this using the following command:

# Install the Deepgram .NET SDK
# https://github.com/deepgram/deepgram-dotnet-sdk
# Run these commands from the directory that contains your project file (.csproj).

dotnet add package Deepgram

# Optionally, install the Deepgram Microphone package
dotnet add package Deepgram.Microphone

Make a Deepgram Text-to-Speech Request

using System.Text;

using Deepgram.Models.Authenticate.v1;
using Deepgram.Models.Speak.v1.WebSocket;
using Deepgram.Logger;

namespace SampleApp
{
    /// <summary>
    /// Minimal console sample: connects a Deepgram Speak WebSocket client, sends one
    /// sentence of text, and appends every audio chunk it receives to a local file.
    /// </summary>
    class Program
    {
        // File that receives the audio chunks, in the order they arrive.
        private const string OutputPath = "output.mp3";

        /// <summary>
        /// Subscribes to the client's responses, sends "Hello World!", flushes, and keeps
        /// the connection open until a key is pressed.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static async Task Main(string[] args)
        {
            // Initialize Library with default logging
            Library.Initialize();

            // Use the client factory with an API key set in the "DEEPGRAM_API_KEY" environment variable
            var speakClient = ClientFactory.CreateSpeakWebSocketClient();

            // The audio handler below opens the output file in append mode, so remove any
            // file left behind by a previous run; otherwise the new audio would be appended
            // to stale audio. File.Delete does nothing when the file does not exist.
            File.Delete(OutputPath);

            // Subscribe one handler per response type the client can raise
            speakClient.Subscribe(new EventHandler<OpenResponse>((sender, e) =>
            {
                Console.WriteLine($"\n\n----> {e.Type} received");
            }));
            speakClient.Subscribe(new EventHandler<MetadataResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received");
                Console.WriteLine($"----> RequestId: {e.RequestId}");
            }));
            speakClient.Subscribe(new EventHandler<AudioResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received");

                // Nothing to write when the response carries no audio stream.
                if (e.Stream == null)
                {
                    return;
                }

                // Append this chunk to the output file exactly as received.
                // NOTE(review): whether the resulting file is playable as-is depends on the
                // encoding the connection produces - confirm before relying on the ".mp3" name.
                using (var writer = new BinaryWriter(File.Open(OutputPath, FileMode.Append)))
                {
                    writer.Write(e.Stream.ToArray());
                }
            }));
            speakClient.Subscribe(new EventHandler<FlushedResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received");
            }));
            speakClient.Subscribe(new EventHandler<ClearedResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received");
            }));
            speakClient.Subscribe(new EventHandler<CloseResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received");
            }));
            speakClient.Subscribe(new EventHandler<UnhandledResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received");
            }));
            speakClient.Subscribe(new EventHandler<WarningResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received");
            }));
            speakClient.Subscribe(new EventHandler<ErrorResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received. Error: {e.Message}");
            }));

            // Start the connection
            var speakSchema = new SpeakSchema();
            await speakClient.Connect(speakSchema);

            // Send some text to convert to audio
            speakClient.SpeakWithText("Hello World!");

            // Flush the audio
            speakClient.Flush();

            // Wait for the user to press a key
            Console.WriteLine("\n\nPress any key to stop and exit...\n\n\n");
            Console.ReadKey();

            // Stop the connection
            await speakClient.Stop();

            // Terminate Libraries
            Library.Terminate();
        }
    }
}

Audio Output Streaming

The audio bytes representing the converted text are streamed to the client through the AudioResponse event, which invokes the handler (callback function) subscribed in the example above.

It should be noted that these audio bytes are:

- Container-less audio: depending on the encoding value chosen, only the raw audio data is sent. For example, if you choose linear16 as your encoding for audio, a WAV header will not be sent. Please see the Tips and Tricks for more information.
- Not of a standard size/length when received by the client. This is because the text is broken down into sounds representing the speech, and the sounds chained together to form fragments of spoken words differ in length and content.

Depending on your use case for the generated audio bytes, please refer to the corresponding Deepgram guide to learn how best to use them.

Where To Find Additional Examples

The SDK repository contains a good collection of text-to-speech examples, and the README contains links to them.

Some Example(s):

Example “Hello World” - examples/text-to-speech/websocket/simple

# Install the Deepgram .NET SDK
# https://github.com/deepgram/deepgram-dotnet-sdk
# Run these commands from the directory that contains your project file (.csproj).

dotnet add package Deepgram

# Optionally, install the Deepgram Microphone package
dotnet add package Deepgram.Microphone

using System.Text;

using Deepgram.Models.Authenticate.v1;
using Deepgram.Models.Speak.v1.WebSocket;
using Deepgram.Logger;

namespace SampleApp
{
    /// <summary>
    /// Minimal console sample: connects a Deepgram Speak WebSocket client, sends one
    /// sentence of text, and appends every audio chunk it receives to a local file.
    /// </summary>
    class Program
    {
        // File that receives the audio chunks, in the order they arrive.
        private const string OutputPath = "output.mp3";

        /// <summary>
        /// Subscribes to the client's responses, sends "Hello World!", flushes, and keeps
        /// the connection open until a key is pressed.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static async Task Main(string[] args)
        {
            // Initialize Library with default logging
            Library.Initialize();

            // Use the client factory with an API key set in the "DEEPGRAM_API_KEY" environment variable
            var speakClient = ClientFactory.CreateSpeakWebSocketClient();

            // The audio handler below opens the output file in append mode, so remove any
            // file left behind by a previous run; otherwise the new audio would be appended
            // to stale audio. File.Delete does nothing when the file does not exist.
            File.Delete(OutputPath);

            // Subscribe one handler per response type the client can raise
            speakClient.Subscribe(new EventHandler<OpenResponse>((sender, e) =>
            {
                Console.WriteLine($"\n\n----> {e.Type} received");
            }));
            speakClient.Subscribe(new EventHandler<MetadataResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received");
                Console.WriteLine($"----> RequestId: {e.RequestId}");
            }));
            speakClient.Subscribe(new EventHandler<AudioResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received");

                // Nothing to write when the response carries no audio stream.
                if (e.Stream == null)
                {
                    return;
                }

                // Append this chunk to the output file exactly as received.
                // NOTE(review): whether the resulting file is playable as-is depends on the
                // encoding the connection produces - confirm before relying on the ".mp3" name.
                using (var writer = new BinaryWriter(File.Open(OutputPath, FileMode.Append)))
                {
                    writer.Write(e.Stream.ToArray());
                }
            }));
            speakClient.Subscribe(new EventHandler<FlushedResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received");
            }));
            speakClient.Subscribe(new EventHandler<ClearedResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received");
            }));
            speakClient.Subscribe(new EventHandler<CloseResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received");
            }));
            speakClient.Subscribe(new EventHandler<UnhandledResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received");
            }));
            speakClient.Subscribe(new EventHandler<WarningResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received");
            }));
            speakClient.Subscribe(new EventHandler<ErrorResponse>((sender, e) =>
            {
                Console.WriteLine($"----> {e.Type} received. Error: {e.Message}");
            }));

            // Start the connection
            var speakSchema = new SpeakSchema();
            await speakClient.Connect(speakSchema);

            // Send some text to convert to audio
            speakClient.SpeakWithText("Hello World!");

            // Flush the audio
            speakClient.Flush();

            // Wait for the user to press a key
            Console.WriteLine("\n\nPress any key to stop and exit...\n\n\n");
            Console.ReadKey();

            // Stop the connection
            await speakClient.Stop();

            // Terminate Libraries
            Library.Terminate();
        }
    }
}