Audio Output Streaming
Start streaming your text-to-speech REST audio as soon as the first byte arrives.
Upon successful processing of a Deepgram text-to-speech request, you will receive an audio file containing the synthesized text-to-speech output.
Fortunately, you do not have to wait for the entire audio file to be available before playing the audio. As soon as the first byte arrives, the playback can begin, and the subsequent bytes can continue to be streamed in while the audio is already playing.
This guide will give you some tips and provide some examples so you can start streaming the audio as soon as you receive the first byte.
Implementation Examples
The following two examples demonstrate how to start consuming the audio as soon as the first byte is returned. For simplicity, each example writes the audio to a file as it arrives; to play the audio instead, hand each chunk to your audio player in place of the file write. The first example takes a single text source and sends it to Deepgram for processing, while the second example splits the text source at sentence boundaries and then sends each chunk to Deepgram in sequence.
To see similar code examples that use Deepgram's SDKs, check out the output streaming branch of the text-to-speech Starter Apps.
Single Text Source Payload
import requests

DEEPGRAM_URL = "https://api.deepgram.com/v1/speak?model=aura-asteria-en"
DEEPGRAM_API_KEY = "DEEPGRAM_API_KEY"

payload = {
    "text": "Hello, how can I help you today? My name is Emily and I'm very glad to meet you. What do you think of this new text-to-speech API?"
}

headers = {
    "Authorization": f"Token {DEEPGRAM_API_KEY}",
    "Content-Type": "application/json"
}

audio_file_path = "output.mp3"  # Path to save the audio file

# stream=True lets the body be consumed chunk by chunk as it arrives; the
# context manager releases the connection when the block exits.
with requests.post(DEEPGRAM_URL, headers=headers, json=payload, stream=True) as response:
    # Fail fast on HTTP errors so an error body is never saved as "audio".
    response.raise_for_status()

    # The output file is only created once the request is known to have succeeded.
    with open(audio_file_path, 'wb') as file_stream:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                file_stream.write(chunk)  # Write each chunk of audio data to the file

print("Audio download complete")
const fs = require("fs");
const https = require("https");

const DEEPGRAM_URL = "https://api.deepgram.com/v1/speak?model=aura-asteria-en";
const DEEPGRAM_API_KEY = "DEEPGRAM_API_KEY";

const payload = JSON.stringify({
  text: "Hello, how can I help you today? My name is Emily and I'm very glad to meet you. What do you think of this new text-to-speech API?",
});

const requestConfig = {
  method: "POST",
  headers: {
    Authorization: `Token ${DEEPGRAM_API_KEY}`,
    "Content-Type": "application/json",
  },
};

const audioFilePath = "output.mp3"; // Path to save the audio file

const req = https.request(DEEPGRAM_URL, requestConfig, (res) => {
  // A non-200 response carries an error body, not audio: report it instead
  // of saving it to the audio file.
  if (res.statusCode !== 200) {
    const errorChunks = [];
    res.on("data", (chunk) => {
      errorChunks.push(chunk);
    });
    res.on("end", () => {
      console.error(
        `Error: request failed with status ${res.statusCode}:`,
        Buffer.concat(errorChunks).toString()
      );
    });
    return;
  }

  // Create the file only once the request is known to have succeeded.
  const fileStream = fs.createWriteStream(audioFilePath);

  // "finish" fires after every byte has been flushed to the file.
  fileStream.on("finish", () => {
    console.log("Audio download complete");
  });
  fileStream.on("error", (error) => {
    console.error("Error:", error);
  });
  res.on("error", (error) => {
    console.error("Error:", error);
    fileStream.destroy();
  });

  // pipe() streams each chunk to the file as it arrives, honours
  // backpressure, and ends the file stream when the response ends.
  res.pipe(fileStream);
});

req.on("error", (error) => {
  console.error("Error:", error);
});

req.write(payload);
req.end();
package main
import (
"bytes"
"fmt"
"io"
"net/http"
"os"
)
const (
deepgramURL = "https://api.deepgram.com/v1/speak?model=aura-asteria-en"
deepgramAPIKey = "DEEPGRAM_API_KEY"
textPayload = `{"text": "Hello, how can I help you today? My name is Emily and I'm very glad to meet you. What do you think of this new text-to-speech API?"}`
outputFilename = "output.mp3"
)
func main() {
// Create or truncate the output file
file, err := os.Create(outputFilename)
if err != nil {
fmt.Println("Error creating file:", err)
return
}
defer file.Close()
// Make a POST request to Deepgram API and stream the response
req, err := http.NewRequest("POST", deepgramURL, bytes.NewBuffer([]byte(textPayload)))
if err != nil {
fmt.Println("Error creating request:", err)
return
}
req.Header.Set("Authorization", "Token "+deepgramAPIKey)
req.Header.Set("Content-Type", "application/json")
client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
fmt.Println("Error making request:", err)
return
}
defer resp.Body.Close()
// Copy the response body to the output file as it arrives
_, err = io.Copy(file, resp.Body)
if err != nil {
fmt.Println("Error copying response body to file:", err)
return
}
fmt.Println("Audio streaming and writing completed.")
}
Chunked Text Source Payload
import re
import requests
DEEPGRAM_URL = 'https://api.deepgram.com/v1/speak?model=aura-helios-en'
headers = {
"Authorization": "Token DEEPGRAM_API_KEY",
"Content-Type": "application/json"
}
input_text = "Our story begins in a peaceful woodland kingdom where a lively squirrel named Frolic made his abode high up within a cedar tree's embrace. He was not a usual woodland creature, for he was blessed with an insatiable curiosity and a heart for adventure. Nearby, a glistening river snaked through the landscape, home to a wonder named Splash - a silver-scaled flying fish whose ability to break free from his water-haven intrigued the woodland onlookers. This magical world moved on a rhythm of its own until an unforeseen circumstance brought Frolic and Splash together. One radiant morning, while Frolic was on his regular excursion, and Splash was making his aerial tours, an unpredictable wave playfully tossed and misplaced Splash onto the riverbank. Despite his initial astonishment, Frolic hurriedly and kindly assisted his new friend back to his watery abode. Touched by Frolic's compassion, Splash expressed his gratitude by inviting his friend to share his world. As Splash perched on Frolic's back, he tasted of the forest's bounty, felt the sun’s rays filter through the colors of the trees, experienced the conversations amidst the woods, and while at it, taught the woodland how to blur the lines between earth and water."
def segment_text_by_sentence(text):
    """Split ``text`` into sentences.

    A sentence boundary is any run of whitespace that directly follows
    '.', '!' or '?'. Each segment is stripped of surrounding whitespace.
    Empty segments (from empty input or trailing whitespace after the last
    sentence) are dropped so callers never receive an empty string.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty sentence strings, in their original order.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)
    stripped = (piece.strip() for piece in pieces)
    return [segment for segment in stripped if segment]
def synthesize_audio(text, output_file):
    """Request speech for ``text`` and write the audio to ``output_file``.

    The response body is read in 1024-byte chunks and each chunk is written
    as soon as it is received.

    Args:
        text: The text to synthesize.
        output_file: A writable binary file object that receives the audio.

    Raises:
        requests.HTTPError: If the API answers with an error status. Nothing
            is written to ``output_file`` in that case.
    """
    payload = {"text": text}
    with requests.post(DEEPGRAM_URL, stream=True, headers=headers, json=payload) as r:
        # An error response carries an error body, not audio: never append it
        # to the audio file.
        r.raise_for_status()
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                output_file.write(chunk)
def main():
    """Synthesize ``input_text`` sentence by sentence into ``output.mp3``.

    The sentences are requested one after another and their audio is
    appended to the same file in order.
    """
    sentences = segment_text_by_sentence(input_text)

    # Mode "wb" creates the output file or truncates an existing one
    with open("output.mp3", "wb") as audio_out:
        for sentence in sentences:
            synthesize_audio(sentence, audio_out)

    print("Audio file creation completed.")


if __name__ == "__main__":
    main()
const https = require("https");
const fs = require("fs");
const DEEPGRAM_URL = "https://api.deepgram.com/v1/speak?model=aura-helios-en";
const DEEPGRAM_API_KEY = "DEEPGRAM_API_KEY";
const inputText =
"Our story begins in a peaceful woodland kingdom where a lively squirrel named Frolic made his abode high up within a cedar tree's embrace. He was not a usual woodland creature, for he was blessed with an insatiable curiosity and a heart for adventure. Nearby, a glistening river snaked through the landscape, home to a wonder named Splash - a silver-scaled flying fish whose ability to break free from his water-haven intrigued the woodland onlookers. This magical world moved on a rhythm of its own until an unforeseen circumstance brought Frolic and Splash together. One radiant morning, while Frolic was on his regular excursion, and Splash was making his aerial tours, an unpredictable wave playfully tossed and misplaced Splash onto the riverbank. Despite his initial astonishment, Frolic hurriedly and kindly assisted his new friend back to his watery abode. Touched by Frolic's compassion, Splash expressed his gratitude by inviting his friend to share his world. As Splash perched on Frolic's back, he tasted of the forest's bounty, felt the sun’s rays filter through the colors of the trees, experienced the conversations amidst the woods, and while at it, taught the woodland how to blur the lines between earth and water.";
const outputFilename = "output.mp3";
/**
 * Split text into sentences.
 *
 * A sentence is a run of characters ending at the first '.', '!' or '?'.
 * Trailing text without a terminator is kept as a final segment instead of
 * being dropped. Segments are trimmed, and empty ones are discarded.
 *
 * @param {string} text - The text to split.
 * @returns {string[]} Non-empty, trimmed segments in original order; an
 *   empty array when the text contains nothing to speak.
 */
function segmentTextBySentence(text) {
  // match() returns null when nothing matches, so fall back to an empty list.
  const matches = text.match(/[^.!?]+(?:[.!?]|$)/g) || [];
  return matches
    .map((sentence) => sentence.trim())
    .filter((sentence) => sentence.length > 0);
}
/**
 * Request speech for a piece of text and collect the returned audio.
 *
 * @param {string} text - The text to synthesize.
 * @returns {Promise<Buffer>} Resolves with the complete audio for the text.
 *   Rejects if the request fails, the response stream errors, or the API
 *   answers with a non-200 status (the response body is included in the
 *   error message).
 */
function synthesizeAudio(text) {
  return new Promise((resolve, reject) => {
    const payload = JSON.stringify({ text });
    const options = {
      method: "POST",
      headers: {
        Authorization: `Token ${DEEPGRAM_API_KEY}`,
        "Content-Type": "application/json",
      },
    };

    const req = https.request(DEEPGRAM_URL, options, (res) => {
      const chunks = [];

      res.on("data", (chunk) => {
        chunks.push(chunk);
      });
      res.on("error", reject);
      res.on("end", () => {
        const body = Buffer.concat(chunks);
        // A non-200 response carries an error body, not audio: surface it
        // as a failure so it is never written into the audio file.
        if (res.statusCode !== 200) {
          reject(
            new Error(
              `Deepgram request failed with status ${res.statusCode}: ${body.toString()}`
            )
          );
          return;
        }
        resolve(body);
      });
    });

    req.on("error", reject);
    req.write(payload);
    req.end();
  });
}
/**
 * Synthesize the input text sentence by sentence and append the audio of
 * each sentence, in order, to the output file.
 *
 * A sentence that fails to synthesize is logged and skipped. The returned
 * promise settles only after the output file has been flushed and closed.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const segments = segmentTextBySentence(inputText);

  // Create or truncate the output file
  const outputFile = fs.createWriteStream(outputFilename);

  for (const segment of segments) {
    try {
      const audioData = await synthesizeAudio(segment);
      outputFile.write(audioData);
      console.log("Audio stream finished for segment:", segment);
    } catch (error) {
      console.error("Error synthesizing audio:", error);
    }
  }

  // End the stream and wait for "finish" so completion is only reported
  // once every byte has been written to disk.
  await new Promise((resolve, reject) => {
    outputFile.once("finish", resolve);
    outputFile.once("error", reject);
    outputFile.end();
  });

  console.log("Audio file creation completed.");
}

main().catch((error) => {
  console.error("Error:", error);
  process.exitCode = 1;
});
package main
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"regexp"
)
const (
DeepgramURL = "https://api.deepgram.com/v1/speak?model=aura-helios-en"
DeepgramApiKey = "DEEPGRAM_API_KEY"
InputText = "Our story begins in a peaceful woodland kingdom where a lively squirrel named Frolic made his abode high up within a cedar tree's embrace. He was not a usual woodland creature, for he was blessed with an insatiable curiosity and a heart for adventure. Nearby, a glistening river snaked through the landscape, home to a wonder named Splash - a silver-scaled flying fish whose ability to break free from his water-haven intrigued the woodland onlookers. This magical world moved on a rhythm of its own until an unforeseen circumstance brought Frolic and Splash together. One radiant morning, while Frolic was on his regular excursion, and Splash was making his aerial tours, an unpredictable wave playfully tossed and misplaced Splash onto the riverbank. Despite his initial astonishment, Frolic hurriedly and kindly assisted his new friend back to his watery abode. Touched by Frolic's compassion, Splash expressed his gratitude by inviting his friend to share his world. As Splash perched on Frolic's back, he tasted of the forest's bounty, felt the sun’s rays filter through the colors of the trees, experienced the conversations amidst the woods, and while at it, taught the woodland how to blur the lines between earth and water."
OutputFile = "output.mp3"
)
// segmentTextBySentence splits text into sentences.
//
// Each segment starts at a non-whitespace character and runs up to and
// including the first '.', '!' or '?'. Trailing text that has no terminator
// is returned as a final segment instead of being dropped (it keeps any
// trailing whitespace it has). Leading whitespace between sentences is not
// part of any segment. The result is nil when the text contains no segment.
func segmentTextBySentence(text string) []string {
	// "$" lets the last alternative match an unterminated tail at end of text.
	re := regexp.MustCompile(`[^.!?\s][^.!?]*(?:[.!?]|$)`)
	return re.FindAllString(text, -1)
}
// synthesizeAudio posts text to the speak endpoint and copies the audio in
// the response body to outputFile as it arrives. It returns an error if the
// payload cannot be encoded, the request fails, the response status is not
// 200 OK, or the copy fails.
func synthesizeAudio(text string, outputFile *os.File) error {
	body, err := json.Marshal(map[string]string{"text": text})
	if err != nil {
		return err
	}

	request, err := http.NewRequest(http.MethodPost, DeepgramURL, bytes.NewReader(body))
	if err != nil {
		return err
	}
	request.Header.Set("Content-Type", "application/json")
	request.Header.Set("Authorization", "Token "+DeepgramApiKey)

	httpClient := &http.Client{}
	response, err := httpClient.Do(request)
	if err != nil {
		return err
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		return fmt.Errorf("received non-OK response: %s", response.Status)
	}

	// Stream the audio straight into the output file; the copy error (nil on
	// success) is the function's result.
	_, copyErr := io.Copy(outputFile, response.Body)
	return copyErr
}
// main splits InputText into sentences and appends the synthesized audio of
// each sentence, in order, to OutputFile. A sentence that fails is reported
// and skipped.
func main() {
	sentences := segmentTextBySentence(InputText)

	audioOut, err := os.Create(OutputFile)
	if err != nil {
		fmt.Println("Error creating output file:", err)
		return
	}
	defer audioOut.Close()

	// Handle the sentences strictly in order so the audio is written in order
	for i := 0; i < len(sentences); i++ {
		sentence := sentences[i]
		if synthErr := synthesizeAudio(sentence, audioOut); synthErr != nil {
			fmt.Println("Error synthesizing audio:", synthErr)
			continue
		}
		fmt.Println("Audio stream finished for segment:", sentence)
	}

	fmt.Println("Audio file creation completed.")
}
Read more about text chunking as an optimization strategy in the guide Text Chunking for TTS REST Optimization.
Tips
Chunked Transfer Encoding
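Use a reasonable chunk size when reading the response to strike a balance between efficiency and responsiveness (the Python examples above read 1024 bytes at a time). Reading in bounded chunks lets you process the response content incrementally without holding the whole audio file in memory.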
The optimal chunk size might vary depending on the audio format, but for real-time applications where low latency is crucial, smaller chunks are generally preferred regardless of the audio format. Smaller chunks allow for faster data transmission and processing, reducing overall latency in the audio playback pipeline.
Dynamic Buffering
Implement dynamic buffering techniques to adapt to fluctuations in network conditions and audio playback requirements. Instead of using a fixed buffer size, dynamically adjust the buffer size based on factors such as network latency, available bandwidth, and audio playback latency.
This adaptive buffering approach helps optimize audio playback performance, ensuring smooth and uninterrupted streaming even under varying network conditions. Techniques such as buffer prediction algorithms or rate-based buffering can help dynamically adjust buffer sizes to maintain optimal audio streaming quality.
Buffer Prediction Algorithm Example:
DEFAULT_BUFFER_SIZE = 1024  # Define your default buffer size


def calculate_buffer_size(network_latency, bandwidth):
    # Example implementation, replace with your logic
    return network_latency * bandwidth


def get_network_latency():
    # Example implementation, replace with your logic
    return 100  # ms


def get_available_bandwidth():
    # Example implementation, replace with your logic
    return 1000  # Kbps


class BufferPredictor:
    """Holds a buffer size and updates it from network measurements."""

    def __init__(self):
        # Start from the default until the first measurement is applied
        self.buffer_size = DEFAULT_BUFFER_SIZE

    def adjust_buffer_size(self, network_latency, bandwidth):
        """Replace ``buffer_size`` with the size predicted for the given
        network latency and bandwidth."""
        # Example buffer prediction algorithm based on network latency and bandwidth
        predicted_buffer_size = calculate_buffer_size(network_latency, bandwidth)
        self.buffer_size = predicted_buffer_size


# Usage
buffer_predictor = BufferPredictor()
network_latency = get_network_latency()
bandwidth = get_available_bandwidth()
buffer_predictor.adjust_buffer_size(network_latency, bandwidth)
print("Adjusted buffer size:", buffer_predictor.buffer_size)
const DEFAULT_BUFFER_SIZE = 1024; // Define your default buffer size

function calculateBufferSize(networkLatency, bandwidth) {
  // Example implementation, replace with your logic
  return networkLatency * bandwidth;
}

function getNetworkLatency() {
  // Example implementation, replace with your logic
  return 100; // ms
}

function getAvailableBandwidth() {
  // Example implementation, replace with your logic
  return 1000; // Kbps
}

/** Holds a buffer size and updates it from network measurements. */
class BufferPredictor {
  constructor() {
    // Start from the default until the first measurement is applied
    this.bufferSize = DEFAULT_BUFFER_SIZE;
  }

  /**
   * Replace `bufferSize` with the size predicted for the given conditions.
   *
   * @param {number} networkLatency - Measured network latency.
   * @param {number} bandwidth - Measured available bandwidth.
   */
  adjustBufferSize(networkLatency, bandwidth) {
    // Example buffer prediction algorithm based on network latency and bandwidth
    const predictedBufferSize = calculateBufferSize(networkLatency, bandwidth);
    this.bufferSize = predictedBufferSize;
  }
}

// Usage
const bufferPredictor = new BufferPredictor();
const networkLatency = getNetworkLatency();
const bandwidth = getAvailableBandwidth();
bufferPredictor.adjustBufferSize(networkLatency, bandwidth);
console.log("Adjusted buffer size:", bufferPredictor.bufferSize);
package main
import (
"fmt"
)
// defaultBufferSize is the buffer size a new BufferPredictor starts with.
// Define your default buffer size here.
const defaultBufferSize = 1024

// BufferPredictor holds the current buffer size and the method that updates it.
type BufferPredictor struct {
	BufferSize int
}

// NewBufferPredictor returns a predictor whose BufferSize is defaultBufferSize.
func NewBufferPredictor() *BufferPredictor {
	predictor := new(BufferPredictor)
	predictor.BufferSize = defaultBufferSize
	return predictor
}

// AdjustBufferSize replaces BufferSize with the value calculateBufferSize
// predicts for the given network latency and bandwidth.
func (bp *BufferPredictor) AdjustBufferSize(networkLatency, bandwidth int) {
	// Example buffer prediction algorithm based on network latency and bandwidth
	bp.BufferSize = calculateBufferSize(networkLatency, bandwidth)
}
// calculateBufferSize returns the product of networkLatency and bandwidth as
// a placeholder prediction.
func calculateBufferSize(networkLatency, bandwidth int) (size int) {
	// Example implementation, replace with your logic
	size = networkLatency * bandwidth
	return size
}
// main builds a predictor, feeds it the current latency and bandwidth
// readings, and prints the resulting buffer size.
func main() {
	predictor := NewBufferPredictor()
	// Arguments are evaluated left to right: latency first, then bandwidth
	predictor.AdjustBufferSize(getNetworkLatency(), getAvailableBandwidth())
	fmt.Println("Adjusted buffer size:", predictor.BufferSize)
}
// getNetworkLatency returns a fixed placeholder latency reading.
func getNetworkLatency() int {
	// Example implementation, replace with your logic
	const latencyMs = 100 // ms
	return latencyMs
}
// getAvailableBandwidth returns a fixed placeholder bandwidth reading.
func getAvailableBandwidth() int {
	// Example implementation, replace with your logic
	const bandwidthKbps = 1000 // Kbps
	return bandwidthKbps
}
Optimizing Streaming Performance
Select the audio format and configuration that best suits your streaming requirements and playback environment. Consider factors such as compression efficiency, network bandwidth, and device compatibility.
Efficient Streaming with Lower Bandwidth
Opus, AAC, and MP3 are considered efficient for streaming over networks with lower available bandwidth. These formats typically offer good compression without significant loss of audio quality, making them suitable for streaming applications where conserving bandwidth is crucial. They are optimized for efficient transmission and decoding, allowing for smoother playback even under bandwidth constraints.
High-Quality Streaming with Higher Bandwidth
FLAC and Linear PCM (linear16), along with AAC at high bitrates, are better suited for streaming over networks with higher available bandwidth. These formats prioritize audio quality over compression efficiency, resulting in higher fidelity audio reproduction. While they may require more bandwidth for transmission compared to efficient codecs, they deliver superior audio quality, making them ideal for scenarios where audio fidelity is paramount, such as music streaming or professional audio applications.
We'd love to get your feedback on Deepgram's Aura text-to-speech. You will receive $50 in additional console credits within two weeks after filling out the form, and you may be invited to join a group of users with access to the latest private releases. To fill out the form, Click Here.
Updated 2 months ago