Text to Speech REST

An overview of the Deepgram JavaScript SDK and Deepgram text-to-speech.

The Deepgram JavaScript SDK now works in both server and browser environments. A proxy configuration is required for browser environments (see the section below).

Installing the SDK

# Install the Deepgram JS SDK
# https://github.com/deepgram/deepgram-js-sdk

npm install @deepgram/sdk

Initializing the SDK

import { createClient } from "@deepgram/sdk";
import fs from "fs";

// Create the Deepgram client used by the examples that follow.
// NOTE: the string "DEEPGRAM_API_KEY" is passed literally here; it reads as a
// placeholder, so substitute your real API key before running.
const deepgram = createClient("DEEPGRAM_API_KEY");

Make a Deepgram Text-to-Speech Request

Once the SDK is initialized, you can make a request to convert text into speech.

// Text that getAudio sends to the text-to-speech request as `{ text }`.
const text = "Hello, how can I help you today?";

/**
 * Sends `text` to deepgram.speak.request, writes the returned audio to
 * "output.wav", and logs the response headers when present.
 *
 * A missing stream is reported on the console rather than thrown; the file
 * write result is reported from the fs.writeFile callback.
 */
const getAudio = async () => {
  // Request options: model choice plus audio encoding/container.
  const requestOptions = {
    model: "aura-asteria-en",
    encoding: "linear16",
    container: "wav",
  };
  const ttsResponse = await deepgram.speak.request({ text }, requestOptions);

  // Pull both the audio stream and the headers off the response.
  const audioStream = await ttsResponse.getStream();
  const responseHeaders = await ttsResponse.getHeaders();

  // Reports the outcome of the file write.
  const reportWrite = (writeError) => {
    if (!writeError) {
      console.log("Audio file written to output.wav");
      return;
    }
    console.error("Error writing audio to file:", writeError);
  };

  if (!audioStream) {
    console.error("Error generating audio:", audioStream);
  } else {
    // Collect the stream into a single buffer, then persist it.
    const audioBytes = await getAudioBuffer(audioStream);
    fs.writeFile("output.wav", audioBytes, reportWrite);
  }

  if (responseHeaders) {
    console.log("Headers:", responseHeaders);
  }
};

/**
 * Helper function to convert the stream to an audio buffer.
 *
 * Reads the stream to completion through its reader and joins every chunk,
 * in arrival order, into one Buffer.
 *
 * @param {{ getReader: Function }} response - Stream exposing `getReader()`;
 *   chunks are assumed to be Uint8Array instances (TODO confirm against the SDK).
 * @returns {Promise<Buffer>} All bytes read from the stream; empty when the
 *   stream yields no chunks.
 */
const getAudioBuffer = async (response) => {
  const reader = response.getReader();
  const chunks = [];

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    chunks.push(value);
  }

  // Buffer.concat copies each chunk exactly once. Rebuilding the accumulator
  // with array spreads on every chunk copies all previous bytes again, which
  // is quadratic in the size of the audio.
  return Buffer.concat(chunks);
};

// Run the example. A rejected request would otherwise be an unhandled
// promise rejection, so report it and mark the process as failed.
getAudio().catch((error) => {
  console.error("Text-to-speech request failed:", error);
  process.exitCode = 1;
});

Audio Output Streaming

Deepgram's TTS API allows you to start playing the audio as soon as the first byte is received. This section provides examples to help you stream the audio output efficiently.

Single Text Source Payload

The following example demonstrates how to stream the audio as soon as the first byte arrives for a single text source.

// Placeholder API key: replace with your own key before running.
// NOTE: this snippet uses `createClient` and `fs` without importing them; it
// relies on the imports shown in "Initializing the SDK"
// (`@deepgram/sdk` and `fs`).
const DEEPGRAM_API_KEY = 'YOUR_DEEPGRAM_API_KEY';
const deepgram = createClient(DEEPGRAM_API_KEY);

// Text sent to the text-to-speech request as `{ text }`.
const text = "Hello, how can I help you today? My name is Emily and I'm very glad to meet you. What do you think of this new text-to-speech API?";
const audioFilePath = 'output.wav'; // Path to save the audio file

/**
 * Sends `text` to deepgram.speak.request and saves the returned audio at
 * `audioFilePath`.
 *
 * A missing stream is logged instead of thrown; the write outcome is logged
 * from the fs.writeFile callback.
 */
const getAudio = async () => {
  const speakOptions = {
    model: 'aura-asteria-en',
    encoding: 'linear16',
    container: 'wav',
  };
  const ttsResponse = await deepgram.speak.request({ text }, speakOptions);

  const audioStream = await ttsResponse.getStream();
  if (!audioStream) {
    console.error('Error generating audio:', audioStream);
    return;
  }

  // Gather the whole stream into one buffer before writing it out.
  const audioBytes = await getAudioBuffer(audioStream);
  fs.writeFile(audioFilePath, audioBytes, (writeError) => {
    if (!writeError) {
      console.log('Audio file written to', audioFilePath);
      return;
    }
    console.error('Error writing audio to file:', writeError);
  });
};

/**
 * Reads a stream to completion and returns its bytes as a single Buffer.
 *
 * @param {{ getReader: Function }} response - Stream exposing `getReader()`;
 *   chunks are assumed to be Uint8Array instances (TODO confirm against the SDK).
 * @returns {Promise<Buffer>} Every chunk joined in arrival order; empty when
 *   the stream yields nothing.
 */
const getAudioBuffer = async (response) => {
  const reader = response.getReader();
  const chunks = [];

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    chunks.push(value);
  }

  // One copy per chunk. Spreading the accumulator into a new array for every
  // chunk re-copies all earlier bytes and grows quadratically with audio size.
  return Buffer.concat(chunks);
};

// Run the example. Without a handler a failed request becomes an unhandled
// promise rejection, so log it and flag the process as failed.
getAudio().catch((error) => {
  console.error('Text-to-speech request failed:', error);
  process.exitCode = 1;
});

Chunked Text Source Payload

This example shows how to chunk the text source by sentence boundaries and stream the audio for each chunk consecutively.

import fs from 'fs';
import { createClient } from '@deepgram/sdk';

// Placeholder API key: replace with your own key before running.
const DEEPGRAM_API_KEY = 'YOUR_DEEPGRAM_API_KEY';
const deepgram = createClient(DEEPGRAM_API_KEY);

// Source text; main() splits it into sentences and synthesizes each one.
const inputText = "Your long text goes here...";

/**
 * Splits text into sentences at `.`, `!` and `?` boundaries.
 *
 * A run of terminators (such as an ellipsis) stays attached to its sentence,
 * and trailing text without a terminator is kept as a final segment instead
 * of being dropped.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Trimmed, non-empty sentences in order; an empty array
 *   when the text contains no sentence content.
 */
function segmentTextBySentence(text) {
  // match() returns null when nothing matches, so fall back to an empty list
  // rather than calling .map on null.
  const sentences = text.match(/[^.!?]+(?:[.!?]+|$)/g) || [];

  return sentences
    .map((sentence) => sentence.trim())
    .filter((sentence) => sentence.length > 0);
}

/**
 * Converts one piece of text to speech and returns the audio bytes.
 *
 * @param {string} text - Text to synthesize.
 * @returns {Promise<Buffer>} Audio returned by the request, fully buffered.
 * @throws {Error} When the response provides no audio stream.
 */
async function synthesizeAudio(text) {
  const response = await deepgram.speak.request(
    { text },
    {
      model: 'aura-helios-en',
      // Request MP3: main() appends every segment to a single "output.mp3".
      // Requesting linear16 in a WAV container would put WAV data in an
      // .mp3 file and repeat a WAV header at the start of every segment.
      encoding: 'mp3',
    }
  );

  const stream = await response.getStream();
  if (!stream) {
    throw new Error('Error generating audio');
  }

  return getAudioBuffer(stream);
}

/**
 * Drains a stream through its reader and returns the bytes as one Buffer.
 *
 * @param {{ getReader: Function }} response - Stream exposing `getReader()`;
 *   chunks are assumed to be Uint8Array instances (TODO confirm against the SDK).
 * @returns {Promise<Buffer>} All chunks joined in arrival order; empty when
 *   the stream yields nothing.
 */
const getAudioBuffer = async (response) => {
  const reader = response.getReader();
  const chunks = [];

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    chunks.push(value);
  }

  // Join once at the end. Re-spreading the accumulated bytes for every chunk
  // makes the total copying cost quadratic in the audio size.
  return Buffer.concat(chunks);
};

/**
 * Splits `inputText` into sentences, synthesizes each one in order, and
 * appends the audio for every segment to "output.mp3".
 *
 * A segment that fails to synthesize is logged and skipped so the remaining
 * segments are still written.
 *
 * @returns {Promise<void>} Resolves after the output stream has finished;
 *   rejects if the stream emits an error while closing.
 */
async function main() {
  const segments = segmentTextBySentence(inputText);

  // Create or truncate the output file
  const outputFile = fs.createWriteStream("output.mp3");

  for (const segment of segments) {
    try {
      const audioData = await synthesizeAudio(segment);
      outputFile.write(audioData);
      console.log("Audio stream finished for segment:", segment);
    } catch (error) {
      console.error("Error synthesizing audio:", error);
    }
  }

  // End the stream and wait for "finish" so the file is flushed and closed
  // before completion is reported.
  await new Promise((resolve, reject) => {
    outputFile.once("error", reject);
    outputFile.once("finish", resolve);
    outputFile.end();
  });

  console.log("Audio file creation completed.");
}

// Run the example. Report any failure instead of leaving the returned
// promise unhandled, and mark the process as failed.
main().catch((error) => {
  console.error("Audio file creation failed:", error);
  process.exitCode = 1;
});

Where to Find Additional Examples

The SDK repository has a good collection of text-to-speech examples, and its README contains links to them. Each example below demonstrates different options for converting text into speech.

Some Example(s):