Build a Voice Agent with Python

Create a real-time voice agent using the Deepgram Python SDK.

This tutorial walks you through building a basic voice agent using Python and the Deepgram SDK. You will learn how to connect to the Agent API, configure its behavior, and stream audio for processing.

Prerequisites

Before you begin, ensure you have the following:

  • A Deepgram API key. You can get one in the Deepgram Console.
  • Python installed on your machine.

1. Set up your environment

Create a new directory for your project and a file for your code.

$ mkdir deepgram-agent-demo
$ cd deepgram-agent-demo
$ touch main.py

Export your Deepgram API key as an environment variable.

$ export DEEPGRAM_API_KEY="your_api_key"

2. Install the Deepgram SDK

Install the Deepgram Python SDK and the requests library for audio streaming.

$ pip install deepgram-sdk requests

3. Create the Voice Agent

Open main.py and add the following code. This script connects to Deepgram, configures the agent with specific models, and streams a sample audio file.

1import requests
2import time
3import os
4import json
5import threading
6
7from deepgram import DeepgramClient
8from deepgram.core.events import EventType
9from deepgram.agent.v1.types import (
10 AgentV1Settings,
11 AgentV1SettingsAgent,
12 AgentV1SettingsAudio,
13 AgentV1SettingsAudioInput,
14 AgentV1SettingsAudioOutput,
15 AgentV1SettingsAgentListen,
16 AgentV1SettingsAgentListenProvider_V1,
17)
18from deepgram.types.think_settings_v1 import ThinkSettingsV1
19from deepgram.types.think_settings_v1provider import ThinkSettingsV1Provider_OpenAi
20from deepgram.types.speak_settings_v1 import SpeakSettingsV1
21from deepgram.types.speak_settings_v1provider import SpeakSettingsV1Provider_Deepgram
22
def main():
    """Connect to the Deepgram Agent API, stream a sample WAV file, and save the replies.

    Reads the API key from the DEEPGRAM_API_KEY environment variable, opens a
    WebSocket connection, configures the agent (listen/think/speak models),
    streams PCM audio from a remote WAV file, and writes the agent's responses
    to chatlog.txt (transcripts) and output-*.wav (audio).

    Raises:
        ValueError: if DEEPGRAM_API_KEY is not set (caught and printed below).
    """
    try:
        api_key = os.getenv("DEEPGRAM_API_KEY")
        if not api_key:
            raise ValueError("DEEPGRAM_API_KEY environment variable is not set")

        client = DeepgramClient(api_key=api_key)

        with client.agent.v1.connect() as connection:
            print("Created WebSocket connection...")

            # Agent configuration: 24 kHz linear16 PCM in/out, Deepgram
            # nova-3 for speech-to-text, OpenAI gpt-4o-mini for reasoning,
            # and Deepgram aura-2-thalia-en for text-to-speech.
            settings = AgentV1Settings(
                audio=AgentV1SettingsAudio(
                    input=AgentV1SettingsAudioInput(
                        encoding="linear16",
                        sample_rate=24000,
                    ),
                    output=AgentV1SettingsAudioOutput(
                        encoding="linear16",
                        sample_rate=24000,
                        container="wav",
                    ),
                ),
                agent=AgentV1SettingsAgent(
                    language="en",
                    listen=AgentV1SettingsAgentListen(
                        provider=AgentV1SettingsAgentListenProvider_V1(
                            type="deepgram",
                            model="nova-3",
                        )
                    ),
                    think=ThinkSettingsV1(
                        provider=ThinkSettingsV1Provider_OpenAi(
                            type="open_ai",
                            model="gpt-4o-mini",
                        ),
                        prompt="You are a friendly AI assistant.",
                    ),
                    speak=SpeakSettingsV1(
                        provider=SpeakSettingsV1Provider_Deepgram(
                            type="deepgram",
                            model="aura-2-thalia-en",
                        )
                    ),
                    greeting="Hello! How can I help you today?",
                ),
            )

            # Shared state mutated by the message callback below.
            audio_buffer = bytearray()       # raw PCM collected for the current reply
            file_counter = 0                 # suffix for output-*.wav files
            processing_complete = False      # set True once a full reply is saved

            def on_message(message):
                """Handle incoming WebSocket frames (binary audio or typed events)."""
                nonlocal audio_buffer, file_counter, processing_complete

                # Binary frames are agent speech audio; accumulate until AgentAudioDone.
                if isinstance(message, bytes):
                    audio_buffer.extend(message)
                    return

                msg_type = getattr(message, "type", "Unknown")

                if msg_type == "ConversationText":
                    # Append each transcript event to the chat log as one JSON line.
                    print(f"Conversation: {message}")
                    with open("chatlog.txt", 'a') as chatlog:
                        chatlog.write(f"{json.dumps(message.__dict__)}\n")

                elif msg_type == "AgentAudioDone":
                    print("Agent audio done")
                    if len(audio_buffer) > 0:
                        # NOTE: create_wav_header() is called with defaults, so the
                        # header's size fields are placeholders; most players accept this.
                        with open(f"output-{file_counter}.wav", 'wb') as f:
                            f.write(create_wav_header())
                            f.write(audio_buffer)
                        print(f"Created output-{file_counter}.wav")
                        audio_buffer = bytearray()
                        file_counter += 1
                    processing_complete = True

            # Register the handler and send settings before audio arrives.
            connection.on(EventType.MESSAGE, on_message)
            connection.send_settings(settings)

            # Listen for server events on a background thread so this thread
            # can stream audio concurrently.
            listener_thread = threading.Thread(target=connection.start_listening, daemon=True)
            listener_thread.start()

            # Give the server a moment to apply the settings.
            time.sleep(1)

            print("Streaming audio...")
            # FIX: add a timeout (previously could hang forever), check the HTTP
            # status, and close the response via a context manager.
            with requests.get("https://dpgr.am/spacewalk.wav", stream=True, timeout=30) as response:
                response.raise_for_status()
                # Skip the 44-byte WAV header: the agent expects raw linear16 PCM.
                response.raw.read(44)

                # Throttle to roughly real-time so the agent hears a steady stream.
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        connection.send_media(chunk)
                        time.sleep(0.1)

            print("Waiting for agent response...")
            # Poll until the callback saved a reply, or give up after 30 s.
            start_time = time.time()
            while not processing_complete and (time.time() - start_time) < 30:
                time.sleep(1)

            print("Finished")

    except Exception as e:
        # Top-level boundary: report and exit cleanly.
        print(f"Error: {str(e)}")

def create_wav_header(sample_rate=24000, bits_per_sample=16, channels=1, data_size=0):
    """Build a 44-byte RIFF/WAVE header for PCM (format tag 1) audio.

    Args:
        sample_rate: Samples per second (default 24000, matching the agent output).
        bits_per_sample: Bit depth per sample (default 16 / linear16).
        channels: Number of audio channels (default 1, mono).
        data_size: Size in bytes of the PCM payload that will follow the header.
            Defaults to 0 for streaming use where the final size is unknown.

    Returns:
        bytearray: A 44-byte header ready to be written before the PCM data.
    """
    byte_rate = sample_rate * channels * (bits_per_sample // 8)
    block_align = channels * (bits_per_sample // 8)
    header = bytearray(44)
    header[0:4] = b'RIFF'
    # FIX: RIFF chunk size = file size - 8 = 36 + data_size (was hard-coded 0,
    # which declares an invalid size in the saved files).
    header[4:8] = (36 + data_size).to_bytes(4, 'little')
    header[8:12] = b'WAVE'
    header[12:16] = b'fmt '
    header[16:20] = b'\x10\x00\x00\x00'   # fmt chunk size: 16 bytes (PCM)
    header[20:22] = b'\x01\x00'           # audio format 1 = uncompressed PCM
    header[22:24] = channels.to_bytes(2, 'little')
    header[24:28] = sample_rate.to_bytes(4, 'little')
    header[28:32] = byte_rate.to_bytes(4, 'little')
    header[32:34] = block_align.to_bytes(2, 'little')
    header[34:36] = bits_per_sample.to_bytes(2, 'little')
    header[36:40] = b'data'
    # FIX: data chunk size (was hard-coded 0).
    header[40:44] = data_size.to_bytes(4, 'little')
    return header
145
# Run the agent only when executed as a script (not when imported).
if __name__ == "__main__":
    main()

4. Run the Voice Agent

Run your script using Python.

$ python main.py

The agent will process the audio and generate responses. You can find the conversation transcript in chatlog.txt and the agent’s audio responses in output-*.wav files.

Next steps

Now that you have built a basic agent, you can customize its behavior: