Build a Voice Agent with Python

Create a real-time voice agent using the Deepgram Python SDK.

This tutorial walks you through building a basic voice agent using Python and the Deepgram SDK. You will learn how to connect to the Agent API, configure its behavior, and stream audio for processing.

Prerequisites

Before you begin, ensure you have the following:

  • A Deepgram API key. You can get one in the Deepgram Console.
  • Python installed on your machine.

1. Set up your environment

Create a new directory for your project and a file for your code.

$ mkdir deepgram-agent-demo
$ cd deepgram-agent-demo
$ touch main.py

Export your Deepgram API key as an environment variable.

$ export DEEPGRAM_API_KEY="your_api_key"

2. Install the Deepgram SDK

Install the Deepgram Python SDK and the requests library for audio streaming.

$ pip install deepgram-sdk requests

3. Create the Voice Agent

Open main.py and add the following code. This script connects to Deepgram, configures the agent with specific models, and streams a sample audio file.

1import requests
2import time
3import os
4import json
5import threading
6
7from deepgram import DeepgramClient
8from deepgram.core.events import EventType
9from deepgram.agent.v1.types import (
10 AgentV1Settings,
11 AgentV1SettingsAgent,
12 AgentV1SettingsAudio,
13 AgentV1SettingsAudioInput,
14 AgentV1SettingsAudioOutput,
15 AgentV1SettingsAgentListen,
16 AgentV1SettingsAgentListenProvider_V1,
17)
18from deepgram.types.think_settings_v1 import ThinkSettingsV1
19from deepgram.types.think_settings_v1provider import ThinkSettingsV1Provider_OpenAi
20from deepgram.types.speak_settings_v1 import SpeakSettingsV1
21from deepgram.types.speak_settings_v1provider import SpeakSettingsV1Provider_Deepgram
22
def main():
    """Connect to the Deepgram Agent API, stream a sample WAV file, and save the replies.

    Reads the API key from the DEEPGRAM_API_KEY environment variable, opens a
    WebSocket connection, configures the agent (listen/think/speak models),
    streams PCM audio from a remote WAV file, and writes the agent's responses
    to chatlog.txt (transcripts) and output-*.wav (audio).

    Raises:
        ValueError: if DEEPGRAM_API_KEY is not set (caught and printed below).
    """
    try:
        api_key = os.getenv("DEEPGRAM_API_KEY")
        if not api_key:
            raise ValueError("DEEPGRAM_API_KEY environment variable is not set")

        client = DeepgramClient(api_key=api_key)

        with client.agent.v1.connect() as connection:
            print("Created WebSocket connection...")

            # Agent configuration: 24 kHz linear16 PCM in/out, Deepgram
            # nova-3 for speech-to-text, OpenAI gpt-4o-mini for reasoning,
            # and Deepgram aura-2-thalia-en for text-to-speech.
            settings = AgentV1Settings(
                audio=AgentV1SettingsAudio(
                    input=AgentV1SettingsAudioInput(
                        encoding="linear16",
                        sample_rate=24000,
                    ),
                    output=AgentV1SettingsAudioOutput(
                        encoding="linear16",
                        sample_rate=24000,
                        container="wav",
                    ),
                ),
                agent=AgentV1SettingsAgent(
                    language="en",
                    listen=AgentV1SettingsAgentListen(
                        provider=AgentV1SettingsAgentListenProvider_V1(
                            type="deepgram",
                            model="nova-3",
                        )
                    ),
                    think=ThinkSettingsV1(
                        provider=ThinkSettingsV1Provider_OpenAi(
                            type="open_ai",
                            model="gpt-4o-mini",
                        ),
                        prompt="You are a friendly AI assistant.",
                    ),
                    speak=SpeakSettingsV1(
                        provider=SpeakSettingsV1Provider_Deepgram(
                            type="deepgram",
                            model="aura-2-thalia-en",
                        )
                    ),
                    greeting="Hello! How can I help you today?",
                ),
            )

            # Shared state mutated by the message callback below.
            audio_buffer = bytearray()       # raw PCM collected for the current reply
            file_counter = 0                 # suffix for output-*.wav files
            processing_complete = False      # set True once a full reply is saved

            def on_message(message):
                """Handle incoming WebSocket frames (binary audio or typed events)."""
                nonlocal audio_buffer, file_counter, processing_complete

                # Binary frames are agent speech audio; accumulate until AgentAudioDone.
                if isinstance(message, bytes):
                    audio_buffer.extend(message)
                    return

                msg_type = getattr(message, "type", "Unknown")

                if msg_type == "ConversationText":
                    # Append each transcript event to the chat log as one JSON line.
                    print(f"Conversation: {message}")
                    with open("chatlog.txt", 'a') as chatlog:
                        chatlog.write(f"{json.dumps(message.__dict__)}\n")

                elif msg_type == "AgentAudioDone":
                    print("Agent audio done")
                    if len(audio_buffer) > 0:
                        # NOTE: create_wav_header() is called with defaults, so the
                        # header's size fields are placeholders; most players accept this.
                        with open(f"output-{file_counter}.wav", 'wb') as f:
                            f.write(create_wav_header())
                            f.write(audio_buffer)
                        print(f"Created output-{file_counter}.wav")
                        audio_buffer = bytearray()
                        file_counter += 1
                    processing_complete = True

            # Register the handler and send settings before audio arrives.
            connection.on(EventType.MESSAGE, on_message)
            connection.send_settings(settings)

            # Listen for server events on a background thread so this thread
            # can stream audio concurrently.
            listener_thread = threading.Thread(target=connection.start_listening, daemon=True)
            listener_thread.start()

            # Give the server a moment to apply the settings.
            time.sleep(1)

            print("Streaming audio...")
            # FIX: add a timeout (previously could hang forever), check the HTTP
            # status, and close the response via a context manager.
            with requests.get("https://dpgr.am/spacewalk.wav", stream=True, timeout=30) as response:
                response.raise_for_status()
                # Skip the 44-byte WAV header: the agent expects raw linear16 PCM.
                response.raw.read(44)

                # Throttle to roughly real-time so the agent hears a steady stream.
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        connection.send_media(chunk)
                        time.sleep(0.1)

            print("Waiting for agent response...")
            # Poll until the callback saved a reply, or give up after 30 s.
            start_time = time.time()
            while not processing_complete and (time.time() - start_time) < 30:
                time.sleep(1)

            print("Finished")

    except Exception as e:
        # Top-level boundary: report and exit cleanly.
        print(f"Error: {str(e)}")

def create_wav_header(sample_rate=24000, bits_per_sample=16, channels=1, data_size=0):
    """Build a 44-byte RIFF/WAVE header for PCM (format tag 1) audio.

    Args:
        sample_rate: Samples per second (default 24000, matching the agent output).
        bits_per_sample: Bit depth per sample (default 16 / linear16).
        channels: Number of audio channels (default 1, mono).
        data_size: Size in bytes of the PCM payload that will follow the header.
            Defaults to 0 for streaming use where the final size is unknown.

    Returns:
        bytearray: A 44-byte header ready to be written before the PCM data.
    """
    byte_rate = sample_rate * channels * (bits_per_sample // 8)
    block_align = channels * (bits_per_sample // 8)
    header = bytearray(44)
    header[0:4] = b'RIFF'
    # FIX: RIFF chunk size = file size - 8 = 36 + data_size (was hard-coded 0,
    # which declares an invalid size in the saved files).
    header[4:8] = (36 + data_size).to_bytes(4, 'little')
    header[8:12] = b'WAVE'
    header[12:16] = b'fmt '
    header[16:20] = b'\x10\x00\x00\x00'   # fmt chunk size: 16 bytes (PCM)
    header[20:22] = b'\x01\x00'           # audio format 1 = uncompressed PCM
    header[22:24] = channels.to_bytes(2, 'little')
    header[24:28] = sample_rate.to_bytes(4, 'little')
    header[28:32] = byte_rate.to_bytes(4, 'little')
    header[32:34] = block_align.to_bytes(2, 'little')
    header[34:36] = bits_per_sample.to_bytes(2, 'little')
    header[36:40] = b'data'
    # FIX: data chunk size (was hard-coded 0).
    header[40:44] = data_size.to_bytes(4, 'little')
    return header
145
# Run the agent only when executed as a script (not when imported).
if __name__ == "__main__":
    main()

4. Run the Voice Agent

Run your script using Python.

$ python main.py

The agent will process the audio and generate responses. You can find the conversation transcript in chatlog.txt and the agent’s audio responses in output-*.wav files.

Next steps

Now that you have built a basic agent, you can customize its behavior: