1 | # Copyright 2025 Deepgram SDK contributors. All Rights Reserved. |
2 | # Use of this source code is governed by a MIT license that can be found in the LICENSE file. |
3 | # SPDX-License-Identifier: MIT |
4 | |
5 | # Import dependencies and set up the main function |
6 | import requests |
7 | import wave |
8 | import io |
9 | import time |
10 | import os |
11 | import json |
12 | import threading |
13 | from datetime import datetime |
14 | |
15 | from deepgram import ( |
16 | DeepgramClient, |
17 | DeepgramClientOptions, |
18 | AgentWebSocketEvents, |
19 | AgentKeepAlive, |
20 | ) |
21 | from deepgram.clients.agent.v1.websocket.options import SettingsOptions |
22 | |
def main():
    """Stream a sample WAV file to a Deepgram Voice Agent and save replies.

    Reads DEEPGRAM_API_KEY from the environment, opens an agent websocket,
    streams https://dpgr.am/spacewalk.wav to it, writes each agent audio
    response to output-<n>.wav, and appends all events to chatlog.txt.
    """
    try:
        # Initialize the Voice Agent
        api_key = os.getenv("DEEPGRAM_API_KEY")
        if not api_key:
            raise ValueError("DEEPGRAM_API_KEY environment variable is not set")
        print("API Key found:")

        # Initialize Deepgram client
        config = DeepgramClientOptions(
            options={
                "keepalive": "true",
                # "speaker_playback": "true",
            },
        )
        deepgram = DeepgramClient(api_key, config)
        connection = deepgram.agent.websocket.v("1")
        print("Created WebSocket connection...")

        # Configure the Agent
        options = SettingsOptions()
        # Audio input configuration (what we send to the agent)
        options.audio.input.encoding = "linear16"
        options.audio.input.sample_rate = 24000
        # Audio output configuration (what the agent sends back)
        options.audio.output.encoding = "linear16"
        options.audio.output.sample_rate = 24000
        options.audio.output.container = "wav"
        # Agent configuration
        options.agent.language = "en"
        options.agent.listen.provider.type = "deepgram"
        options.agent.listen.model = "nova-3"
        options.agent.think.provider.type = "open_ai"
        options.agent.think.model = "gpt-4o-mini"
        options.agent.think.prompt = "You are a friendly AI assistant."
        options.agent.speak.provider.type = "deepgram"
        options.agent.speak.model = "aura-2-thalia-en"
        options.agent.greeting = "Hello! How can I help you today?"

        # Periodically ping the server so an idle socket is not dropped.
        def send_keep_alive():
            while True:
                time.sleep(5)
                print("Keep alive!")
                connection.send(str(AgentKeepAlive()))

        # Append one line to the persistent chat log.
        def log_to_chat(line):
            with open("chatlog.txt", 'a') as chatlog:
                chatlog.write(f"{line}\n")

        # Setup Event Handlers
        audio_buffer = bytearray()
        file_counter = 0
        processing_complete = False

        def on_audio_data(self, data, **kwargs):
            # Accumulate raw agent audio until AgentAudioDone arrives.
            nonlocal audio_buffer
            audio_buffer.extend(data)
            print(f"Received audio data from agent: {len(data)} bytes")
            print(f"Total buffer size: {len(audio_buffer)} bytes")
            print(f"Audio data format: {data[:16].hex()}...")

        def on_agent_audio_done(self, agent_audio_done, **kwargs):
            # Flush the accumulated audio to a numbered WAV file.
            nonlocal audio_buffer, file_counter, processing_complete
            print("AgentAudioDone event received")
            print(f"Buffer size at completion: {len(audio_buffer)} bytes")
            print(f"Agent audio done: {agent_audio_done}")
            if len(audio_buffer) > 0:
                # Patch the RIFF and data chunk sizes so the output is a
                # valid WAV (create_wav_header leaves them as placeholders).
                header = create_wav_header()
                header[4:8] = (36 + len(audio_buffer)).to_bytes(4, 'little')
                header[40:44] = len(audio_buffer).to_bytes(4, 'little')
                with open(f"output-{file_counter}.wav", 'wb') as f:
                    f.write(header)
                    f.write(audio_buffer)
                print(f"Created output-{file_counter}.wav")
                audio_buffer = bytearray()
                file_counter += 1
            processing_complete = True

        def on_conversation_text(self, conversation_text, **kwargs):
            print(f"Conversation Text: {conversation_text}")
            log_to_chat(json.dumps(conversation_text.__dict__))

        def on_welcome(self, welcome, **kwargs):
            print(f"Welcome message received: {welcome}")
            log_to_chat(f"Welcome message: {welcome}")

        def on_settings_applied(self, settings_applied, **kwargs):
            print(f"Settings applied: {settings_applied}")
            log_to_chat(f"Settings applied: {settings_applied}")

        def on_user_started_speaking(self, user_started_speaking, **kwargs):
            print(f"User Started Speaking: {user_started_speaking}")
            log_to_chat(f"User Started Speaking: {user_started_speaking}")

        def on_agent_thinking(self, agent_thinking, **kwargs):
            print(f"Agent Thinking: {agent_thinking}")
            log_to_chat(f"Agent Thinking: {agent_thinking}")

        def on_agent_started_speaking(self, agent_started_speaking, **kwargs):
            # A new spoken response is starting; discard any stale audio.
            nonlocal audio_buffer
            audio_buffer = bytearray()
            print(f"Agent Started Speaking: {agent_started_speaking}")
            log_to_chat(f"Agent Started Speaking: {agent_started_speaking}")

        def on_close(self, close, **kwargs):
            print(f"Connection closed: {close}")
            log_to_chat(f"Connection closed: {close}")

        def on_error(self, error, **kwargs):
            print(f"Error: {error}")
            log_to_chat(f"Error: {error}")

        def on_unhandled(self, unhandled, **kwargs):
            print(f"Unhandled event: {unhandled}")
            log_to_chat(f"Unhandled event: {unhandled}")

        # Register handlers
        connection.on(AgentWebSocketEvents.AudioData, on_audio_data)
        connection.on(AgentWebSocketEvents.AgentAudioDone, on_agent_audio_done)
        connection.on(AgentWebSocketEvents.ConversationText, on_conversation_text)
        connection.on(AgentWebSocketEvents.Welcome, on_welcome)
        connection.on(AgentWebSocketEvents.SettingsApplied, on_settings_applied)
        connection.on(AgentWebSocketEvents.UserStartedSpeaking, on_user_started_speaking)
        connection.on(AgentWebSocketEvents.AgentThinking, on_agent_thinking)
        connection.on(AgentWebSocketEvents.AgentStartedSpeaking, on_agent_started_speaking)
        connection.on(AgentWebSocketEvents.Close, on_close)
        connection.on(AgentWebSocketEvents.Error, on_error)
        connection.on(AgentWebSocketEvents.Unhandled, on_unhandled)
        print("Event handlers registered")

        # Start the connection
        print("Starting WebSocket connection...")
        if not connection.start(options):
            print("Failed to start connection")
            return
        print("WebSocket connection started successfully")

        # Start keep-alives only after the socket is actually open;
        # sending on an unopened connection would fail.
        keep_alive_thread = threading.Thread(target=send_keep_alive, daemon=True)
        keep_alive_thread.start()

        # Stream audio
        print("Downloading and sending audio...")
        response = requests.get("https://dpgr.am/spacewalk.wav", stream=True)
        # Skip WAV header
        header = response.raw.read(44)

        # Verify WAV header
        if header[0:4] != b'RIFF' or header[8:12] != b'WAVE':
            print("Invalid WAV header")
            return

        # Extract sample rate from the source header (informational only;
        # the agent input is configured for 24 kHz above).
        sample_rate = int.from_bytes(header[24:28], 'little')
        print(f"Source sample rate: {sample_rate} Hz")

        chunk_size = 8192
        total_bytes_sent = 0
        chunk_count = 0
        for chunk in response.iter_content(chunk_size=chunk_size):
            if chunk:
                print(f"Sending chunk {chunk_count}: {len(chunk)} bytes")
                connection.send(chunk)
                total_bytes_sent += len(chunk)
                chunk_count += 1
                time.sleep(0.1)  # Small delay between chunks

        print(f"Total audio data sent: {total_bytes_sent} bytes in {chunk_count} chunks")
        print("Waiting for agent response...")

        # Wait for processing
        print("Waiting for processing to complete...")
        start_time = time.time()
        timeout = 30  # 30 second timeout

        while not processing_complete and (time.time() - start_time) < timeout:
            time.sleep(1)
            print(f"Still waiting for agent response... ({int(time.time() - start_time)}s elapsed)")

        if not processing_complete:
            print("Processing timed out after 30 seconds")
        else:
            print("Processing complete. Check output-*.wav and chatlog.txt for results.")

        # Cleanup
        connection.finish()
        print("Finished")

    except Exception as e:
        print(f"Error: {str(e)}")
215 | |
216 | # WAV Header Functions |
def create_wav_header(sample_rate=24000, bits_per_sample=16, channels=1, data_size=0):
    """Build a 44-byte PCM WAV header as a mutable bytearray.

    Args:
        sample_rate: Samples per second (Hz).
        bits_per_sample: Bit depth of each sample.
        channels: Number of audio channels.
        data_size: Size in bytes of the PCM payload that will follow the
            header. Defaults to 0; pass the real payload length (or patch
            bytes 4:8 and 40:44 afterwards) so players read correct
            chunk sizes.

    Returns:
        bytearray: The 44-byte RIFF/WAVE header.
    """
    bytes_per_sample = bits_per_sample // 8
    byte_rate = sample_rate * channels * bytes_per_sample
    block_align = channels * bytes_per_sample

    header = bytearray(44)
    # RIFF header
    header[0:4] = b'RIFF'
    header[4:8] = (36 + data_size).to_bytes(4, 'little')  # ChunkSize = file size - 8
    header[8:12] = b'WAVE'
    # fmt chunk
    header[12:16] = b'fmt '
    header[16:20] = (16).to_bytes(4, 'little')            # Subchunk1Size (16 for PCM)
    header[20:22] = (1).to_bytes(2, 'little')             # AudioFormat (1 = PCM)
    header[22:24] = channels.to_bytes(2, 'little')        # NumChannels
    header[24:28] = sample_rate.to_bytes(4, 'little')     # SampleRate
    header[28:32] = byte_rate.to_bytes(4, 'little')       # ByteRate
    header[32:34] = block_align.to_bytes(2, 'little')     # BlockAlign
    header[34:36] = bits_per_sample.to_bytes(2, 'little') # BitsPerSample
    # data chunk
    header[36:40] = b'data'
    header[40:44] = data_size.to_bytes(4, 'little')       # Subchunk2Size

    return header
241 | |
# Script entry point: run the agent demo only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()