| 1 | # For more Python SDK migration guides, visit: |
| 2 | # https://github.com/deepgram/deepgram-python-sdk/tree/main/docs |
| 3 | |
| 4 | # Copyright 2025 Deepgram SDK contributors. All Rights Reserved. |
| 5 | # Use of this source code is governed by a MIT license that can be found in the LICENSE file. |
| 6 | # SPDX-License-Identifier: MIT |
| 7 | |
| 8 | # Import dependencies and set up the main function |
| 9 | import requests |
| 10 | import time |
| 11 | import os |
| 12 | import json |
| 13 | import threading |
| 14 | |
| 15 | from deepgram import DeepgramClient |
| 16 | from deepgram.core.events import EventType |
| 17 | from deepgram.agent.v1.types import ( |
| 18 | AgentV1Settings, |
| 19 | AgentV1SettingsAgent, |
| 20 | AgentV1SettingsAudio, |
| 21 | AgentV1SettingsAudioInput, |
| 22 | AgentV1SettingsAudioOutput, |
| 23 | AgentV1SettingsAgentListen, |
| 24 | AgentV1SettingsAgentListenProvider_V1, |
| 25 | ) |
| 26 | from deepgram.types.think_settings_v1 import ThinkSettingsV1 |
| 27 | from deepgram.types.think_settings_v1provider import ThinkSettingsV1Provider_OpenAi |
| 28 | from deepgram.types.speak_settings_v1 import SpeakSettingsV1 |
| 29 | from deepgram.types.speak_settings_v1provider import SpeakSettingsV1Provider_Deepgram |
| 30 | |
def main():
    """Stream a sample WAV file to a Deepgram Voice Agent over a WebSocket.

    Reads the API key from the DEEPGRAM_API_KEY environment variable,
    configures the agent (listen/think/speak providers), streams audio from
    https://dpgr.am/spacewalk.wav, logs conversation events to chatlog.txt,
    and writes each agent audio response to output-<n>.wav.
    """
    try:
        # Initialize the Voice Agent
        api_key = os.getenv("DEEPGRAM_API_KEY")
        if not api_key:
            raise ValueError("DEEPGRAM_API_KEY environment variable is not set")
        print("API Key found")

        # Initialize Deepgram client
        client = DeepgramClient(api_key=api_key)

        # Use connection as a context manager so the socket is always closed
        with client.agent.v1.connect() as connection:
            print("Created WebSocket connection...")

            # Configure the Agent: 24 kHz linear16 PCM in and out, nova-3 for
            # speech-to-text, gpt-4o-mini for reasoning, aura-2 for TTS.
            settings = AgentV1Settings(
                audio=AgentV1SettingsAudio(
                    input=AgentV1SettingsAudioInput(
                        encoding="linear16",
                        sample_rate=24000,
                    ),
                    output=AgentV1SettingsAudioOutput(
                        encoding="linear16",
                        sample_rate=24000,
                        container="wav",
                    ),
                ),
                agent=AgentV1SettingsAgent(
                    language="en",
                    listen=AgentV1SettingsAgentListen(
                        provider=AgentV1SettingsAgentListenProvider_V1(
                            type="deepgram",
                            model="nova-3",
                        )
                    ),
                    think=ThinkSettingsV1(
                        provider=ThinkSettingsV1Provider_OpenAi(
                            type="open_ai",
                            model="gpt-4o-mini",
                        ),
                        prompt="You are a friendly AI assistant.",
                    ),
                    speak=SpeakSettingsV1(
                        provider=SpeakSettingsV1Provider_Deepgram(
                            type="deepgram",
                            model="aura-2-thalia-en",
                        )
                    ),
                    greeting="Hello! How can I help you today?",
                ),
            )

            # State shared between the event handlers and the main thread.
            audio_buffer = bytearray()   # PCM bytes of the current agent response
            file_counter = 0             # index for output-<n>.wav files
            processing_complete = False  # set once the agent finishes speaking

            def log_line(text):
                # Append a single line to the session transcript file.
                with open("chatlog.txt", 'a') as chatlog:
                    chatlog.write(f"{text}\n")

            def on_open(event):
                print("Connection opened")

            def on_message(message):
                nonlocal audio_buffer, file_counter, processing_complete

                # Binary frames carry the agent's synthesized audio.
                if isinstance(message, bytes):
                    audio_buffer.extend(message)
                    print(f"Received audio data: {len(message)} bytes")
                    return

                # Text frames carry typed JSON events.
                msg_type = getattr(message, "type", "Unknown")
                print(f"Received {msg_type} event")

                # Handle specific event types
                if msg_type == "Welcome":
                    print(f"Welcome: {message}")
                    log_line(f"Welcome: {message}")

                elif msg_type == "SettingsApplied":
                    print(f"Settings applied: {message}")
                    log_line(f"Settings applied: {message}")

                elif msg_type == "ConversationText":
                    print(f"Conversation: {message}")
                    log_line(json.dumps(message.__dict__))

                elif msg_type == "UserStartedSpeaking":
                    print("User started speaking")
                    log_line("User started speaking")

                elif msg_type == "AgentThinking":
                    print("Agent thinking")
                    log_line("Agent thinking")

                elif msg_type == "AgentStartedSpeaking":
                    audio_buffer = bytearray()  # Reset buffer for new response
                    print("Agent started speaking")
                    log_line("Agent started speaking")

                elif msg_type == "AgentAudioDone":
                    print("Agent audio done")
                    if audio_buffer:
                        # Fix: patch the RIFF ChunkSize and data Subchunk2Size
                        # into the header before writing, so the resulting
                        # file is a valid, playable WAV (the bare header has
                        # these fields zeroed).
                        data_size = len(audio_buffer)
                        header = create_wav_header()
                        header[4:8] = (36 + data_size).to_bytes(4, 'little')
                        header[40:44] = data_size.to_bytes(4, 'little')
                        with open(f"output-{file_counter}.wav", 'wb') as f:
                            f.write(header)
                            f.write(audio_buffer)
                        print(f"Created output-{file_counter}.wav")
                        audio_buffer = bytearray()
                        file_counter += 1
                    processing_complete = True

            def on_error(error):
                print(f"Error: {error}")
                log_line(f"Error: {error}")

            def on_close(event):
                print("Connection closed")
                log_line("Connection closed")

            # Register event handlers
            connection.on(EventType.OPEN, on_open)
            connection.on(EventType.MESSAGE, on_message)
            connection.on(EventType.ERROR, on_error)
            connection.on(EventType.CLOSE, on_close)
            print("Event handlers registered")

            # Send settings to configure the agent
            print("Sending settings configuration...")
            connection.send_settings(settings)
            print("Settings sent successfully")

            # Listen for events on a background thread so this thread is free
            # to stream audio concurrently.
            print("Starting event listener...")
            listener_thread = threading.Thread(target=connection.start_listening, daemon=True)
            listener_thread.start()

            # Give the connection a moment to establish before streaming.
            time.sleep(1)

            # Stream audio; `with` ensures the HTTP connection is released.
            print("Downloading and sending audio...")
            with requests.get("https://dpgr.am/spacewalk.wav", stream=True) as response:
                # Read and discard the 44-byte WAV header: the agent expects
                # raw linear16 PCM, not a container.
                header = response.raw.read(44)

                # Verify WAV header
                if header[0:4] != b'RIFF' or header[8:12] != b'WAVE':
                    print("Invalid WAV header")
                    return

                chunk_size = 8192
                total_bytes_sent = 0
                chunk_count = 0
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        print(f"Sending chunk {chunk_count}: {len(chunk)} bytes")
                        connection.send_media(chunk)
                        total_bytes_sent += len(chunk)
                        chunk_count += 1
                        time.sleep(0.1)  # Throttle to approximate real-time streaming

            print(f"Total audio data sent: {total_bytes_sent} bytes in {chunk_count} chunks")
            print("Waiting for agent response...")

            # Poll until the agent finishes responding or we time out.
            print("Waiting for processing to complete...")
            start_time = time.time()
            timeout = 30  # seconds

            while not processing_complete and (time.time() - start_time) < timeout:
                time.sleep(1)
                print(f"Still waiting for agent response... ({int(time.time() - start_time)}s elapsed)")

            if not processing_complete:
                print(f"Processing timed out after {timeout} seconds")
            else:
                print("Processing complete. Check output-*.wav and chatlog.txt for results.")

            print("Finished")

    except Exception as e:
        # Top-level boundary for this example script: report and exit.
        print(f"Error: {str(e)}")
| 221 | |
| 222 | # WAV Header Functions |
def create_wav_header(sample_rate=24000, bits_per_sample=16, channels=1, data_size=0):
    """Build a 44-byte PCM (RIFF/WAVE) header.

    Args:
        sample_rate: Samples per second (Hz).
        bits_per_sample: Bit depth of each sample.
        channels: Number of audio channels.
        data_size: Size in bytes of the PCM payload that will follow the
            header. When provided, the RIFF ChunkSize and data Subchunk2Size
            fields are filled in so the file is a valid, playable WAV.
            Defaults to 0 (backward-compatible), in which case callers may
            patch header[4:8] and header[40:44] themselves.

    Returns:
        bytearray: A mutable 44-byte header.
    """
    bytes_per_sample = bits_per_sample // 8
    byte_rate = sample_rate * channels * bytes_per_sample
    block_align = channels * bytes_per_sample

    header = bytearray(44)
    # RIFF chunk descriptor
    header[0:4] = b'RIFF'
    header[4:8] = (36 + data_size).to_bytes(4, 'little')  # ChunkSize = 36 + data bytes
    header[8:12] = b'WAVE'
    # fmt subchunk (PCM)
    header[12:16] = b'fmt '
    header[16:20] = (16).to_bytes(4, 'little')  # Subchunk1Size (16 for PCM)
    header[20:22] = (1).to_bytes(2, 'little')   # AudioFormat (1 = PCM)
    header[22:24] = channels.to_bytes(2, 'little')        # NumChannels
    header[24:28] = sample_rate.to_bytes(4, 'little')     # SampleRate
    header[28:32] = byte_rate.to_bytes(4, 'little')       # ByteRate
    header[32:34] = block_align.to_bytes(2, 'little')     # BlockAlign
    header[34:36] = bits_per_sample.to_bytes(2, 'little') # BitsPerSample
    # data subchunk
    header[36:40] = b'data'
    header[40:44] = data_size.to_bytes(4, 'little')       # Subchunk2Size

    return header
| 247 | |
# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()