From 06fa7936a3d3a0b918a0ad0c8b98a8959f643db1 Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sat, 29 Mar 2025 22:06:00 -0400 Subject: [PATCH 01/16] Backend Server Update --- .gitignore | 1 + Backend/index.html | 461 ++++++++++++++++++++------------------------- Backend/server.py | 182 +++++++++++++++--- 3 files changed, 360 insertions(+), 284 deletions(-) diff --git a/.gitignore b/.gitignore index 1170717..e06d006 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,4 @@ dist .yarn/build-state.yml .yarn/install-state.gz .pnp.* +Backend/test.py diff --git a/Backend/index.html b/Backend/index.html index 309364f..f4ff6a0 100644 --- a/Backend/index.html +++ b/Backend/index.html @@ -10,60 +10,113 @@ max-width: 800px; margin: 0 auto; padding: 20px; + background-color: #f9f9f9; } .conversation { - border: 1px solid #ccc; - border-radius: 8px; - padding: 15px; - height: 300px; + border: 1px solid #ddd; + border-radius: 12px; + padding: 20px; + height: 400px; overflow-y: auto; - margin-bottom: 15px; + margin-bottom: 20px; + background-color: white; + box-shadow: 0 2px 10px rgba(0,0,0,0.05); } .message { - margin-bottom: 10px; - padding: 8px; - border-radius: 8px; + margin-bottom: 15px; + padding: 12px; + border-radius: 12px; + max-width: 80%; + line-height: 1.4; } .user { background-color: #e3f2fd; text-align: right; + margin-left: auto; + border-bottom-right-radius: 4px; } .ai { background-color: #f1f1f1; + margin-right: auto; + border-bottom-left-radius: 4px; + } + .system { + background-color: #f8f9fa; + font-style: italic; + text-align: center; + font-size: 0.9em; + color: #666; + padding: 8px; + margin: 10px auto; + max-width: 90%; } .controls { display: flex; - flex-direction: column; - gap: 10px; - } - .input-row { - display: flex; - gap: 10px; - } - input[type="text"] { - flex-grow: 1; - padding: 8px; - border-radius: 4px; - border: 1px solid #ccc; + gap: 15px; + justify-content: center; + align-items: center; } button { - padding: 8px 16px; - border-radius: 4px; + padding: 12px 24px; + border-radius: 24px; border: none; background-color: #4CAF50; color: white; cursor: pointer; + font-weight: bold; + transition: all 0.2s ease; + box-shadow: 0 2px 5px rgba(0,0,0,0.1); } button:hover { background-color: #45a049; + box-shadow: 0 4px 8px rgba(0,0,0,0.15); } .recording { background-color: #f44336; + animation: pulse 1.5s infinite; + } + .processing { + background-color: #FFA500; } select { - padding: 8px; - border-radius: 4px; - border: 1px solid #ccc; + padding: 10px; + border-radius: 24px; + border: 1px solid #ddd; + background-color: white; + } + .transcript { + font-style: italic; + color: #666; + margin-top: 5px; + } + @keyframes pulse { + 0% { opacity: 1; } + 50% { opacity: 0.7; } + 100% { opacity: 1; } + } + .status-indicator { + display: flex; + align-items: center; + justify-content: center; + margin-top: 10px; + gap: 5px; + } + .status-dot { + width: 10px; + height: 10px; + border-radius: 50%; + background-color: #ccc; + } + .status-dot.active { + background-color: #4CAF50; + } + .status-text { + font-size: 0.9em; + color: #666; + } + audio { + width: 100%; + margin-top: 5px; } @@ -72,30 +125,25 @@
-        [old controls markup (text input, send and record buttons); tags not recoverable]
+        [new controls markup (speaker select, record button, status indicator); tags not recoverable]
+        Not connected
\ No newline at end of file diff --git a/Backend/server.py b/Backend/server.py index bfdc590..b9736b5 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -5,6 +5,8 @@ import asyncio import torch import torchaudio import numpy as np +import io +import whisperx from io import BytesIO from typing import List, Dict, Any, Optional from fastapi import FastAPI, WebSocket, WebSocketDisconnect @@ -13,6 +15,7 @@ from pydantic import BaseModel from generator import load_csm_1b, Segment import uvicorn import time +import gc from collections import deque # Select device @@ -25,6 +28,12 @@ print(f"Using device: {device}") # Initialize the model generator = load_csm_1b(device=device) +# Initialize WhisperX for ASR +print("Loading WhisperX model...") +# Use a smaller model for faster response times +asr_model = whisperx.load_model("medium", device, compute_type="float16") +print("WhisperX model loaded!") + app = FastAPI() # Add CORS middleware to allow cross-origin requests @@ -93,6 +102,68 @@ async def encode_audio_data(audio_tensor: torch.Tensor) -> str: return f"data:audio/wav;base64,{audio_base64}" +async def transcribe_audio(audio_tensor: torch.Tensor) -> str: + """Transcribe audio using WhisperX""" + try: + # Save the tensor to a temporary file + temp_file = BytesIO() + torchaudio.save(temp_file, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav") + temp_file.seek(0) + + # Create a temporary file on disk (WhisperX requires a file path) + temp_path = "temp_audio.wav" + with open(temp_path, "wb") as f: + f.write(temp_file.read()) + + # Load and transcribe the audio + audio = whisperx.load_audio(temp_path) + result = asr_model.transcribe(audio, batch_size=16) + + # Clean up + os.remove(temp_path) + + # Get the transcription text + if result["segments"] and len(result["segments"]) > 0: + # Combine all segments + transcription = " ".join([segment["text"] for segment in result["segments"]]) + print(f"Transcription: {transcription}") + return transcription.strip() + else: + return "" + except Exception as e: + print(f"Error in transcription: {str(e)}") + return "" + + +async def generate_response(text: str, conversation_history: List[Segment]) -> str: + """Generate a contextual response based on the transcribed text""" + # Simple response logic - can be replaced with a more sophisticated LLM in the future + responses = { + "hello": "Hello there! How are you doing today?", + "how are you": "I'm doing well, thanks for asking! How about you?", + "what is your name": "I'm Sesame, your voice assistant. How can I help you?", + "bye": "Goodbye! It was nice chatting with you.", + "thank you": "You're welcome! Is there anything else I can help with?", + "weather": "I don't have real-time weather data, but I hope it's nice where you are!", + "help": "I can chat with you using natural voice. Just speak normally and I'll respond.", + } + + text_lower = text.lower() + + # Check for matching keywords + for key, response in responses.items(): + if key in text_lower: + return response + + # Default responses based on text length + if not text: + return "I didn't catch that. Could you please repeat?" + elif len(text) < 10: + return "Thanks for your message. Could you elaborate a bit more?" + else: + return f"I understand you said '{text}'. That's interesting! Can you tell me more about that?" 
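Note on the block above: generate_response() is a plain keyword matcher, and its own comment anticipates swapping in a real language model later. A minimal sketch of what that swap could look like, reusing the existing Segment history; llm_complete() is a hypothetical helper standing in for whatever LLM client gets chosen, and nothing below is part of the patch:

    async def generate_llm_response(text: str, conversation_history: List[Segment]) -> str:
        """Illustrative LLM-backed alternative to generate_response() (not part of the patch)."""
        # Flatten the last few turns into a plain-text prompt using Segment.text / Segment.speaker
        history = "\n".join(f"Speaker {seg.speaker}: {seg.text}" for seg in conversation_history[-6:])
        prompt = f"{history}\nSpeaker 0: {text}\nSpeaker 1:"
        try:
            return llm_complete(prompt)  # hypothetical LLM call; replace with a real client
        except Exception:
            # Fall back to the keyword matcher above if the model is unavailable
            return await generate_response(text, conversation_history)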
+ + @app.websocket("/ws") async def websocket_endpoint(websocket: WebSocket): await manager.connect(websocket) @@ -220,30 +291,55 @@ async def websocket_endpoint(websocket: WebSocket): # User has stopped talking - process the collected audio full_audio = torch.cat(streaming_buffer, dim=0) - # Process with speech-to-text (you would need to implement this) - # For now, just use a placeholder text - text = f"User audio from speaker {speaker_id}" + # Process with WhisperX speech-to-text + transcribed_text = await transcribe_audio(full_audio) - print(f"Detected end of speech, processing {len(streaming_buffer)} chunks") + # Log the transcription + print(f"Transcribed text: '{transcribed_text}'") # Add to conversation context - context_segments.append(Segment(text=text, speaker=speaker_id, audio=full_audio)) - - # Generate response - response_text = "This is a response to what you just said" - audio_tensor = generator.generate( - text=response_text, - speaker=1 if speaker_id == 0 else 0, # Use opposite speaker - context=context_segments, - max_audio_length_ms=10_000, - ) - - # Convert audio to base64 and send back to client - audio_base64 = await encode_audio_data(audio_tensor) - await websocket.send_json({ - "type": "audio_response", - "audio": audio_base64 - }) + if transcribed_text: + user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) + context_segments.append(user_segment) + + # Generate a contextual response + response_text = await generate_response(transcribed_text, context_segments) + + # Send the transcribed text to client + await websocket.send_json({ + "type": "transcription", + "text": transcribed_text + }) + + # Generate audio for the response + audio_tensor = generator.generate( + text=response_text, + speaker=1 if speaker_id == 0 else 0, # Use opposite speaker + context=context_segments, + max_audio_length_ms=10_000, + ) + + # Add response to context + ai_segment = Segment( + text=response_text, + speaker=1 if speaker_id == 0 else 0, + audio=audio_tensor + ) + context_segments.append(ai_segment) + + # Convert audio to base64 and send back to client + audio_base64 = await encode_audio_data(audio_tensor) + await websocket.send_json({ + "type": "audio_response", + "text": response_text, + "audio": audio_base64 + }) + else: + # If transcription failed, send a generic response + await websocket.send_json({ + "type": "error", + "message": "Sorry, I couldn't understand what you said. Could you try again?" 
+ }) # Clear buffer and reset silence detection streaming_buffer = [] @@ -256,8 +352,19 @@ async def websocket_endpoint(websocket: WebSocket): elif len(streaming_buffer) >= 30: # ~6 seconds of audio at 5 chunks/sec print("Buffer limit reached, processing audio") full_audio = torch.cat(streaming_buffer, dim=0) - text = f"Continued speech from speaker {speaker_id}" - context_segments.append(Segment(text=text, speaker=speaker_id, audio=full_audio)) + + # Process with WhisperX speech-to-text + transcribed_text = await transcribe_audio(full_audio) + + if transcribed_text: + context_segments.append(Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)) + + # Send the transcribed text to client + await websocket.send_json({ + "type": "transcription", + "text": transcribed_text + " (processing continued speech...)" + }) + streaming_buffer = [] except Exception as e: @@ -269,11 +376,21 @@ async def websocket_endpoint(websocket: WebSocket): elif action == "stop_streaming": is_streaming = False - if streaming_buffer: + if streaming_buffer and len(streaming_buffer) > 5: # Only process if there's meaningful audio # Process any remaining audio in the buffer full_audio = torch.cat(streaming_buffer, dim=0) - text = f"Final streaming audio from speaker {request.get('speaker', 0)}" - context_segments.append(Segment(text=text, speaker=request.get("speaker", 0), audio=full_audio)) + + # Process with WhisperX speech-to-text + transcribed_text = await transcribe_audio(full_audio) + + if transcribed_text: + context_segments.append(Segment(text=transcribed_text, speaker=request.get("speaker", 0), audio=full_audio)) + + # Send the transcribed text to client + await websocket.send_json({ + "type": "transcription", + "text": transcribed_text + }) streaming_buffer = [] await websocket.send_json({ @@ -286,12 +403,15 @@ async def websocket_endpoint(websocket: WebSocket): print("Client disconnected") except Exception as e: print(f"Error: {str(e)}") - await websocket.send_json({ - "type": "error", - "message": str(e) - }) + try: + await websocket.send_json({ + "type": "error", + "message": str(e) + }) + except: + pass manager.disconnect(websocket) if __name__ == "__main__": - uvicorn.run(app, host="localhost", port=8000) \ No newline at end of file + uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file From fd1ac0a0d73ea4fc7db66dad7f0ba0584c7d9baa Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sat, 29 Mar 2025 22:14:45 -0400 Subject: [PATCH 02/16] Client side Voice Visualizer --- Backend/index.html | 141 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 137 insertions(+), 4 deletions(-) diff --git a/Backend/index.html b/Backend/index.html index f4ff6a0..7ab431f 100644 --- a/Backend/index.html +++ b/Backend/index.html @@ -118,12 +118,45 @@ width: 100%; margin-top: 5px; } + .visualizer-container { + width: 100%; + height: 60px; + background-color: #f5f5f5; + border-radius: 12px; + margin-bottom: 15px; + overflow: hidden; + position: relative; + } + + .audio-visualizer { + width: 100%; + height: 100%; + display: block; + } + + .visualizer-label { + position: absolute; + top: 50%; + left: 50%; + transform: translate(-50%, -50%); + color: #999; + font-size: 0.9em; + pointer-events: none; + opacity: 0.7; + text-align: center; + width: 100%; + }

     Sesame AI Voice Chat
+        [visualizer container added: canvas element and overlay label; tags not recoverable]
+        Audio levels will appear here when speaking
-        [two old control lines; tags not recoverable]
+        [two updated control lines; tags not recoverable]
@@ -173,7 +215,7 @@ \ No newline at end of file diff --git a/Backend/server.py b/Backend/server.py index f159025..e986606 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -1,24 +1,20 @@ import os import base64 import json -import asyncio import torch import torchaudio import numpy as np -import io import whisperx from io import BytesIO from typing import List, Dict, Any, Optional -from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request -from fastapi.responses import HTMLResponse, FileResponse -from fastapi.staticfiles import StaticFiles -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel +from flask import Flask, request, send_from_directory, Response +from flask_cors import CORS +from flask_socketio import SocketIO, emit, disconnect from generator import load_csm_1b, Segment -import uvicorn import time import gc from collections import deque +from threading import Lock # Select device if torch.cuda.is_available(): @@ -36,73 +32,39 @@ print("Loading WhisperX model...") asr_model = whisperx.load_model("medium", device, compute_type="float16") print("WhisperX model loaded!") -app = FastAPI() - -# Add CORS middleware to allow cross-origin requests -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], # Allow all origins in development - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) +# Silence detection parameters +SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization +SILENCE_DURATION_SEC = 1.0 # How long silence must persist # Define the base directory base_dir = os.path.dirname(os.path.abspath(__file__)) - -# Mount a static files directory if you have any static assets like CSS or JS static_dir = os.path.join(base_dir, "static") -os.makedirs(static_dir, exist_ok=True) # Create the directory if it doesn't exist -app.mount("/static", StaticFiles(directory=static_dir), name="static") +os.makedirs(static_dir, exist_ok=True) -# Define route to serve index.html as the main page -@app.get("/", response_class=HTMLResponse) -async def get_index(): - try: - with open(os.path.join(base_dir, "index.html"), "r") as f: - return HTMLResponse(content=f.read()) - except FileNotFoundError: - return HTMLResponse(content="

Error: index.html not found
") +# Setup Flask +app = Flask(__name__) +CORS(app) +socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet') -# Add a favicon endpoint (optional, but good to have) -@app.get("/favicon.ico") -async def get_favicon(): - favicon_path = os.path.join(static_dir, "favicon.ico") - if os.path.exists(favicon_path): - return FileResponse(favicon_path) - else: - return HTMLResponse(status_code=204) # No content - -# Connection manager to handle multiple clients -class ConnectionManager: - def __init__(self): - self.active_connections: List[WebSocket] = [] - - async def connect(self, websocket: WebSocket): - await websocket.accept() - self.active_connections.append(websocket) - - def disconnect(self, websocket: WebSocket): - self.active_connections.remove(websocket) - -manager = ConnectionManager() - -# Silence detection parameters -SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization -SILENCE_DURATION_SEC = 1.0 # How long silence must persist to be considered "stopped talking" +# Socket connection management +thread = None +thread_lock = Lock() +active_clients = {} # Map client_id to client context # Helper function to convert audio data -async def decode_audio_data(audio_data: str) -> torch.Tensor: +def decode_audio_data(audio_data: str) -> torch.Tensor: """Decode base64 audio data to a torch tensor""" try: + # Extract the actual base64 content + if ',' in audio_data: + audio_data = audio_data.split(',')[1] + # Decode base64 audio data - binary_data = base64.b64decode(audio_data.split(',')[1] if ',' in audio_data else audio_data) + binary_data = base64.b64decode(audio_data) - # Save to a temporary WAV file first - temp_file = BytesIO(binary_data) - - # Load audio from binary data, explicitly specifying the format - audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav") + # Load audio from binary data + with BytesIO(binary_data) as temp_file: + audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav") # Resample if needed if sample_rate != generator.sample_rate: @@ -121,7 +83,7 @@ async def decode_audio_data(audio_data: str) -> torch.Tensor: return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence -async def encode_audio_data(audio_tensor: torch.Tensor) -> str: +def encode_audio_data(audio_tensor: torch.Tensor) -> str: """Encode torch tensor audio to base64 string""" buf = BytesIO() torchaudio.save(buf, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav") @@ -130,40 +92,36 @@ async def encode_audio_data(audio_tensor: torch.Tensor) -> str: return f"data:audio/wav;base64,{audio_base64}" -async def transcribe_audio(audio_tensor: torch.Tensor) -> str: +def transcribe_audio(audio_tensor: torch.Tensor) -> str: """Transcribe audio using WhisperX""" try: # Save the tensor to a temporary file - temp_file = BytesIO() - torchaudio.save(temp_file, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav") - temp_file.seek(0) - - # Create a temporary file on disk (WhisperX requires a file path) - temp_path = "temp_audio.wav" - with open(temp_path, "wb") as f: - f.write(temp_file.read()) + temp_path = os.path.join(base_dir, "temp_audio.wav") + torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate) # Load and transcribe the audio audio = whisperx.load_audio(temp_path) result = asr_model.transcribe(audio, batch_size=16) # Clean up - os.remove(temp_path) + if os.path.exists(temp_path): + os.remove(temp_path) # Get the transcription text if result["segments"] and len(result["segments"]) > 0: 
# Combine all segments transcription = " ".join([segment["text"] for segment in result["segments"]]) - print(f"Transcription: {transcription}") return transcription.strip() else: return "" except Exception as e: print(f"Error in transcription: {str(e)}") + if os.path.exists("temp_audio.wav"): + os.remove("temp_audio.wav") return "" -async def generate_response(text: str, conversation_history: List[Segment]) -> str: +def generate_response(text: str, conversation_history: List[Segment]) -> str: """Generate a contextual response based on the transcribed text""" # Simple response logic - can be replaced with a more sophisticated LLM in the future responses = { @@ -191,311 +149,319 @@ async def generate_response(text: str, conversation_history: List[Segment]) -> s else: return f"I understand you said '{text}'. That's interesting! Can you tell me more about that?" +# Flask routes for serving static content +@app.route('/') +def index(): + return send_from_directory(base_dir, 'index.html') -@app.websocket("/ws") -async def websocket_endpoint(websocket: WebSocket): - await manager.connect(websocket) - context_segments = [] # Store conversation context - streaming_buffer = [] # Buffer for streaming audio chunks - is_streaming = False +@app.route('/favicon.ico') +def favicon(): + if os.path.exists(os.path.join(static_dir, 'favicon.ico')): + return send_from_directory(static_dir, 'favicon.ico') + return Response(status=204) + +@app.route('/static/') +def serve_static(path): + return send_from_directory(static_dir, path) + +# Socket.IO event handlers +@socketio.on('connect') +def handle_connect(): + client_id = request.sid + print(f"Client connected: {client_id}") - # Variables for silence detection - last_active_time = time.time() - is_silence = False - energy_window = deque(maxlen=10) # For tracking recent audio energy + # Initialize client context + active_clients[client_id] = { + 'context_segments': [], + 'streaming_buffer': [], + 'is_streaming': False, + 'is_silence': False, + 'last_active_time': time.time(), + 'energy_window': deque(maxlen=10) + } + + emit('status', {'type': 'connected', 'message': 'Connected to server'}) + +@socketio.on('disconnect') +def handle_disconnect(): + client_id = request.sid + if client_id in active_clients: + del active_clients[client_id] + print(f"Client disconnected: {client_id}") + +@socketio.on('generate') +def handle_generate(data): + client_id = request.sid + if client_id not in active_clients: + emit('error', {'message': 'Client not registered'}) + return try: - while True: - # Receive JSON data from client - data = await websocket.receive_text() - request = json.loads(data) - - action = request.get("action") - - if action == "generate": - try: - text = request.get("text", "") - speaker_id = request.get("speaker", 0) - - # Generate audio response - print(f"Generating audio for: '{text}' with speaker {speaker_id}") - audio_tensor = generator.generate( - text=text, - speaker=speaker_id, - context=context_segments, - max_audio_length_ms=10_000, - ) - - # Add to conversation context - context_segments.append(Segment(text=text, speaker=speaker_id, audio=audio_tensor)) - - # Convert audio to base64 and send back to client - audio_base64 = await encode_audio_data(audio_tensor) - await websocket.send_json({ - "type": "audio_response", - "audio": audio_base64 - }) - except Exception as e: - print(f"Error generating audio: {str(e)}") - await websocket.send_json({ - "type": "error", - "message": f"Error generating audio: {str(e)}" - }) - - elif action == 
"add_to_context": - try: - text = request.get("text", "") - speaker_id = request.get("speaker", 0) - audio_data = request.get("audio", "") - - # Convert received audio to tensor - audio_tensor = await decode_audio_data(audio_data) - - # Add to conversation context - context_segments.append(Segment(text=text, speaker=speaker_id, audio=audio_tensor)) - - await websocket.send_json({ - "type": "context_updated", - "message": "Audio added to context" - }) - except Exception as e: - print(f"Error adding to context: {str(e)}") - await websocket.send_json({ - "type": "error", - "message": f"Error processing audio: {str(e)}" - }) - - elif action == "clear_context": - context_segments = [] - await websocket.send_json({ - "type": "context_updated", - "message": "Context cleared" - }) - - elif action == "stream_audio": - try: - speaker_id = request.get("speaker", 0) - audio_data = request.get("audio", "") - - # Convert received audio to tensor - audio_chunk = await decode_audio_data(audio_data) - - # Start streaming mode if not already started - if not is_streaming: - is_streaming = True - streaming_buffer = [] - energy_window.clear() - is_silence = False - last_active_time = time.time() - print(f"Streaming started with speaker ID: {speaker_id}") - await websocket.send_json({ - "type": "streaming_status", - "status": "started" - }) - - # Calculate audio energy for silence detection - chunk_energy = torch.mean(torch.abs(audio_chunk)).item() - energy_window.append(chunk_energy) - avg_energy = sum(energy_window) / len(energy_window) - - # Debug audio levels - if len(energy_window) >= 5: # Only start printing after we have enough samples - if avg_energy > SILENCE_THRESHOLD: - print(f"[AUDIO] Active sound detected - Energy: {avg_energy:.6f} (threshold: {SILENCE_THRESHOLD})") - else: - print(f"[AUDIO] Silence detected - Energy: {avg_energy:.6f} (threshold: {SILENCE_THRESHOLD})") - - # Check if audio is silent - current_silence = avg_energy < SILENCE_THRESHOLD - - # Track silence transition - if not is_silence and current_silence: - # Transition to silence - is_silence = True - last_active_time = time.time() - print("[STREAM] Transition to silence detected") - elif is_silence and not current_silence: - # User started talking again - is_silence = False - print("[STREAM] User resumed speaking") - - # Add chunk to buffer regardless of silence state - streaming_buffer.append(audio_chunk) - - # Debug buffer size periodically - if len(streaming_buffer) % 10 == 0: - print(f"[BUFFER] Current size: {len(streaming_buffer)} chunks, ~{len(streaming_buffer)/5:.1f} seconds") - - # Check if silence has persisted long enough to consider "stopped talking" - silence_elapsed = time.time() - last_active_time - - if is_silence and silence_elapsed >= SILENCE_DURATION_SEC and len(streaming_buffer) > 0: - # User has stopped talking - process the collected audio - print(f"[STREAM] Processing audio after {silence_elapsed:.2f}s of silence") - print(f"[STREAM] Processing {len(streaming_buffer)} audio chunks (~{len(streaming_buffer)/5:.1f} seconds)") - - full_audio = torch.cat(streaming_buffer, dim=0) - - # Log audio statistics - audio_duration = len(full_audio) / generator.sample_rate - audio_min = torch.min(full_audio).item() - audio_max = torch.max(full_audio).item() - audio_mean = torch.mean(full_audio).item() - print(f"[AUDIO] Processed audio - Duration: {audio_duration:.2f}s, Min: {audio_min:.4f}, Max: {audio_max:.4f}, Mean: {audio_mean:.4f}") - - # Process with WhisperX speech-to-text - print("[ASR] Starting transcription with 
WhisperX...") - transcribed_text = await transcribe_audio(full_audio) - - # Log the transcription - print(f"[ASR] Transcribed text: '{transcribed_text}'") - - # Add to conversation context - if transcribed_text: - print(f"[DIALOG] Adding user utterance to context: '{transcribed_text}'") - user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) - context_segments.append(user_segment) - - # Generate a contextual response - print("[DIALOG] Generating response...") - response_text = await generate_response(transcribed_text, context_segments) - print(f"[DIALOG] Response text: '{response_text}'") - - # Send the transcribed text to client - await websocket.send_json({ - "type": "transcription", - "text": transcribed_text - }) - - # Generate audio for the response - print("[TTS] Generating speech for response...") - audio_tensor = generator.generate( - text=response_text, - speaker=1 if speaker_id == 0 else 0, # Use opposite speaker - context=context_segments, - max_audio_length_ms=10_000, - ) - print(f"[TTS] Generated audio length: {len(audio_tensor)/generator.sample_rate:.2f}s") - - # Add response to context - ai_segment = Segment( - text=response_text, - speaker=1 if speaker_id == 0 else 0, - audio=audio_tensor - ) - context_segments.append(ai_segment) - print(f"[DIALOG] Context now has {len(context_segments)} segments") - - # Convert audio to base64 and send back to client - audio_base64 = await encode_audio_data(audio_tensor) - print("[STREAM] Sending audio response to client") - await websocket.send_json({ - "type": "audio_response", - "text": response_text, - "audio": audio_base64 - }) - else: - print("[ASR] Transcription failed or returned empty text") - # If transcription failed, send a generic response - await websocket.send_json({ - "type": "error", - "message": "Sorry, I couldn't understand what you said. Could you try again?" 
- }) - - # Clear buffer and reset silence detection - streaming_buffer = [] - energy_window.clear() - is_silence = False - last_active_time = time.time() - print("[STREAM] Buffer cleared, ready for next utterance") - - # If buffer gets too large without silence, process it anyway - # This prevents memory issues with very long streams - elif len(streaming_buffer) >= 30: # ~6 seconds of audio at 5 chunks/sec - print("[BUFFER] Maximum buffer size reached, processing audio") - full_audio = torch.cat(streaming_buffer, dim=0) - - # Process with WhisperX speech-to-text - print("[ASR] Starting forced transcription of long audio...") - transcribed_text = await transcribe_audio(full_audio) - - if transcribed_text: - print(f"[ASR] Transcribed long audio: '{transcribed_text}'") - context_segments.append(Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)) - - # Send the transcribed text to client - await websocket.send_json({ - "type": "transcription", - "text": transcribed_text + " (processing continued speech...)" - }) - else: - print("[ASR] No transcription from long audio") - - streaming_buffer = [] - print("[BUFFER] Buffer cleared due to size limit") - - except Exception as e: - print(f"[ERROR] Processing streaming audio: {str(e)}") - # Print traceback for more detailed error information - import traceback - traceback.print_exc() - await websocket.send_json({ - "type": "error", - "message": f"Error processing streaming audio: {str(e)}" - }) - - elif action == "stop_streaming": - is_streaming = False - if streaming_buffer and len(streaming_buffer) > 5: # Only process if there's meaningful audio - # Process any remaining audio in the buffer - full_audio = torch.cat(streaming_buffer, dim=0) - - # Process with WhisperX speech-to-text - transcribed_text = await transcribe_audio(full_audio) - - if transcribed_text: - context_segments.append(Segment(text=transcribed_text, speaker=request.get("speaker", 0), audio=full_audio)) - - # Send the transcribed text to client - await websocket.send_json({ - "type": "transcription", - "text": transcribed_text - }) - - streaming_buffer = [] - await websocket.send_json({ - "type": "streaming_status", - "status": "stopped" - }) - - except WebSocketDisconnect: - manager.disconnect(websocket) - print("Client disconnected") + text = data.get('text', '') + speaker_id = data.get('speaker', 0) + + print(f"Generating audio for: '{text}' with speaker {speaker_id}") + + # Generate audio response + audio_tensor = generator.generate( + text=text, + speaker=speaker_id, + context=active_clients[client_id]['context_segments'], + max_audio_length_ms=10_000, + ) + + # Add to conversation context + active_clients[client_id]['context_segments'].append( + Segment(text=text, speaker=speaker_id, audio=audio_tensor) + ) + + # Convert audio to base64 and send back to client + audio_base64 = encode_audio_data(audio_tensor) + emit('audio_response', { + 'type': 'audio_response', + 'audio': audio_base64 + }) + except Exception as e: - print(f"Error: {str(e)}") - try: - await websocket.send_json({ - "type": "error", - "message": str(e) - }) - except: - pass - manager.disconnect(websocket) + print(f"Error generating audio: {str(e)}") + emit('error', { + 'type': 'error', + 'message': f"Error generating audio: {str(e)}" + }) + +@socketio.on('add_to_context') +def handle_add_to_context(data): + client_id = request.sid + if client_id not in active_clients: + emit('error', {'message': 'Client not registered'}) + return + + try: + text = data.get('text', '') + speaker_id = 
data.get('speaker', 0) + audio_data = data.get('audio', '') + + # Convert received audio to tensor + audio_tensor = decode_audio_data(audio_data) + + # Add to conversation context + active_clients[client_id]['context_segments'].append( + Segment(text=text, speaker=speaker_id, audio=audio_tensor) + ) + + emit('context_updated', { + 'type': 'context_updated', + 'message': 'Audio added to context' + }) + + except Exception as e: + print(f"Error adding to context: {str(e)}") + emit('error', { + 'type': 'error', + 'message': f"Error processing audio: {str(e)}" + }) + +@socketio.on('clear_context') +def handle_clear_context(): + client_id = request.sid + if client_id in active_clients: + active_clients[client_id]['context_segments'] = [] + + emit('context_updated', { + 'type': 'context_updated', + 'message': 'Context cleared' + }) + +@socketio.on('stream_audio') +def handle_stream_audio(data): + client_id = request.sid + if client_id not in active_clients: + emit('error', {'message': 'Client not registered'}) + return + + client = active_clients[client_id] + + try: + speaker_id = data.get('speaker', 0) + audio_data = data.get('audio', '') + + # Convert received audio to tensor + audio_chunk = decode_audio_data(audio_data) + + # Start streaming mode if not already started + if not client['is_streaming']: + client['is_streaming'] = True + client['streaming_buffer'] = [] + client['energy_window'].clear() + client['is_silence'] = False + client['last_active_time'] = time.time() + print(f"[{client_id}] Streaming started with speaker ID: {speaker_id}") + emit('streaming_status', { + 'type': 'streaming_status', + 'status': 'started' + }) + + # Calculate audio energy for silence detection + chunk_energy = torch.mean(torch.abs(audio_chunk)).item() + client['energy_window'].append(chunk_energy) + avg_energy = sum(client['energy_window']) / len(client['energy_window']) + + # Check if audio is silent + current_silence = avg_energy < SILENCE_THRESHOLD + + # Track silence transition + if not client['is_silence'] and current_silence: + # Transition to silence + client['is_silence'] = True + client['last_active_time'] = time.time() + elif client['is_silence'] and not current_silence: + # User started talking again + client['is_silence'] = False + + # Add chunk to buffer regardless of silence state + client['streaming_buffer'].append(audio_chunk) + + # Check if silence has persisted long enough to consider "stopped talking" + silence_elapsed = time.time() - client['last_active_time'] + + if client['is_silence'] and silence_elapsed >= SILENCE_DURATION_SEC and len(client['streaming_buffer']) > 0: + # User has stopped talking - process the collected audio + print(f"[{client_id}] Processing audio after {silence_elapsed:.2f}s of silence") + + full_audio = torch.cat(client['streaming_buffer'], dim=0) + + # Process with WhisperX speech-to-text + print(f"[{client_id}] Starting transcription with WhisperX...") + transcribed_text = transcribe_audio(full_audio) + + # Log the transcription + print(f"[{client_id}] Transcribed text: '{transcribed_text}'") + + # Add to conversation context + if transcribed_text: + user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) + client['context_segments'].append(user_segment) + + # Generate a contextual response + response_text = generate_response(transcribed_text, client['context_segments']) + + # Send the transcribed text to client + emit('transcription', { + 'type': 'transcription', + 'text': transcribed_text + }) + + # Generate audio for the response + 
audio_tensor = generator.generate( + text=response_text, + speaker=1 if speaker_id == 0 else 0, # Use opposite speaker + context=client['context_segments'], + max_audio_length_ms=10_000, + ) + + # Add response to context + ai_segment = Segment( + text=response_text, + speaker=1 if speaker_id == 0 else 0, + audio=audio_tensor + ) + client['context_segments'].append(ai_segment) + + # Convert audio to base64 and send back to client + audio_base64 = encode_audio_data(audio_tensor) + emit('audio_response', { + 'type': 'audio_response', + 'text': response_text, + 'audio': audio_base64 + }) + else: + # If transcription failed, send a generic response + emit('error', { + 'type': 'error', + 'message': "Sorry, I couldn't understand what you said. Could you try again?" + }) + + # Clear buffer and reset silence detection + client['streaming_buffer'] = [] + client['energy_window'].clear() + client['is_silence'] = False + client['last_active_time'] = time.time() + + # If buffer gets too large without silence, process it anyway + elif len(client['streaming_buffer']) >= 30: # ~6 seconds of audio at 5 chunks/sec + full_audio = torch.cat(client['streaming_buffer'], dim=0) + + # Process with WhisperX speech-to-text + transcribed_text = transcribe_audio(full_audio) + + if transcribed_text: + client['context_segments'].append( + Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) + ) + + # Send the transcribed text to client + emit('transcription', { + 'type': 'transcription', + 'text': transcribed_text + " (processing continued speech...)" + }) + + client['streaming_buffer'] = [] + + except Exception as e: + import traceback + traceback.print_exc() + print(f"Error processing streaming audio: {str(e)}") + emit('error', { + 'type': 'error', + 'message': f"Error processing streaming audio: {str(e)}" + }) + +@socketio.on('stop_streaming') +def handle_stop_streaming(data): + client_id = request.sid + if client_id not in active_clients: + return + + client = active_clients[client_id] + client['is_streaming'] = False + + if client['streaming_buffer'] and len(client['streaming_buffer']) > 5: + # Process any remaining audio in the buffer + full_audio = torch.cat(client['streaming_buffer'], dim=0) + + # Process with WhisperX speech-to-text + transcribed_text = transcribe_audio(full_audio) + + if transcribed_text: + client['context_segments'].append( + Segment(text=transcribed_text, speaker=data.get("speaker", 0), audio=full_audio) + ) + + # Send the transcribed text to client + emit('transcription', { + 'type': 'transcription', + 'text': transcribed_text + }) + + client['streaming_buffer'] = [] + emit('streaming_status', { + 'type': 'streaming_status', + 'status': 'stopped' + }) -# Update the __main__ block with a comprehensive server startup message if __name__ == "__main__": print(f"\n{'='*60}") - print(f"🔊 Sesame AI Voice Chat Server") + print(f"🔊 Sesame AI Voice Chat Server (Flask Implementation)") print(f"{'='*60}") print(f"📡 Server Information:") - print(f" - Local URL: http://localhost:8000") - print(f" - Network URL: http://:8000") - print(f" - WebSocket: ws://:8000/ws") + print(f" - Local URL: http://localhost:5000") + print(f" - Network URL: http://:5000") + print(f" - WebSocket: ws://:5000/socket.io") print(f"{'='*60}") print(f"💡 To make this server public:") - print(f" 1. Ensure port 8000 is open in your firewall") - print(f" 2. Set up port forwarding on your router to port 8000") - print(f" 3. Or use a service like ngrok with: ngrok http 8000") + print(f" 1. 
Ensure port 5000 is open in your firewall") + print(f" 2. Set up port forwarding on your router to port 5000") + print(f" 3. Or use a service like ngrok with: ngrok http 5000") print(f"{'='*60}") print(f"🌐 Device: {device.upper()}") print(f"🧠 Models loaded: Sesame CSM + WhisperX ({asr_model.device})") @@ -503,5 +469,4 @@ if __name__ == "__main__": print(f"{'='*60}") print(f"Ready to receive connections! Press Ctrl+C to stop the server.\n") - # Start the server - uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file + socketio.run(app, host="0.0.0.0", port=5000, debug=False) \ No newline at end of file From 14c08bc93edffe68bbe25f88e9078d26043e26c3 Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sat, 29 Mar 2025 23:14:20 -0400 Subject: [PATCH 07/16] Demo Frontend Update --- Backend/index.html | 1157 +++++++++++++++-------------------------- Backend/voice-chat.js | 795 ++++++++++++++++++++++++++++ 2 files changed, 1219 insertions(+), 733 deletions(-) create mode 100644 Backend/voice-chat.js diff --git a/Backend/index.html b/Backend/index.html index 2944700..cbb4172 100644 --- a/Backend/index.html +++ b/Backend/index.html @@ -1,801 +1,492 @@ -/Backend/index.html --> Sesame AI Voice Chat - + -

-        [old page body markup; tags not recoverable. Visible text: "Sesame AI Voice Chat",
-         "Audio levels will appear here when speaking", "Not connected"]
+        [new page body markup; tags not recoverable. Visible text: "Sesame AI Voice Chat",
+         "Speak naturally and have a conversation with AI", "Conversation", "Audio Visualizer",
+         "Speak to see audio visualization", "Voice Settings", "Silence Threshold 0.01",
+         "Conversation Controls", "Settings", "Auto-play responses", "Show visualizer",
+         "Not connected", "Powered by Sesame AI | WhisperX for speech recognition"]
+ + + \ No newline at end of file diff --git a/Backend/voice-chat.js b/Backend/voice-chat.js new file mode 100644 index 0000000..0c8a815 --- /dev/null +++ b/Backend/voice-chat.js @@ -0,0 +1,795 @@ +/** + * Sesame AI Voice Chat Application + * + * This script handles the audio streaming, visualization, + * and Socket.IO communication for the voice chat application. + */ + +// Application state +const state = { + socket: null, + audioContext: null, + streamProcessor: null, + analyser: null, + microphone: null, + isStreaming: false, + isSpeaking: false, + silenceTimer: null, + energyWindow: [], + currentSpeaker: 0, + silenceThreshold: 0.01, + visualizerAnimationFrame: null, + volumeUpdateInterval: null, + connectionAttempts: 0 +}; + +// Constants +const ENERGY_WINDOW_SIZE = 10; +const CLIENT_SILENCE_DURATION_MS = 1000; // 1 second of silence before processing +const MAX_CONNECTION_ATTEMPTS = 5; +const RECONNECTION_DELAY_MS = 2000; + +// DOM elements +const elements = { + conversation: document.getElementById('conversation'), + speakerSelect: document.getElementById('speakerSelect'), + streamButton: document.getElementById('streamButton'), + clearButton: document.getElementById('clearButton'), + statusDot: document.getElementById('statusDot'), + statusText: document.getElementById('statusText'), + visualizerCanvas: document.getElementById('audioVisualizer'), + visualizerLabel: document.getElementById('visualizerLabel'), + thresholdSlider: document.getElementById('thresholdSlider'), + thresholdValue: document.getElementById('thresholdValue'), + volumeLevel: document.getElementById('volumeLevel'), + autoPlayResponses: document.getElementById('autoPlayResponses'), + showVisualizer: document.getElementById('showVisualizer') +}; + +// Visualization variables +let canvasContext; +let visualizerBufferLength; +let visualizerDataArray; + +// Initialize the application +function initializeApp() { + // Set up event listeners + elements.streamButton.addEventListener('click', toggleStreaming); + elements.clearButton.addEventListener('click', clearConversation); + elements.thresholdSlider.addEventListener('input', updateThreshold); + elements.speakerSelect.addEventListener('change', () => { + state.currentSpeaker = parseInt(elements.speakerSelect.value); + }); + elements.showVisualizer.addEventListener('change', toggleVisualizerVisibility); + + // Initialize audio context + setupAudioContext(); + + // Set up visualization + setupVisualizer(); + + // Connect to Socket.IO server + connectToServer(); + + // Add welcome message + addSystemMessage('Welcome to Sesame AI Voice Chat! Click "Start Conversation" to begin speaking.'); +} + +// Connect to Socket.IO server +function connectToServer() { + try { + // Use the server URL with or without a specific port + const serverUrl = window.location.origin; + + updateStatus('Connecting...', 'connecting'); + console.log(`Connecting to Socket.IO server at ${serverUrl}`); + + state.socket = io(serverUrl, { + reconnectionDelay: RECONNECTION_DELAY_MS, + reconnectionDelayMax: 5000, + reconnectionAttempts: MAX_CONNECTION_ATTEMPTS + }); + + setupSocketListeners(); + } catch (error) { + console.error('Error connecting to server:', error); + updateStatus('Connection failed. 
Retrying...', 'error'); + + // Try to reconnect + if (state.connectionAttempts < MAX_CONNECTION_ATTEMPTS) { + state.connectionAttempts++; + setTimeout(connectToServer, RECONNECTION_DELAY_MS); + } else { + updateStatus('Could not connect to server', 'error'); + addSystemMessage('Failed to connect to the server. Please check your connection and refresh the page.'); + } + } +} + +// Set up Socket.IO event listeners +function setupSocketListeners() { + if (!state.socket) return; + + state.socket.on('connect', () => { + console.log('Connected to Socket.IO server'); + updateStatus('Connected', 'connected'); + state.connectionAttempts = 0; + elements.streamButton.disabled = false; + addSystemMessage('Connected to server'); + }); + + state.socket.on('disconnect', () => { + console.log('Disconnected from Socket.IO server'); + updateStatus('Disconnected', 'disconnected'); + + // Stop streaming if active + if (state.isStreaming) { + stopStreaming(false); // false = don't send to server + } + + elements.streamButton.disabled = true; + addSystemMessage('Disconnected from server. Trying to reconnect...'); + }); + + state.socket.on('status', (data) => { + console.log('Status:', data); + addSystemMessage(data.message); + }); + + state.socket.on('error', (data) => { + console.error('Server error:', data); + addSystemMessage(`Error: ${data.message}`); + }); + + state.socket.on('audio_response', handleAudioResponse); + state.socket.on('transcription', handleTranscription); + state.socket.on('context_updated', handleContextUpdate); + state.socket.on('streaming_status', handleStreamingStatus); + + state.socket.on('connect_error', (error) => { + console.error('Connection error:', error); + updateStatus('Connection Error', 'error'); + }); +} + +// Update the connection status in the UI +function updateStatus(message, status) { + elements.statusText.textContent = message; + elements.statusDot.className = 'status-dot'; + + if (status === 'connected') { + elements.statusDot.classList.add('active'); + } else if (status === 'connecting') { + elements.statusDot.style.backgroundColor = '#FFA500'; + } else if (status === 'error') { + elements.statusDot.style.backgroundColor = '#F44336'; + } +} + +// Set up audio context +function setupAudioContext() { + try { + state.audioContext = new (window.AudioContext || window.webkitAudioContext)(); + console.log('Audio context initialized'); + } catch (err) { + console.error('Error setting up audio context:', err); + addSystemMessage(`Audio context error: ${err.message}`); + elements.streamButton.disabled = true; + } +} + +// Set up audio visualizer +function setupVisualizer() { + canvasContext = elements.visualizerCanvas.getContext('2d'); + + // Set canvas size to match container + function resizeCanvas() { + const container = elements.visualizerCanvas.parentElement; + elements.visualizerCanvas.width = container.clientWidth; + elements.visualizerCanvas.height = container.clientHeight; + } + + // Call initially and on window resize + resizeCanvas(); + window.addEventListener('resize', resizeCanvas); + + // Create placeholder data array + visualizerBufferLength = 128; + visualizerDataArray = new Uint8Array(visualizerBufferLength); +} + +// Toggle stream on/off +function toggleStreaming() { + if (state.isStreaming) { + stopStreaming(true); // true = send to server + } else { + startStreaming(); + } +} + +// Start streaming audio to the server +async function startStreaming() { + if (!state.socket || !state.socket.connected) { + addSystemMessage('Cannot start conversation: Not 
connected to server'); + return; + } + + try { + // Request microphone access + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + + // Update state + state.isStreaming = true; + state.isSpeaking = false; + state.energyWindow = []; + state.currentSpeaker = parseInt(elements.speakerSelect.value); + + // Update UI + elements.streamButton.innerHTML = ' Listening...'; + elements.streamButton.classList.add('recording'); + elements.visualizerLabel.style.opacity = '0'; + + // Set up audio processing + setupAudioProcessing(stream); + + // Start volume meter updates + state.volumeUpdateInterval = setInterval(updateVolumeMeter, 100); + + addSystemMessage('Listening - speak naturally and pause when finished'); + + } catch (err) { + console.error('Error starting audio stream:', err); + addSystemMessage(`Microphone error: ${err.message}`); + cleanupAudioResources(); + } +} + +// Set up audio processing pipeline +function setupAudioProcessing(stream) { + // Store microphone stream for later cleanup + state.microphone = stream; + + // Create source from microphone + const source = state.audioContext.createMediaStreamSource(stream); + + // Setup analyzer for visualization + state.analyser = state.audioContext.createAnalyser(); + state.analyser.fftSize = 256; + state.analyser.smoothingTimeConstant = 0.8; + state.analyser.minDecibels = -90; + state.analyser.maxDecibels = -10; + + visualizerBufferLength = state.analyser.frequencyBinCount; + visualizerDataArray = new Uint8Array(visualizerBufferLength); + + // Connect source to analyzer + source.connect(state.analyser); + + // Start visualization + if (state.visualizerAnimationFrame) { + cancelAnimationFrame(state.visualizerAnimationFrame); + } + drawVisualizer(); + + // Setup audio processor + state.streamProcessor = state.audioContext.createScriptProcessor(4096, 1, 1); + + // Connect audio nodes + source.connect(state.streamProcessor); + state.streamProcessor.connect(state.audioContext.destination); + + // Process audio + state.streamProcessor.onaudioprocess = handleAudioProcess; +} + +// Handle each frame of audio data +function handleAudioProcess(e) { + const audioData = e.inputBuffer.getChannelData(0); + + // Calculate energy (volume) for silence detection + const energy = calculateAudioEnergy(audioData); + updateEnergyWindow(energy); + + // Check if currently silent + const avgEnergy = calculateAverageEnergy(); + const isSilent = avgEnergy < state.silenceThreshold; + + // Handle silence/speech transitions + handleSpeechState(isSilent); + + // Process and send audio + const downsampled = downsampleBuffer(audioData, state.audioContext.sampleRate, 24000); + sendAudioChunk(downsampled, state.currentSpeaker); +} + +// Stop streaming audio +function stopStreaming(sendToServer = true) { + // Cleanup audio resources + cleanupAudioResources(); + + // Reset state + state.isStreaming = false; + state.isSpeaking = false; + state.energyWindow = []; + + // Update UI + elements.streamButton.innerHTML = ' Start Conversation'; + elements.streamButton.classList.remove('recording', 'processing'); + elements.streamButton.style.backgroundColor = ''; + elements.volumeLevel.style.width = '100%'; + + // Clear volume meter updates + if (state.volumeUpdateInterval) { + clearInterval(state.volumeUpdateInterval); + state.volumeUpdateInterval = null; + } + + addSystemMessage('Conversation paused'); + + // Notify server + if (sendToServer && state.socket && state.socket.connected) { + state.socket.emit('stop_streaming', { + speaker: state.currentSpeaker + 
}); + } +} + +// Clean up audio processing resources +function cleanupAudioResources() { + // Stop microphone stream + if (state.microphone) { + state.microphone.getTracks().forEach(track => track.stop()); + state.microphone = null; + } + + // Disconnect audio processor + if (state.streamProcessor) { + state.streamProcessor.disconnect(); + state.streamProcessor.onaudioprocess = null; + state.streamProcessor = null; + } + + // Disconnect analyzer + if (state.analyser) { + state.analyser.disconnect(); + state.analyser = null; + } + + // Cancel visualizer animation + if (state.visualizerAnimationFrame) { + cancelAnimationFrame(state.visualizerAnimationFrame); + state.visualizerAnimationFrame = null; + } + + // Cancel silence timer + if (state.silenceTimer) { + clearTimeout(state.silenceTimer); + state.silenceTimer = null; + } + + // Reset visualizer display + if (canvasContext) { + canvasContext.clearRect(0, 0, elements.visualizerCanvas.width, elements.visualizerCanvas.height); + elements.visualizerLabel.style.opacity = '0.7'; + } +} + +// Clear conversation history +function clearConversation() { + // Clear UI + elements.conversation.innerHTML = ''; + addSystemMessage('Conversation cleared'); + + // Notify server + if (state.socket && state.socket.connected) { + state.socket.emit('clear_context'); + } +} + +// Calculate audio energy (volume) +function calculateAudioEnergy(buffer) { + let sum = 0; + for (let i = 0; i < buffer.length; i++) { + sum += Math.abs(buffer[i]); + } + return sum / buffer.length; +} + +// Update energy window for averaging +function updateEnergyWindow(energy) { + state.energyWindow.push(energy); + if (state.energyWindow.length > ENERGY_WINDOW_SIZE) { + state.energyWindow.shift(); + } +} + +// Calculate average energy from window +function calculateAverageEnergy() { + if (state.energyWindow.length === 0) return 0; + return state.energyWindow.reduce((sum, val) => sum + val, 0) / state.energyWindow.length; +} + +// Update the threshold from the slider +function updateThreshold() { + state.silenceThreshold = parseFloat(elements.thresholdSlider.value); + elements.thresholdValue.textContent = state.silenceThreshold.toFixed(3); +} + +// Update the volume meter display +function updateVolumeMeter() { + if (!state.isStreaming || !state.analyser) return; + + // Get current volume level + const dataArray = new Uint8Array(state.analyser.frequencyBinCount); + state.analyser.getByteFrequencyData(dataArray); + + // Calculate average volume + let sum = 0; + for (let i = 0; i < dataArray.length; i++) { + sum += dataArray[i]; + } + const average = sum / dataArray.length; + + // Normalize to 0-100% + const percentage = Math.min(100, Math.max(0, average / 128 * 100)); + + // Invert because we're showing the "empty" portion + elements.volumeLevel.style.width = (100 - percentage) + '%'; + + // Change color based on level + if (percentage > 70) { + elements.volumeLevel.style.backgroundColor = 'rgba(244, 67, 54, 0.5)'; // Red + } else if (percentage > 30) { + elements.volumeLevel.style.backgroundColor = 'rgba(255, 235, 59, 0.5)'; // Yellow + } else { + elements.volumeLevel.style.backgroundColor = 'rgba(0, 0, 0, 0.5)'; // Dark + } +} + +// Handle speech/silence state transitions +function handleSpeechState(isSilent) { + if (state.isSpeaking && isSilent) { + // Transition from speaking to silence + if (!state.silenceTimer) { + state.silenceTimer = setTimeout(() => { + // Silence persisted long enough - process the audio + elements.streamButton.innerHTML = ' Processing...'; + 
elements.streamButton.classList.remove('recording'); + elements.streamButton.classList.add('processing'); + addSystemMessage('Detected pause in speech, processing response...'); + }, CLIENT_SILENCE_DURATION_MS); + } + } else if (!state.isSpeaking && !isSilent) { + // Transition from silence to speaking + state.isSpeaking = true; + elements.streamButton.innerHTML = ' Listening...'; + elements.streamButton.classList.add('recording'); + elements.streamButton.classList.remove('processing'); + + // Clear silence timer + if (state.silenceTimer) { + clearTimeout(state.silenceTimer); + state.silenceTimer = null; + } + } else if (state.isSpeaking && !isSilent) { + // Still speaking, reset silence timer + if (state.silenceTimer) { + clearTimeout(state.silenceTimer); + state.silenceTimer = null; + } + } + + // Update speaking state for non-silent audio + if (!isSilent) { + state.isSpeaking = true; + } +} + +// Send audio chunk to server +function sendAudioChunk(audioData, speaker) { + if (!state.socket || !state.socket.connected) { + console.warn('Cannot send audio: socket not connected'); + return; + } + + const wavData = createWavBlob(audioData, 24000); + const reader = new FileReader(); + + reader.onloadend = function() { + const base64data = reader.result; + + // Send to server using Socket.IO + state.socket.emit('stream_audio', { + speaker: speaker, + audio: base64data + }); + }; + + reader.readAsDataURL(wavData); +} + +// Draw audio visualizer +function drawVisualizer() { + if (!canvasContext) { + return; + } + + state.visualizerAnimationFrame = requestAnimationFrame(drawVisualizer); + + // Skip drawing if visualizer is hidden + if (!elements.showVisualizer.checked) { + if (elements.visualizerCanvas.style.opacity !== '0') { + elements.visualizerCanvas.style.opacity = '0'; + } + return; + } else if (elements.visualizerCanvas.style.opacity !== '1') { + elements.visualizerCanvas.style.opacity = '1'; + } + + // Get frequency data if available + if (state.isStreaming && state.analyser) { + try { + state.analyser.getByteFrequencyData(visualizerDataArray); + } catch (e) { + console.error("Error getting frequency data:", e); + } + } else { + // Fade out when not streaming + for (let i = 0; i < visualizerDataArray.length; i++) { + visualizerDataArray[i] = Math.max(0, visualizerDataArray[i] - 5); + } + } + + // Clear canvas + canvasContext.fillStyle = 'rgb(0, 0, 0)'; + canvasContext.fillRect(0, 0, elements.visualizerCanvas.width, elements.visualizerCanvas.height); + + // Draw gradient bars + const width = elements.visualizerCanvas.width; + const height = elements.visualizerCanvas.height; + const barCount = Math.min(visualizerBufferLength, 64); + const barWidth = width / barCount - 1; + + for (let i = 0; i < barCount; i++) { + const index = Math.floor(i * visualizerBufferLength / barCount); + const value = visualizerDataArray[index]; + + // Use logarithmic scale for better audio visualization + // This makes low values more visible while still maintaining full range + const logFactor = 20; + const scaledValue = Math.log(1 + (value / 255) * logFactor) / Math.log(1 + logFactor); + const barHeight = scaledValue * height; + + // Position bars + const x = i * (barWidth + 1); + const y = height - barHeight; + + // Create color gradient based on frequency and amplitude + const hue = i / barCount * 360; // Full color spectrum + const saturation = 80 + (value / 255 * 20); // Higher values more saturated + const lightness = 40 + (value / 255 * 20); // Dynamic brightness based on amplitude + + // Draw main bar + 
canvasContext.fillStyle = `hsl(${hue}, ${saturation}%, ${lightness}%)`; + canvasContext.fillRect(x, y, barWidth, barHeight); + + // Add reflection effect + if (barHeight > 5) { + const gradient = canvasContext.createLinearGradient( + x, y, + x, y + barHeight * 0.5 + ); + gradient.addColorStop(0, `hsla(${hue}, ${saturation}%, ${lightness + 20}%, 0.4)`); + gradient.addColorStop(1, `hsla(${hue}, ${saturation}%, ${lightness}%, 0)`); + canvasContext.fillStyle = gradient; + canvasContext.fillRect(x, y, barWidth, barHeight * 0.5); + + // Add highlight on top of the bar for better 3D effect + canvasContext.fillStyle = `hsla(${hue}, ${saturation - 20}%, ${lightness + 30}%, 0.7)`; + canvasContext.fillRect(x, y, barWidth, 2); + } + } + + // Show/hide the label + elements.visualizerLabel.style.opacity = (state.isStreaming) ? '0' : '0.7'; +} + +// Toggle visualizer visibility +function toggleVisualizerVisibility() { + const isVisible = elements.showVisualizer.checked; + elements.visualizerCanvas.style.opacity = isVisible ? '1' : '0'; + + if (isVisible && state.isStreaming && !state.visualizerAnimationFrame) { + drawVisualizer(); + } +} + +// Handle audio response from server +function handleAudioResponse(data) { + console.log('Received audio response'); + + // Create message container + const messageElement = document.createElement('div'); + messageElement.className = 'message ai'; + + // Add text content if available + if (data.text) { + const textElement = document.createElement('p'); + textElement.textContent = data.text; + messageElement.appendChild(textElement); + } + + // Create and configure audio element + const audioElement = document.createElement('audio'); + audioElement.controls = true; + audioElement.className = 'audio-player'; + + // Set audio source + const audioSource = document.createElement('source'); + audioSource.src = data.audio; + audioSource.type = 'audio/wav'; + + // Add fallback text + audioElement.textContent = 'Your browser does not support the audio element.'; + + // Assemble audio element + audioElement.appendChild(audioSource); + messageElement.appendChild(audioElement); + + // Add timestamp + const timeElement = document.createElement('span'); + timeElement.className = 'message-time'; + timeElement.textContent = new Date().toLocaleTimeString(); + messageElement.appendChild(timeElement); + + // Add to conversation + elements.conversation.appendChild(messageElement); + + // Auto-scroll to bottom + elements.conversation.scrollTop = elements.conversation.scrollHeight; + + // Auto-play if enabled + if (elements.autoPlayResponses.checked) { + audioElement.play() + .catch(err => { + console.warn('Auto-play failed:', err); + addSystemMessage('Auto-play failed. 
Please click play to hear the response.'); + }); + } + + // Re-enable stream button after processing is complete + if (state.isStreaming) { + elements.streamButton.innerHTML = ' Listening...'; + elements.streamButton.classList.add('recording'); + elements.streamButton.classList.remove('processing'); + } +} + +// Handle transcription response from server +function handleTranscription(data) { + console.log('Received transcription:', data.text); + + // Create message element + const messageElement = document.createElement('div'); + messageElement.className = 'message user'; + + // Add text content + const textElement = document.createElement('p'); + textElement.textContent = data.text; + messageElement.appendChild(textElement); + + // Add timestamp + const timeElement = document.createElement('span'); + timeElement.className = 'message-time'; + timeElement.textContent = new Date().toLocaleTimeString(); + messageElement.appendChild(timeElement); + + // Add to conversation + elements.conversation.appendChild(messageElement); + + // Auto-scroll to bottom + elements.conversation.scrollTop = elements.conversation.scrollHeight; +} + +// Handle context update from server +function handleContextUpdate(data) { + console.log('Context updated:', data.message); +} + +// Handle streaming status updates from server +function handleStreamingStatus(data) { + console.log('Streaming status:', data.status); + + if (data.status === 'stopped') { + // Reset UI if needed + if (state.isStreaming) { + stopStreaming(false); // Don't send to server since this came from server + } + } +} + +// Add a system message to the conversation +function addSystemMessage(message) { + const messageElement = document.createElement('div'); + messageElement.className = 'message system'; + messageElement.textContent = message; + elements.conversation.appendChild(messageElement); + + // Auto-scroll to bottom + elements.conversation.scrollTop = elements.conversation.scrollHeight; +} + +// Create WAV blob from audio data +function createWavBlob(audioData, sampleRate) { + // Function to convert Float32Array to Int16Array for WAV format + function floatTo16BitPCM(output, offset, input) { + for (let i = 0; i < input.length; i++, offset += 2) { + const s = Math.max(-1, Math.min(1, input[i])); + output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true); + } + } + + // Create WAV header + function writeString(view, offset, string) { + for (let i = 0; i < string.length; i++) { + view.setUint8(offset + i, string.charCodeAt(i)); + } + } + + // Create WAV file with header + function encodeWAV(samples) { + const buffer = new ArrayBuffer(44 + samples.length * 2); + const view = new DataView(buffer); + + // RIFF chunk descriptor + writeString(view, 0, 'RIFF'); + view.setUint32(4, 36 + samples.length * 2, true); + writeString(view, 8, 'WAVE'); + + // fmt sub-chunk + writeString(view, 12, 'fmt '); + view.setUint32(16, 16, true); + view.setUint16(20, 1, true); // PCM format + view.setUint16(22, 1, true); // Mono channel + view.setUint32(24, sampleRate, true); + view.setUint32(28, sampleRate * 2, true); // Byte rate + view.setUint16(32, 2, true); // Block align + view.setUint16(34, 16, true); // Bits per sample + + // data sub-chunk + writeString(view, 36, 'data'); + view.setUint32(40, samples.length * 2, true); + floatTo16BitPCM(view, 44, samples); + + return buffer; + } + + // Convert audio data to TypedArray if it's a regular Array + const samples = Array.isArray(audioData) ? 
new Float32Array(audioData) : audioData; + + // Create WAV blob + const wavBuffer = encodeWAV(samples); + return new Blob([wavBuffer], { type: 'audio/wav' }); +} + +// Downsample audio buffer to target sample rate +function downsampleBuffer(buffer, originalSampleRate, targetSampleRate) { + if (originalSampleRate === targetSampleRate) { + return buffer; + } + + const ratio = originalSampleRate / targetSampleRate; + const newLength = Math.round(buffer.length / ratio); + const result = new Float32Array(newLength); + + for (let i = 0; i < newLength; i++) { + const pos = Math.round(i * ratio); + result[i] = buffer[pos]; + } + + return result; +} + +// Initialize the application when DOM is fully loaded +document.addEventListener('DOMContentLoaded', initializeApp); + From 9ca259aab3e6c16060f8a2343db5bd76b50230ad Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sat, 29 Mar 2025 23:22:45 -0400 Subject: [PATCH 08/16] Demo Update 2 --- Backend/index.html | 2 +- Backend/voice-chat.js | 708 ++++++++++++++++++++++-------------------- 2 files changed, 374 insertions(+), 336 deletions(-) diff --git a/Backend/index.html b/Backend/index.html index cbb4172..5ea925c 100644 --- a/Backend/index.html +++ b/Backend/index.html @@ -487,6 +487,6 @@ - + \ No newline at end of file diff --git a/Backend/voice-chat.js b/Backend/voice-chat.js index 0c8a815..a4e10f5 100644 --- a/Backend/voice-chat.js +++ b/Backend/voice-chat.js @@ -1,388 +1,445 @@ /** - * Sesame AI Voice Chat Application + * Sesame AI Voice Chat Client * - * This script handles the audio streaming, visualization, - * and Socket.IO communication for the voice chat application. + * A web client that connects to a Sesame AI voice chat server and enables + * real-time voice conversation with an AI assistant. */ +// Configuration constants +const SERVER_URL = window.location.hostname === 'localhost' ? 
+ 'http://localhost:5000' : window.location.origin; +const ENERGY_WINDOW_SIZE = 15; +const CLIENT_SILENCE_DURATION_MS = 750; + +// DOM elements +const elements = { + conversation: null, + streamButton: null, + clearButton: null, + thresholdSlider: null, + thresholdValue: null, + visualizerCanvas: null, + visualizerLabel: null, + volumeLevel: null, + statusDot: null, + statusText: null, + speakerSelection: null, + autoPlayResponses: null, + showVisualizer: null +}; + // Application state const state = { socket: null, audioContext: null, - streamProcessor: null, analyser: null, microphone: null, + streamProcessor: null, isStreaming: false, isSpeaking: false, - silenceTimer: null, - energyWindow: [], - currentSpeaker: 0, silenceThreshold: 0.01, - visualizerAnimationFrame: null, + energyWindow: [], + silenceTimer: null, volumeUpdateInterval: null, - connectionAttempts: 0 + visualizerAnimationFrame: null, + currentSpeaker: 0 }; -// Constants -const ENERGY_WINDOW_SIZE = 10; -const CLIENT_SILENCE_DURATION_MS = 1000; // 1 second of silence before processing -const MAX_CONNECTION_ATTEMPTS = 5; -const RECONNECTION_DELAY_MS = 2000; - -// DOM elements -const elements = { - conversation: document.getElementById('conversation'), - speakerSelect: document.getElementById('speakerSelect'), - streamButton: document.getElementById('streamButton'), - clearButton: document.getElementById('clearButton'), - statusDot: document.getElementById('statusDot'), - statusText: document.getElementById('statusText'), - visualizerCanvas: document.getElementById('audioVisualizer'), - visualizerLabel: document.getElementById('visualizerLabel'), - thresholdSlider: document.getElementById('thresholdSlider'), - thresholdValue: document.getElementById('thresholdValue'), - volumeLevel: document.getElementById('volumeLevel'), - autoPlayResponses: document.getElementById('autoPlayResponses'), - showVisualizer: document.getElementById('showVisualizer') -}; - -// Visualization variables -let canvasContext; -let visualizerBufferLength; -let visualizerDataArray; +// Visualizer variables +let canvasContext = null; +let visualizerBufferLength = 0; +let visualizerDataArray = null; // Initialize the application function initializeApp() { - // Set up event listeners - elements.streamButton.addEventListener('click', toggleStreaming); - elements.clearButton.addEventListener('click', clearConversation); - elements.thresholdSlider.addEventListener('input', updateThreshold); - elements.speakerSelect.addEventListener('change', () => { - state.currentSpeaker = parseInt(elements.speakerSelect.value); - }); - elements.showVisualizer.addEventListener('change', toggleVisualizerVisibility); - - // Initialize audio context - setupAudioContext(); + // Initialize the UI elements + initializeUIElements(); - // Set up visualization + // Initialize socket.io connection + setupSocketConnection(); + + // Setup event listeners + setupEventListeners(); + + // Initialize visualizer setupVisualizer(); - // Connect to Socket.IO server - connectToServer(); - - // Add welcome message - addSystemMessage('Welcome to Sesame AI Voice Chat! Click "Start Conversation" to begin speaking.'); + // Show welcome message + addSystemMessage('Welcome to Sesame AI Voice Chat! 
Click "Start Conversation" to begin.'); } -// Connect to Socket.IO server -function connectToServer() { - try { - // Use the server URL with or without a specific port - const serverUrl = window.location.origin; +// Initialize UI elements +function initializeUIElements() { + // Main UI containers + const chatContainer = document.querySelector('.chat-container'); + const controlPanel = document.querySelector('.control-panel'); + + // Create conversation section + chatContainer.innerHTML = ` +
+        <!-- Conversation panel markup: "Conversation" header, message list, connection status ("Disconnected") -->
+    `;
+
+    // Create control panel
+    controlPanel.innerHTML = `
+        <!-- Audio visualizer canvas ("Speak to see audio visualization") and volume meter -->
- updateStatus('Connecting...', 'connecting'); - console.log(`Connecting to Socket.IO server at ${serverUrl}`); - - state.socket = io(serverUrl, { - reconnectionDelay: RECONNECTION_DELAY_MS, - reconnectionDelayMax: 5000, - reconnectionAttempts: MAX_CONNECTION_ATTEMPTS - }); - - setupSocketListeners(); - } catch (error) { - console.error('Error connecting to server:', error); - updateStatus('Connection failed. Retrying...', 'error'); - - // Try to reconnect - if (state.connectionAttempts < MAX_CONNECTION_ATTEMPTS) { - state.connectionAttempts++; - setTimeout(connectToServer, RECONNECTION_DELAY_MS); - } else { - updateStatus('Could not connect to server', 'error'); - addSystemMessage('Failed to connect to the server. Please check your connection and refresh the page.'); - } - } +
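The removed connectToServer() above configured bounded client-side reconnection; the new setupSocketConnection() in this patch calls io(SERVER_URL) with library defaults. If the bounded-retry behaviour is still wanted, a sketch using standard Socket.IO client options (values mirror the removed MAX_CONNECTION_ATTEMPTS and RECONNECTION_DELAY_MS constants; this is not part of the patch):

// Bounded reconnection, roughly equivalent to the removed connectToServer() logic.
const socket = io(SERVER_URL, {
    reconnectionAttempts: 5,     // was MAX_CONNECTION_ATTEMPTS
    reconnectionDelay: 2000,     // was RECONNECTION_DELAY_MS (ms)
    reconnectionDelayMax: 5000
});

// Reconnection lifecycle events are emitted by the underlying Manager.
socket.io.on('reconnect_attempt', attempt => console.log(`Reconnect attempt ${attempt}`));
socket.io.on('reconnect_failed', () => console.warn('Could not reconnect to the server'));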
+        <!-- "Voice Controls": speaker select, silence-threshold slider (0.01), start/stop and clear buttons -->
+        <!-- "Settings": auto-play responses and show-visualizer toggles -->
+
+ `; + + // Store references to UI elements + elements.conversation = document.querySelector('.conversation'); + elements.streamButton = document.getElementById('streamButton'); + elements.clearButton = document.getElementById('clearButton'); + elements.thresholdSlider = document.getElementById('thresholdSlider'); + elements.thresholdValue = document.getElementById('thresholdValue'); + elements.visualizerCanvas = document.getElementById('audioVisualizer'); + elements.visualizerLabel = document.querySelector('.visualizer-label'); + elements.volumeLevel = document.querySelector('.volume-level'); + elements.statusDot = document.querySelector('.status-dot'); + elements.statusText = document.querySelector('.status-text'); + elements.speakerSelection = document.getElementById('speakerSelection'); + elements.autoPlayResponses = document.getElementById('autoPlayResponses'); + elements.showVisualizer = document.getElementById('showVisualizer'); } -// Set up Socket.IO event listeners -function setupSocketListeners() { - if (!state.socket) return; +// Setup Socket.IO connection +function setupSocketConnection() { + state.socket = io(SERVER_URL); + // Connection events state.socket.on('connect', () => { - console.log('Connected to Socket.IO server'); - updateStatus('Connected', 'connected'); - state.connectionAttempts = 0; - elements.streamButton.disabled = false; - addSystemMessage('Connected to server'); + console.log('Connected to server'); + updateConnectionStatus(true); }); state.socket.on('disconnect', () => { - console.log('Disconnected from Socket.IO server'); - updateStatus('Disconnected', 'disconnected'); + console.log('Disconnected from server'); + updateConnectionStatus(false); // Stop streaming if active if (state.isStreaming) { - stopStreaming(false); // false = don't send to server + stopStreaming(false); } - - elements.streamButton.disabled = true; - addSystemMessage('Disconnected from server. 
Trying to reconnect...'); - }); - - state.socket.on('status', (data) => { - console.log('Status:', data); - addSystemMessage(data.message); }); state.socket.on('error', (data) => { - console.error('Server error:', data); + console.error('Socket error:', data.message); addSystemMessage(`Error: ${data.message}`); }); + // Register message handlers state.socket.on('audio_response', handleAudioResponse); state.socket.on('transcription', handleTranscription); state.socket.on('context_updated', handleContextUpdate); state.socket.on('streaming_status', handleStreamingStatus); +} + +// Setup event listeners +function setupEventListeners() { + // Stream button + elements.streamButton.addEventListener('click', toggleStreaming); - state.socket.on('connect_error', (error) => { - console.error('Connection error:', error); - updateStatus('Connection Error', 'error'); + // Clear button + elements.clearButton.addEventListener('click', clearConversation); + + // Threshold slider + elements.thresholdSlider.addEventListener('input', updateThreshold); + + // Speaker selection + elements.speakerSelection.addEventListener('change', () => { + state.currentSpeaker = parseInt(elements.speakerSelection.value, 10); }); -} - -// Update the connection status in the UI -function updateStatus(message, status) { - elements.statusText.textContent = message; - elements.statusDot.className = 'status-dot'; - if (status === 'connected') { - elements.statusDot.classList.add('active'); - } else if (status === 'connecting') { - elements.statusDot.style.backgroundColor = '#FFA500'; - } else if (status === 'error') { - elements.statusDot.style.backgroundColor = '#F44336'; - } + // Visualizer toggle + elements.showVisualizer.addEventListener('change', toggleVisualizerVisibility); } -// Set up audio context -function setupAudioContext() { - try { - state.audioContext = new (window.AudioContext || window.webkitAudioContext)(); - console.log('Audio context initialized'); - } catch (err) { - console.error('Error setting up audio context:', err); - addSystemMessage(`Audio context error: ${err.message}`); - elements.streamButton.disabled = true; - } -} - -// Set up audio visualizer +// Setup audio visualizer function setupVisualizer() { + if (!elements.visualizerCanvas) return; + canvasContext = elements.visualizerCanvas.getContext('2d'); - // Set canvas size to match container - function resizeCanvas() { - const container = elements.visualizerCanvas.parentElement; - elements.visualizerCanvas.width = container.clientWidth; - elements.visualizerCanvas.height = container.clientHeight; - } + // Set canvas dimensions + elements.visualizerCanvas.width = elements.visualizerCanvas.offsetWidth; + elements.visualizerCanvas.height = elements.visualizerCanvas.offsetHeight; - // Call initially and on window resize - resizeCanvas(); - window.addEventListener('resize', resizeCanvas); - - // Create placeholder data array - visualizerBufferLength = 128; - visualizerDataArray = new Uint8Array(visualizerBufferLength); + // Initialize the visualizer + drawVisualizer(); } -// Toggle stream on/off +// Update connection status UI +function updateConnectionStatus(isConnected) { + elements.statusDot.classList.toggle('active', isConnected); + elements.statusText.textContent = isConnected ? 
'Connected' : 'Disconnected'; +} + +// Toggle streaming state function toggleStreaming() { if (state.isStreaming) { - stopStreaming(true); // true = send to server + stopStreaming(true); } else { startStreaming(); } } // Start streaming audio to the server -async function startStreaming() { - if (!state.socket || !state.socket.connected) { - addSystemMessage('Cannot start conversation: Not connected to server'); - return; - } +function startStreaming() { + if (state.isStreaming) return; - try { - // Request microphone access - const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); - - // Update state - state.isStreaming = true; - state.isSpeaking = false; - state.energyWindow = []; - state.currentSpeaker = parseInt(elements.speakerSelect.value); - - // Update UI - elements.streamButton.innerHTML = ' Listening...'; - elements.streamButton.classList.add('recording'); - elements.visualizerLabel.style.opacity = '0'; - - // Set up audio processing - setupAudioProcessing(stream); - - // Start volume meter updates - state.volumeUpdateInterval = setInterval(updateVolumeMeter, 100); - - addSystemMessage('Listening - speak naturally and pause when finished'); - - } catch (err) { - console.error('Error starting audio stream:', err); - addSystemMessage(`Microphone error: ${err.message}`); - cleanupAudioResources(); - } -} - -// Set up audio processing pipeline -function setupAudioProcessing(stream) { - // Store microphone stream for later cleanup - state.microphone = stream; - - // Create source from microphone - const source = state.audioContext.createMediaStreamSource(stream); - - // Setup analyzer for visualization - state.analyser = state.audioContext.createAnalyser(); - state.analyser.fftSize = 256; - state.analyser.smoothingTimeConstant = 0.8; - state.analyser.minDecibels = -90; - state.analyser.maxDecibels = -10; - - visualizerBufferLength = state.analyser.frequencyBinCount; - visualizerDataArray = new Uint8Array(visualizerBufferLength); - - // Connect source to analyzer - source.connect(state.analyser); - - // Start visualization - if (state.visualizerAnimationFrame) { - cancelAnimationFrame(state.visualizerAnimationFrame); - } - drawVisualizer(); - - // Setup audio processor - state.streamProcessor = state.audioContext.createScriptProcessor(4096, 1, 1); - - // Connect audio nodes - source.connect(state.streamProcessor); - state.streamProcessor.connect(state.audioContext.destination); - - // Process audio - state.streamProcessor.onaudioprocess = handleAudioProcess; -} - -// Handle each frame of audio data -function handleAudioProcess(e) { - const audioData = e.inputBuffer.getChannelData(0); - - // Calculate energy (volume) for silence detection - const energy = calculateAudioEnergy(audioData); - updateEnergyWindow(energy); - - // Check if currently silent - const avgEnergy = calculateAverageEnergy(); - const isSilent = avgEnergy < state.silenceThreshold; - - // Handle silence/speech transitions - handleSpeechState(isSilent); - - // Process and send audio - const downsampled = downsampleBuffer(audioData, state.audioContext.sampleRate, 24000); - sendAudioChunk(downsampled, state.currentSpeaker); + // Request microphone access + navigator.mediaDevices.getUserMedia({ audio: true, video: false }) + .then(stream => { + // Show processing state while setting up + elements.streamButton.innerHTML = ' Initializing...'; + + // Create audio context + state.audioContext = new (window.AudioContext || window.webkitAudioContext)(); + + // Create microphone source + state.microphone = 
state.audioContext.createMediaStreamSource(stream); + + // Create analyser for visualizer + state.analyser = state.audioContext.createAnalyser(); + state.analyser.fftSize = 256; + visualizerBufferLength = state.analyser.frequencyBinCount; + visualizerDataArray = new Uint8Array(visualizerBufferLength); + + // Connect microphone to analyser + state.microphone.connect(state.analyser); + + // Create script processor for audio processing + const bufferSize = 4096; + state.streamProcessor = state.audioContext.createScriptProcessor(bufferSize, 1, 1); + + // Set up audio processing callback + state.streamProcessor.onaudioprocess = handleAudioProcess; + + // Connect the processors + state.analyser.connect(state.streamProcessor); + state.streamProcessor.connect(state.audioContext.destination); + + // Update UI + state.isStreaming = true; + elements.streamButton.innerHTML = ' Listening...'; + elements.streamButton.classList.add('recording'); + + // Initialize energy window + state.energyWindow = []; + + // Start volume meter updates + state.volumeUpdateInterval = setInterval(updateVolumeMeter, 100); + + // Start visualizer if enabled + if (elements.showVisualizer.checked && !state.visualizerAnimationFrame) { + drawVisualizer(); + } + + // Show starting message + addSystemMessage('Listening... Speak clearly into your microphone.'); + + // Notify the server that we're starting + state.socket.emit('stream_audio', { + audio: '', + speaker: state.currentSpeaker + }); + }) + .catch(err => { + console.error('Error accessing microphone:', err); + addSystemMessage(`Error: ${err.message}. Please make sure your microphone is connected and you've granted permission.`); + elements.streamButton.innerHTML = ' Start Conversation'; + }); } // Stop streaming audio -function stopStreaming(sendToServer = true) { - // Cleanup audio resources - cleanupAudioResources(); +function stopStreaming(notifyServer = true) { + if (!state.isStreaming) return; - // Reset state - state.isStreaming = false; - state.isSpeaking = false; - state.energyWindow = []; - - // Update UI + // Update UI first elements.streamButton.innerHTML = ' Start Conversation'; - elements.streamButton.classList.remove('recording', 'processing'); - elements.streamButton.style.backgroundColor = ''; - elements.volumeLevel.style.width = '100%'; + elements.streamButton.classList.remove('recording'); + elements.streamButton.classList.remove('processing'); - // Clear volume meter updates + // Stop volume meter updates if (state.volumeUpdateInterval) { clearInterval(state.volumeUpdateInterval); state.volumeUpdateInterval = null; } - addSystemMessage('Conversation paused'); - - // Notify server - if (sendToServer && state.socket && state.socket.connected) { - state.socket.emit('stop_streaming', { - speaker: state.currentSpeaker - }); - } -} - -// Clean up audio processing resources -function cleanupAudioResources() { - // Stop microphone stream - if (state.microphone) { - state.microphone.getTracks().forEach(track => track.stop()); - state.microphone = null; - } - - // Disconnect audio processor + // Stop all audio processing if (state.streamProcessor) { state.streamProcessor.disconnect(); - state.streamProcessor.onaudioprocess = null; state.streamProcessor = null; } - // Disconnect analyzer if (state.analyser) { state.analyser.disconnect(); - state.analyser = null; } - // Cancel visualizer animation + if (state.microphone) { + state.microphone.disconnect(); + } + + // Close audio context + if (state.audioContext && state.audioContext.state !== 'closed') { + 
state.audioContext.close().catch(err => console.warn('Error closing audio context:', err)); + } + + // Cleanup animation frames + if (state.visualizerAnimationFrame) { + cancelAnimationFrame(state.visualizerAnimationFrame); + state.visualizerAnimationFrame = null; + } + + // Reset state + state.isStreaming = false; + state.isSpeaking = false; + + // Notify the server + if (notifyServer && state.socket && state.socket.connected) { + state.socket.emit('stop_streaming', { + speaker: state.currentSpeaker + }); + } + + // Show message + addSystemMessage('Conversation paused. Click "Start Conversation" to resume.'); +} + +// Handle audio processing +function handleAudioProcess(event) { + const inputData = event.inputBuffer.getChannelData(0); + + // Calculate audio energy (volume level) + const energy = calculateAudioEnergy(inputData); + + // Update energy window for averaging + updateEnergyWindow(energy); + + // Calculate average energy + const avgEnergy = calculateAverageEnergy(); + + // Determine if audio is silent + const isSilent = avgEnergy < state.silenceThreshold; + + // Handle speech state based on silence + handleSpeechState(isSilent); + + // Only send audio chunk if we detect speech + if (!isSilent) { + // Create a resampled version at 24kHz for the server + // Most WebRTC audio is 48kHz, but we want 24kHz for the model + const resampledData = downsampleBuffer(inputData, state.audioContext.sampleRate, 24000); + + // Send the audio chunk to the server + sendAudioChunk(resampledData, state.currentSpeaker); + } +} + +// Cleanup audio resources when done +function cleanupAudioResources() { + // Stop all audio processing + if (state.streamProcessor) { + state.streamProcessor.disconnect(); + state.streamProcessor = null; + } + + if (state.analyser) { + state.analyser.disconnect(); + state.analyser = null; + } + + if (state.microphone) { + state.microphone.disconnect(); + state.microphone = null; + } + + // Close audio context + if (state.audioContext && state.audioContext.state !== 'closed') { + state.audioContext.close().catch(err => console.warn('Error closing audio context:', err)); + } + + // Cancel all timers and animation frames + if (state.volumeUpdateInterval) { + clearInterval(state.volumeUpdateInterval); + state.volumeUpdateInterval = null; + } + if (state.visualizerAnimationFrame) { cancelAnimationFrame(state.visualizerAnimationFrame); state.visualizerAnimationFrame = null; } - // Cancel silence timer if (state.silenceTimer) { clearTimeout(state.silenceTimer); state.silenceTimer = null; } - - // Reset visualizer display - if (canvasContext) { - canvasContext.clearRect(0, 0, elements.visualizerCanvas.width, elements.visualizerCanvas.height); - elements.visualizerLabel.style.opacity = '0.7'; - } } // Clear conversation history function clearConversation() { - // Clear UI - elements.conversation.innerHTML = ''; - addSystemMessage('Conversation cleared'); - - // Notify server - if (state.socket && state.socket.connected) { - state.socket.emit('clear_context'); + if (elements.conversation) { + elements.conversation.innerHTML = ''; + addSystemMessage('Conversation cleared.'); + + // Notify server to clear context + if (state.socket && state.socket.connected) { + state.socket.emit('clear_context'); + } } } @@ -390,9 +447,9 @@ function clearConversation() { function calculateAudioEnergy(buffer) { let sum = 0; for (let i = 0; i < buffer.length; i++) { - sum += Math.abs(buffer[i]); + sum += buffer[i] * buffer[i]; } - return sum / buffer.length; + return Math.sqrt(sum / buffer.length); } // 
Update energy window for averaging @@ -406,7 +463,9 @@ function updateEnergyWindow(energy) { // Calculate average energy from window function calculateAverageEnergy() { if (state.energyWindow.length === 0) return 0; - return state.energyWindow.reduce((sum, val) => sum + val, 0) / state.energyWindow.length; + + const sum = state.energyWindow.reduce((a, b) => a + b, 0); + return sum / state.energyWindow.length; } // Update the threshold from the slider @@ -417,32 +476,26 @@ function updateThreshold() { // Update the volume meter display function updateVolumeMeter() { - if (!state.isStreaming || !state.analyser) return; + if (!state.isStreaming || !state.energyWindow.length) return; - // Get current volume level - const dataArray = new Uint8Array(state.analyser.frequencyBinCount); - state.analyser.getByteFrequencyData(dataArray); + const avgEnergy = calculateAverageEnergy(); - // Calculate average volume - let sum = 0; - for (let i = 0; i < dataArray.length; i++) { - sum += dataArray[i]; - } - const average = sum / dataArray.length; + // Scale energy to percentage (0-100) + // Typically, energy values will be very small (e.g., 0.001 to 0.1) + // So we multiply by a factor to make it more visible + const scaleFactor = 1000; + const percentage = Math.min(100, Math.max(0, avgEnergy * scaleFactor)); - // Normalize to 0-100% - const percentage = Math.min(100, Math.max(0, average / 128 * 100)); - - // Invert because we're showing the "empty" portion - elements.volumeLevel.style.width = (100 - percentage) + '%'; + // Update volume meter width + elements.volumeLevel.style.width = `${percentage}%`; // Change color based on level if (percentage > 70) { - elements.volumeLevel.style.backgroundColor = 'rgba(244, 67, 54, 0.5)'; // Red + elements.volumeLevel.style.backgroundColor = '#ff5252'; } else if (percentage > 30) { - elements.volumeLevel.style.backgroundColor = 'rgba(255, 235, 59, 0.5)'; // Yellow + elements.volumeLevel.style.backgroundColor = '#4CAF50'; } else { - elements.volumeLevel.style.backgroundColor = 'rgba(0, 0, 0, 0.5)'; // Dark + elements.volumeLevel.style.backgroundColor = '#4c84ff'; } } @@ -452,31 +505,16 @@ function handleSpeechState(isSilent) { // Transition from speaking to silence if (!state.silenceTimer) { state.silenceTimer = setTimeout(() => { - // Silence persisted long enough - process the audio - elements.streamButton.innerHTML = ' Processing...'; - elements.streamButton.classList.remove('recording'); - elements.streamButton.classList.add('processing'); - addSystemMessage('Detected pause in speech, processing response...'); + // Only consider it a real silence after a certain duration + // This prevents detecting brief pauses as the end of speech + state.isSpeaking = false; + state.silenceTimer = null; }, CLIENT_SILENCE_DURATION_MS); } - } else if (!state.isSpeaking && !isSilent) { - // Transition from silence to speaking - state.isSpeaking = true; - elements.streamButton.innerHTML = ' Listening...'; - elements.streamButton.classList.add('recording'); - elements.streamButton.classList.remove('processing'); - - // Clear silence timer - if (state.silenceTimer) { - clearTimeout(state.silenceTimer); - state.silenceTimer = null; - } - } else if (state.isSpeaking && !isSilent) { - // Still speaking, reset silence timer - if (state.silenceTimer) { - clearTimeout(state.silenceTimer); - state.silenceTimer = null; - } + } else if (state.silenceTimer && !isSilent) { + // User started speaking again, cancel the silence timer + clearTimeout(state.silenceTimer); + state.silenceTimer = null; 
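Taken together, the energy window and silence timer above form a small end-of-utterance detector: average the last ENERGY_WINDOW_SIZE buffer energies and only treat the user as finished once that average stays below the threshold for CLIENT_SILENCE_DURATION_MS. A self-contained sketch of the same idea (function name and returned state strings are illustrative, not part of the patch):

// Tracks speech vs. silence from per-buffer RMS energies.
// update() returns 'speaking', 'pending-silence', or 'utterance-ended'.
function createUtteranceDetector({ windowSize = 15, holdMs = 750, threshold = 0.01 } = {}) {
    const energies = [];
    let silentSince = null;

    return function update(rmsEnergy, now = Date.now()) {
        energies.push(rmsEnergy);
        if (energies.length > windowSize) energies.shift();

        const avg = energies.reduce((a, b) => a + b, 0) / energies.length;
        if (avg >= threshold) {
            silentSince = null;                       // still speaking
            return 'speaking';
        }
        if (silentSince === null) silentSince = now;  // silence just started
        return (now - silentSince) >= holdMs ? 'utterance-ended' : 'pending-silence';
    };
}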
} // Update speaking state for non-silent audio @@ -488,7 +526,7 @@ function handleSpeechState(isSilent) { // Send audio chunk to server function sendAudioChunk(audioData, speaker) { if (!state.socket || !state.socket.connected) { - console.warn('Cannot send audio: socket not connected'); + console.warn('Socket not connected'); return; } @@ -498,10 +536,10 @@ function sendAudioChunk(audioData, speaker) { reader.onloadend = function() { const base64data = reader.result; - // Send to server using Socket.IO + // Send the audio chunk to the server state.socket.emit('stream_audio', { - speaker: speaker, - audio: base64data + audio: base64data, + speaker: speaker }); }; @@ -531,7 +569,7 @@ function drawVisualizer() { try { state.analyser.getByteFrequencyData(visualizerDataArray); } catch (e) { - console.error("Error getting frequency data:", e); + console.warn('Error getting frequency data:', e); } } else { // Fade out when not streaming From 6a8cc50dac2eb99eb8095e13e261f846bfd7612f Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sat, 29 Mar 2025 23:28:44 -0400 Subject: [PATCH 09/16] serve voice chat js --- Backend/server.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Backend/server.py b/Backend/server.py index e986606..4e60aa7 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -160,6 +160,10 @@ def favicon(): return send_from_directory(static_dir, 'favicon.ico') return Response(status=204) +@app.route('/voice-chat.js') +def voice_chat_js(): + return send_from_directory(base_dir, 'voice-chat.js') + @app.route('/static/') def serve_static(path): return send_from_directory(static_dir, path) From b74ae2dbfc449913e669e2c54e76e973ad63eb6f Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sat, 29 Mar 2025 23:43:16 -0400 Subject: [PATCH 10/16] Demo Update 3 --- Backend/server.py | 62 ++++++++-- Backend/voice-chat.js | 275 +++++++++++++++++++++--------------------- 2 files changed, 188 insertions(+), 149 deletions(-) diff --git a/Backend/server.py b/Backend/server.py index 4e60aa7..bacf793 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -55,27 +55,71 @@ active_clients = {} # Map client_id to client context def decode_audio_data(audio_data: str) -> torch.Tensor: """Decode base64 audio data to a torch tensor""" try: + # Skip empty audio data + if not audio_data: + print("Empty audio data received") + return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence + # Extract the actual base64 content if ',' in audio_data: audio_data = audio_data.split(',')[1] - + # Decode base64 audio data - binary_data = base64.b64decode(audio_data) + try: + binary_data = base64.b64decode(audio_data) + print(f"Decoded base64 data: {len(binary_data)} bytes") + except Exception as e: + print(f"Base64 decoding error: {str(e)}") + return torch.zeros(generator.sample_rate // 2) + # Debug: save the raw binary data to examine with external tools + debug_path = os.path.join(base_dir, "debug_incoming.wav") + with open(debug_path, 'wb') as f: + f.write(binary_data) + print(f"Saved debug file to {debug_path}") + # Load audio from binary data - with BytesIO(binary_data) as temp_file: - audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav") + try: + with BytesIO(binary_data) as temp_file: + audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav") + print(f"Loaded audio: shape={audio_tensor.shape}, sample_rate={sample_rate}Hz") + + # Check if audio is valid + if audio_tensor.numel() == 0 or torch.isnan(audio_tensor).any(): + print("Warning: Empty or invalid audio data 
detected") + return torch.zeros(generator.sample_rate // 2) + except Exception as e: + print(f"Audio loading error: {str(e)}") + # Try saving to a temporary file instead of loading from BytesIO + try: + temp_path = os.path.join(base_dir, "temp_incoming.wav") + with open(temp_path, 'wb') as f: + f.write(binary_data) + print(f"Trying to load from file: {temp_path}") + audio_tensor, sample_rate = torchaudio.load(temp_path, format="wav") + print(f"Loaded from file: shape={audio_tensor.shape}, sample_rate={sample_rate}Hz") + os.remove(temp_path) + except Exception as e2: + print(f"Secondary audio loading error: {str(e2)}") + return torch.zeros(generator.sample_rate // 2) # Resample if needed if sample_rate != generator.sample_rate: - audio_tensor = torchaudio.functional.resample( - audio_tensor.squeeze(0), - orig_freq=sample_rate, - new_freq=generator.sample_rate - ) + try: + print(f"Resampling from {sample_rate}Hz to {generator.sample_rate}Hz") + audio_tensor = torchaudio.functional.resample( + audio_tensor.squeeze(0), + orig_freq=sample_rate, + new_freq=generator.sample_rate + ) + print(f"Resampled audio shape: {audio_tensor.shape}") + except Exception as e: + print(f"Resampling error: {str(e)}") + return torch.zeros(generator.sample_rate // 2) else: audio_tensor = audio_tensor.squeeze(0) + print(f"Final audio tensor shape: {audio_tensor.shape}") return audio_tensor except Exception as e: print(f"Error decoding audio: {str(e)}") diff --git a/Backend/voice-chat.js b/Backend/voice-chat.js index a4e10f5..c85da8a 100644 --- a/Backend/voice-chat.js +++ b/Backend/voice-chat.js @@ -70,88 +70,18 @@ function initializeApp() { // Initialize UI elements function initializeUIElements() { - // Main UI containers - const chatContainer = document.querySelector('.chat-container'); - const controlPanel = document.querySelector('.control-panel'); - - // Create conversation section - chatContainer.innerHTML = ` -
-        <!-- Conversation panel markup: "Conversation" header, message list, connection status ("Disconnected") -->
-    `;
-
-    // Create control panel
-    controlPanel.innerHTML = `
-        <!-- Audio visualizer canvas ("Speak to see audio visualization") and volume meter -->
-        <!-- "Voice Controls": speaker select, silence-threshold slider (0.01), start/stop and clear buttons -->
-        <!-- "Settings": auto-play responses and show-visualizer toggles -->
-
- `; - // Store references to UI elements - elements.conversation = document.querySelector('.conversation'); + elements.conversation = document.getElementById('conversation'); elements.streamButton = document.getElementById('streamButton'); elements.clearButton = document.getElementById('clearButton'); elements.thresholdSlider = document.getElementById('thresholdSlider'); elements.thresholdValue = document.getElementById('thresholdValue'); elements.visualizerCanvas = document.getElementById('audioVisualizer'); - elements.visualizerLabel = document.querySelector('.visualizer-label'); - elements.volumeLevel = document.querySelector('.volume-level'); - elements.statusDot = document.querySelector('.status-dot'); - elements.statusText = document.querySelector('.status-text'); - elements.speakerSelection = document.getElementById('speakerSelection'); + elements.visualizerLabel = document.getElementById('visualizerLabel'); + elements.volumeLevel = document.getElementById('volumeLevel'); + elements.statusDot = document.getElementById('statusDot'); + elements.statusText = document.getElementById('statusText'); + elements.speakerSelection = document.getElementById('speakerSelect'); // Changed to match HTML elements.autoPlayResponses = document.getElementById('autoPlayResponses'); elements.showVisualizer = document.getElementById('showVisualizer'); } @@ -364,8 +294,12 @@ function stopStreaming(notifyServer = true) { function handleAudioProcess(event) { const inputData = event.inputBuffer.getChannelData(0); + // Log audio buffer statistics + console.log(`Audio buffer: length=${inputData.length}, sample rate=${state.audioContext.sampleRate}Hz`); + // Calculate audio energy (volume level) const energy = calculateAudioEnergy(inputData); + console.log(`Energy: ${energy.toFixed(6)}, threshold: ${state.silenceThreshold}`); // Update energy window for averaging updateEnergyWindow(energy); @@ -375,6 +309,7 @@ function handleAudioProcess(event) { // Determine if audio is silent const isSilent = avgEnergy < state.silenceThreshold; + console.log(`Silent: ${isSilent ? 'Yes' : 'No'}, avg energy: ${avgEnergy.toFixed(6)}`); // Handle speech state based on silence handleSpeechState(isSilent); @@ -384,6 +319,7 @@ function handleAudioProcess(event) { // Create a resampled version at 24kHz for the server // Most WebRTC audio is 48kHz, but we want 24kHz for the model const resampledData = downsampleBuffer(inputData, state.audioContext.sampleRate, 24000); + console.log(`Resampled audio: ${state.audioContext.sampleRate}Hz → 24000Hz, new length: ${resampledData.length}`); // Send the audio chunk to the server sendAudioChunk(resampledData, state.currentSpeaker); @@ -530,20 +466,132 @@ function sendAudioChunk(audioData, speaker) { return; } - const wavData = createWavBlob(audioData, 24000); - const reader = new FileReader(); + console.log(`Creating WAV from audio data: length=${audioData.length}`); - reader.onloadend = function() { - const base64data = reader.result; + // Check for NaN or invalid values + let hasNaN = false; + let min = Infinity; + let max = -Infinity; + let sum = 0; + + for (let i = 0; i < audioData.length; i++) { + if (isNaN(audioData[i]) || !isFinite(audioData[i])) { + hasNaN = true; + console.warn(`Invalid audio value at index ${i}: ${audioData[i]}`); + break; + } + min = Math.min(min, audioData[i]); + max = Math.max(max, audioData[i]); + sum += audioData[i]; + } + + if (hasNaN) { + console.warn('Audio data contains NaN or Infinity values. 
Creating silent audio instead.'); + audioData = new Float32Array(audioData.length).fill(0); + } else { + const avg = sum / audioData.length; + console.log(`Audio stats: min=${min.toFixed(4)}, max=${max.toFixed(4)}, avg=${avg.toFixed(4)}`); + } + + try { + // Create WAV blob with proper format + const wavData = createWavBlob(audioData, 24000); + console.log(`WAV blob created: size=${wavData.size} bytes, type=${wavData.type}`); - // Send the audio chunk to the server - state.socket.emit('stream_audio', { - audio: base64data, - speaker: speaker - }); - }; + const reader = new FileReader(); + + reader.onloadend = function() { + try { + // Get base64 data + const base64data = reader.result; + console.log(`Base64 data created: length=${base64data.length}`); + + // Validate the base64 data before sending + if (!base64data || base64data.length < 100) { + console.warn('Generated base64 data is too small or invalid'); + return; + } + + // Send the audio chunk to the server + console.log('Sending audio data to server...'); + state.socket.emit('stream_audio', { + audio: base64data, + speaker: speaker + }); + console.log('Audio data sent successfully'); + } catch (err) { + console.error('Error preparing audio data:', err); + } + }; + + reader.onerror = function(err) { + console.error('Error reading audio data:', err); + }; + + reader.readAsDataURL(wavData); + } catch (err) { + console.error('Error creating WAV data:', err); + } +} + +// Create WAV blob from audio data with validation +function createWavBlob(audioData, sampleRate) { + // Check if audio data is valid + if (!audioData || audioData.length === 0) { + console.warn('Empty audio data received'); + // Return a tiny silent audio snippet instead + audioData = new Float32Array(100).fill(0); + } - reader.readAsDataURL(wavData); + // Function to convert Float32Array to Int16Array for WAV format + function floatTo16BitPCM(output, offset, input) { + for (let i = 0; i < input.length; i++, offset += 2) { + const s = Math.max(-1, Math.min(1, input[i])); + output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true); + } + } + + // Create WAV header + function writeString(view, offset, string) { + for (let i = 0; i < string.length; i++) { + view.setUint8(offset + i, string.charCodeAt(i)); + } + } + + // Create WAV file with header + function encodeWAV(samples) { + const buffer = new ArrayBuffer(44 + samples.length * 2); + const view = new DataView(buffer); + + // RIFF chunk descriptor + writeString(view, 0, 'RIFF'); + view.setUint32(4, 36 + samples.length * 2, true); + writeString(view, 8, 'WAVE'); + + // fmt sub-chunk + writeString(view, 12, 'fmt '); + view.setUint32(16, 16, true); + view.setUint16(20, 1, true); // PCM format + view.setUint16(22, 1, true); // Mono channel + view.setUint32(24, sampleRate, true); + view.setUint32(28, sampleRate * 2, true); // Byte rate + view.setUint16(32, 2, true); // Block align + view.setUint16(34, 16, true); // Bits per sample + + // data sub-chunk + writeString(view, 36, 'data'); + view.setUint32(40, samples.length * 2, true); + floatTo16BitPCM(view, 44, samples); + + return buffer; + } + + // Convert audio data to TypedArray if it's a regular Array + const samples = Array.isArray(audioData) ? 
new Float32Array(audioData) : audioData; + + // Create WAV blob + const wavBuffer = encodeWAV(samples); + return new Blob([wavBuffer], { type: 'audio/wav' }); } // Draw audio visualizer @@ -757,59 +805,6 @@ function addSystemMessage(message) { elements.conversation.scrollTop = elements.conversation.scrollHeight; } -// Create WAV blob from audio data -function createWavBlob(audioData, sampleRate) { - // Function to convert Float32Array to Int16Array for WAV format - function floatTo16BitPCM(output, offset, input) { - for (let i = 0; i < input.length; i++, offset += 2) { - const s = Math.max(-1, Math.min(1, input[i])); - output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true); - } - } - - // Create WAV header - function writeString(view, offset, string) { - for (let i = 0; i < string.length; i++) { - view.setUint8(offset + i, string.charCodeAt(i)); - } - } - - // Create WAV file with header - function encodeWAV(samples) { - const buffer = new ArrayBuffer(44 + samples.length * 2); - const view = new DataView(buffer); - - // RIFF chunk descriptor - writeString(view, 0, 'RIFF'); - view.setUint32(4, 36 + samples.length * 2, true); - writeString(view, 8, 'WAVE'); - - // fmt sub-chunk - writeString(view, 12, 'fmt '); - view.setUint32(16, 16, true); - view.setUint16(20, 1, true); // PCM format - view.setUint16(22, 1, true); // Mono channel - view.setUint32(24, sampleRate, true); - view.setUint32(28, sampleRate * 2, true); // Byte rate - view.setUint16(32, 2, true); // Block align - view.setUint16(34, 16, true); // Bits per sample - - // data sub-chunk - writeString(view, 36, 'data'); - view.setUint32(40, samples.length * 2, true); - floatTo16BitPCM(view, 44, samples); - - return buffer; - } - - // Convert audio data to TypedArray if it's a regular Array - const samples = Array.isArray(audioData) ? 
new Float32Array(audioData) : audioData; - - // Create WAV blob - const wavBuffer = encodeWAV(samples); - return new Blob([wavBuffer], { type: 'audio/wav' }); -} - // Downsample audio buffer to target sample rate function downsampleBuffer(buffer, originalSampleRate, targetSampleRate) { if (originalSampleRate === targetSampleRate) { From eef7da454a082220b6d106558baf1f36f69aac73 Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sat, 29 Mar 2025 23:54:02 -0400 Subject: [PATCH 11/16] Demo Update 3 --- Backend/server.py | 296 ++++++++++++++++++++++++++++++++++-------- Backend/voice-chat.js | 136 +++++++++++-------- 2 files changed, 320 insertions(+), 112 deletions(-) diff --git a/Backend/server.py b/Backend/server.py index bacf793..b638e99 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -16,6 +16,28 @@ import gc from collections import deque from threading import Lock +# Add these lines right after your imports +import torch +import os + +# Handle CUDA issues +os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit to first GPU only +torch.backends.cudnn.benchmark = True + +# Set CUDA settings to avoid TF32 warnings +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True + +# Set compute type based on available hardware +if torch.cuda.is_available(): + device = "cuda" + compute_type = "float16" # Faster for CUDA +else: + device = "cpu" + compute_type = "int8" # Better for CPU + +print(f"Using device: {device} with compute type: {compute_type}") + # Select device if torch.cuda.is_available(): device = "cuda" @@ -28,9 +50,22 @@ generator = load_csm_1b(device=device) # Initialize WhisperX for ASR print("Loading WhisperX model...") -# Use a smaller model for faster response times -asr_model = whisperx.load_model("medium", device, compute_type="float16") -print("WhisperX model loaded!") +try: + # Try to load a smaller model for faster response times + asr_model = whisperx.load_model("small", device, compute_type=compute_type) + print("WhisperX 'small' model loaded successfully") +except Exception as e: + print(f"Error loading 'small' model: {str(e)}") + try: + # Fall back to tiny model if small fails + asr_model = whisperx.load_model("tiny", device, compute_type=compute_type) + print("WhisperX 'tiny' model loaded as fallback") + except Exception as e2: + print(f"Error loading fallback model: {str(e2)}") + print("Trying CPU model as last resort") + # Last resort - try CPU + asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") + print("WhisperX loaded on CPU as last resort") # Silence detection parameters SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization @@ -53,76 +88,130 @@ active_clients = {} # Map client_id to client context # Helper function to convert audio data def decode_audio_data(audio_data: str) -> torch.Tensor: - """Decode base64 audio data to a torch tensor""" + """Decode base64 audio data to a torch tensor with improved error handling""" try: # Skip empty audio data - if not audio_data: - print("Empty audio data received") + if not audio_data or len(audio_data) < 100: + print("Empty or too short audio data received") return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence # Extract the actual base64 content if ',' in audio_data: + # Handle data URL format (data:audio/wav;base64,...) 
audio_data = audio_data.split(',')[1] # Decode base64 audio data try: binary_data = base64.b64decode(audio_data) print(f"Decoded base64 data: {len(binary_data)} bytes") + + # Check if we have enough data for a valid WAV + if len(binary_data) < 44: # WAV header is 44 bytes + print("Data too small to be a valid WAV file") + return torch.zeros(generator.sample_rate // 2) except Exception as e: print(f"Base64 decoding error: {str(e)}") return torch.zeros(generator.sample_rate // 2) - # Debug: save the raw binary data to examine with external tools + # Save for debugging debug_path = os.path.join(base_dir, "debug_incoming.wav") with open(debug_path, 'wb') as f: f.write(binary_data) - print(f"Saved debug file to {debug_path}") - - # Load audio from binary data + print(f"Saved debug file: {debug_path}") + + # Approach 1: Load directly with torchaudio try: with BytesIO(binary_data) as temp_file: + temp_file.seek(0) # Ensure we're at the start of the buffer audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav") - print(f"Loaded audio: shape={audio_tensor.shape}, sample_rate={sample_rate}Hz") + print(f"Direct loading success: shape={audio_tensor.shape}, rate={sample_rate}Hz") # Check if audio is valid if audio_tensor.numel() == 0 or torch.isnan(audio_tensor).any(): - print("Warning: Empty or invalid audio data detected") - return torch.zeros(generator.sample_rate // 2) + raise ValueError("Empty or invalid audio tensor detected") except Exception as e: - print(f"Audio loading error: {str(e)}") - # Try saving to a temporary file instead of loading from BytesIO + print(f"Direct loading failed: {str(e)}") + + # Approach 2: Try to fix/normalize the WAV data try: - temp_path = os.path.join(base_dir, "temp_incoming.wav") + # Sometimes WAV headers can be malformed, attempt to fix + temp_path = os.path.join(base_dir, "temp_fixing.wav") with open(temp_path, 'wb') as f: f.write(binary_data) - print(f"Trying to load from file: {temp_path}") - audio_tensor, sample_rate = torchaudio.load(temp_path, format="wav") - print(f"Loaded from file: shape={audio_tensor.shape}, sample_rate={sample_rate}Hz") - os.remove(temp_path) + + # Use a simpler numpy approach as backup + import numpy as np + import wave + + try: + with wave.open(temp_path, 'rb') as wf: + n_channels = wf.getnchannels() + sample_width = wf.getsampwidth() + sample_rate = wf.getframerate() + n_frames = wf.getnframes() + + # Read the frames + frames = wf.readframes(n_frames) + print(f"Wave reading: channels={n_channels}, rate={sample_rate}Hz, frames={n_frames}") + + # Convert to numpy and then to torch + if sample_width == 2: # 16-bit audio + data = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0 + elif sample_width == 1: # 8-bit audio + data = np.frombuffer(frames, dtype=np.uint8).astype(np.float32) / 128.0 - 1.0 + else: + raise ValueError(f"Unsupported sample width: {sample_width}") + + # Convert to mono if needed + if n_channels > 1: + data = data.reshape(-1, n_channels) + data = data.mean(axis=1) + + # Convert to torch tensor + audio_tensor = torch.from_numpy(data) + print(f"Successfully converted with numpy: shape={audio_tensor.shape}") + except Exception as wave_error: + print(f"Wave processing failed: {str(wave_error)}") + # Try with torchaudio as last resort + audio_tensor, sample_rate = torchaudio.load(temp_path, format="wav") + + # Clean up + if os.path.exists(temp_path): + os.remove(temp_path) except Exception as e2: - print(f"Secondary audio loading error: {str(e2)}") + print(f"All WAV loading methods failed: 
{str(e2)}") + print("Returning silence as fallback") return torch.zeros(generator.sample_rate // 2) + # Ensure audio is the right shape (mono) + if len(audio_tensor.shape) > 1 and audio_tensor.shape[0] > 1: + audio_tensor = torch.mean(audio_tensor, dim=0) + + # Ensure we have a 1D tensor + audio_tensor = audio_tensor.squeeze() + # Resample if needed if sample_rate != generator.sample_rate: try: print(f"Resampling from {sample_rate}Hz to {generator.sample_rate}Hz") - audio_tensor = torchaudio.functional.resample( - audio_tensor.squeeze(0), + resampler = torchaudio.transforms.Resample( orig_freq=sample_rate, new_freq=generator.sample_rate ) - print(f"Resampled audio shape: {audio_tensor.shape}") + audio_tensor = resampler(audio_tensor) except Exception as e: print(f"Resampling error: {str(e)}") - return torch.zeros(generator.sample_rate // 2) - else: - audio_tensor = audio_tensor.squeeze(0) - - print(f"Final audio tensor shape: {audio_tensor.shape}") + # If resampling fails, just return the original audio + # The model can often handle different sample rates + + # Normalize audio to avoid issues + if torch.abs(audio_tensor).max() > 0: + audio_tensor = audio_tensor / torch.abs(audio_tensor).max() + + print(f"Final audio tensor: shape={audio_tensor.shape}, min={audio_tensor.min().item():.4f}, max={audio_tensor.max().item():.4f}") return audio_tensor except Exception as e: - print(f"Error decoding audio: {str(e)}") + print(f"Unhandled error in decode_audio_data: {str(e)}") # Return a small silent audio segment as fallback return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence @@ -143,6 +232,8 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str: temp_path = os.path.join(base_dir, "temp_audio.wav") torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate) + print(f"Transcribing audio file: {temp_path} (size: {os.path.getsize(temp_path)} bytes)") + # Load and transcribe the audio audio = whisperx.load_audio(temp_path) result = asr_model.transcribe(audio, batch_size=16) @@ -155,11 +246,15 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str: if result["segments"] and len(result["segments"]) > 0: # Combine all segments transcription = " ".join([segment["text"] for segment in result["segments"]]) + print(f"Transcription successful: '{transcription.strip()}'") return transcription.strip() else: + print("Transcription returned no segments") return "" except Exception as e: print(f"Error in transcription: {str(e)}") + import traceback + traceback.print_exc() if os.path.exists("temp_audio.wav"): os.remove("temp_audio.wav") return "" @@ -385,43 +480,73 @@ def handle_stream_audio(data): # Log the transcription print(f"[{client_id}] Transcribed text: '{transcribed_text}'") - # Add to conversation context + # Handle the transcription result if transcribed_text: + # Add user message to context user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) client['context_segments'].append(user_segment) - # Generate a contextual response - response_text = generate_response(transcribed_text, client['context_segments']) - # Send the transcribed text to client emit('transcription', { 'type': 'transcription', 'text': transcribed_text }) - # Generate audio for the response - audio_tensor = generator.generate( - text=response_text, - speaker=1 if speaker_id == 0 else 0, # Use opposite speaker - context=client['context_segments'], - max_audio_length_ms=10_000, - ) + # Generate a contextual response + response_text = 
generate_response(transcribed_text, client['context_segments']) + print(f"[{client_id}] Generating audio response: '{response_text}'") - # Add response to context - ai_segment = Segment( - text=response_text, - speaker=1 if speaker_id == 0 else 0, - audio=audio_tensor - ) - client['context_segments'].append(ai_segment) - - # Convert audio to base64 and send back to client - audio_base64 = encode_audio_data(audio_tensor) - emit('audio_response', { - 'type': 'audio_response', - 'text': response_text, - 'audio': audio_base64 + # Let the client know we're processing + emit('processing_status', { + 'type': 'processing_status', + 'status': 'generating_audio', + 'message': 'Generating audio response...' }) + + # Generate audio for the response + try: + # Use a different speaker than the user + ai_speaker_id = 1 if speaker_id == 0 else 0 + + # Start audio generation with streaming (chunk by chunk) + audio_chunks = [] + + # This version tries to stream the audio generation in smaller chunks + # Note: CSM model doesn't natively support incremental generation, + # so we're simulating it here for a more responsive UI experience + + # Generate the full response + audio_tensor = generator.generate( + text=response_text, + speaker=ai_speaker_id, + context=client['context_segments'], + max_audio_length_ms=10_000, + ) + + # Add response to context + ai_segment = Segment( + text=response_text, + speaker=ai_speaker_id, + audio=audio_tensor + ) + client['context_segments'].append(ai_segment) + + # Convert audio to base64 and send back to client + audio_base64 = encode_audio_data(audio_tensor) + emit('audio_response', { + 'type': 'audio_response', + 'text': response_text, + 'audio': audio_base64 + }) + + print(f"[{client_id}] Audio response sent: {len(audio_base64)} bytes") + + except Exception as gen_error: + print(f"Error generating audio response: {str(gen_error)}") + emit('error', { + 'type': 'error', + 'message': "Sorry, there was an error generating the audio response." 
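# A minimal sketch of a Python Socket.IO client driving the 'generate' and
# 'audio_response' events used here; assumes the python-socketio package and a
# server URL of http://localhost:5000 (the port is an assumption, not taken from
# this patch).
import socketio

sio = socketio.Client()

@sio.on('audio_response')
def on_audio_response(data):
    # data['audio'] is the base64 WAV data URL produced by encode_audio_data()
    print(f"Got response '{data.get('text', '')}' ({len(data['audio'])} chars of audio)")

@sio.on('error')
def on_error(data):
    print("Server error:", data.get('message'))

sio.connect('http://localhost:5000')
sio.emit('generate', {'text': 'Hello there!', 'speaker': 0})
sio.wait()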
+ }) else: # If transcription failed, send a generic response emit('error', { @@ -437,6 +562,7 @@ def handle_stream_audio(data): # If buffer gets too large without silence, process it anyway elif len(client['streaming_buffer']) >= 30: # ~6 seconds of audio at 5 chunks/sec + print(f"[{client_id}] Processing long audio segment without silence") full_audio = torch.cat(client['streaming_buffer'], dim=0) # Process with WhisperX speech-to-text @@ -453,7 +579,9 @@ def handle_stream_audio(data): 'text': transcribed_text + " (processing continued speech...)" }) - client['streaming_buffer'] = [] + # Keep half of the buffer for context (sliding window approach) + half_point = len(client['streaming_buffer']) // 2 + client['streaming_buffer'] = client['streaming_buffer'][half_point:] except Exception as e: import traceback @@ -497,6 +625,62 @@ def handle_stop_streaming(data): 'status': 'stopped' }) +def stream_audio_to_client(client_id, audio_tensor, text, speaker_id, chunk_size_ms=500): + """Stream audio to client in chunks to simulate real-time generation""" + try: + if client_id not in active_clients: + print(f"Client {client_id} not found for streaming") + return + + # Calculate chunk size in samples + chunk_size = int(generator.sample_rate * chunk_size_ms / 1000) + total_chunks = math.ceil(audio_tensor.size(0) / chunk_size) + + print(f"Streaming audio in {total_chunks} chunks of {chunk_size_ms}ms each") + + # Send initial response with text but no audio yet + socketio.emit('audio_response_start', { + 'type': 'audio_response_start', + 'text': text, + 'total_chunks': total_chunks + }, room=client_id) + + # Stream each chunk + for i in range(total_chunks): + start_idx = i * chunk_size + end_idx = min(start_idx + chunk_size, audio_tensor.size(0)) + + # Extract chunk + chunk = audio_tensor[start_idx:end_idx] + + # Encode chunk + chunk_base64 = encode_audio_data(chunk) + + # Send chunk + socketio.emit('audio_response_chunk', { + 'type': 'audio_response_chunk', + 'chunk_index': i, + 'total_chunks': total_chunks, + 'audio': chunk_base64, + 'is_last': i == total_chunks - 1 + }, room=client_id) + + # Brief pause between chunks to simulate streaming + time.sleep(0.1) + + # Send completion message + socketio.emit('audio_response_complete', { + 'type': 'audio_response_complete', + 'text': text + }, room=client_id) + + print(f"Audio streaming complete: {total_chunks} chunks sent") + + except Exception as e: + print(f"Error streaming audio to client: {str(e)}") + import traceback + traceback.print_exc() + if __name__ == "__main__": print(f"\n{'='*60}") print(f"🔊 Sesame AI Voice Chat Server (Flask Implementation)") diff --git a/Backend/voice-chat.js b/Backend/voice-chat.js index c85da8a..b224b27 100644 --- a/Backend/voice-chat.js +++ b/Backend/voice-chat.js @@ -466,37 +466,27 @@ function sendAudioChunk(audioData, speaker) { return; } - console.log(`Creating WAV from audio data: length=${audioData.length}`); + console.log(`Preparing audio chunk: length=${audioData.length}, speaker=${speaker}`); // Check for NaN or invalid values - let hasNaN = false; - let min = Infinity; - let max = -Infinity; - let sum = 0; - + let hasInvalidValues = false; for (let i = 0; i < audioData.length; i++) { if (isNaN(audioData[i]) || !isFinite(audioData[i])) { - hasNaN = true; + hasInvalidValues = true; console.warn(`Invalid audio value at index ${i}: ${audioData[i]}`); break; } - min = Math.min(min, audioData[i]); - max = Math.max(max, audioData[i]); - sum += audioData[i]; } - if (hasNaN) { - console.warn('Audio data contains NaN or 
Infinity values. Creating silent audio instead.'); + if (hasInvalidValues) { + console.warn('Audio data contains invalid values. Creating silent audio.'); audioData = new Float32Array(audioData.length).fill(0); - } else { - const avg = sum / audioData.length; - console.log(`Audio stats: min=${min.toFixed(4)}, max=${max.toFixed(4)}, avg=${avg.toFixed(4)}`); } try { - // Create WAV blob with proper format + // Create WAV blob const wavData = createWavBlob(audioData, 24000); - console.log(`WAV blob created: size=${wavData.size} bytes, type=${wavData.type}`); + console.log(`WAV blob created: ${wavData.size} bytes`); const reader = new FileReader(); @@ -504,28 +494,21 @@ function sendAudioChunk(audioData, speaker) { try { // Get base64 data const base64data = reader.result; - console.log(`Base64 data created: length=${base64data.length}`); + console.log(`Base64 data created: ${base64data.length} bytes`); - // Validate the base64 data before sending - if (!base64data || base64data.length < 100) { - console.warn('Generated base64 data is too small or invalid'); - return; - } - - // Send the audio chunk to the server - console.log('Sending audio data to server...'); + // Send to server state.socket.emit('stream_audio', { audio: base64data, speaker: speaker }); - console.log('Audio data sent successfully'); + console.log('Audio chunk sent to server'); } catch (err) { console.error('Error preparing audio data:', err); } }; - reader.onerror = function(err) { - console.error('Error reading audio data:', err); + reader.onerror = function() { + console.error('Error reading audio data as base64'); }; reader.readAsDataURL(wavData); @@ -534,19 +517,20 @@ function sendAudioChunk(audioData, speaker) { } } -// Create WAV blob from audio data with validation +// Create WAV blob from audio data with improved error handling function createWavBlob(audioData, sampleRate) { - // Check if audio data is valid + // Validate input if (!audioData || audioData.length === 0) { - console.warn('Empty audio data received'); - // Return a tiny silent audio snippet instead - audioData = new Float32Array(100).fill(0); + console.warn('Empty audio data provided to createWavBlob'); + audioData = new Float32Array(1024).fill(0); // Create 1024 samples of silence } // Function to convert Float32Array to Int16Array for WAV format function floatTo16BitPCM(output, offset, input) { for (let i = 0; i < input.length; i++, offset += 2) { + // Ensure values are in -1 to 1 range const s = Math.max(-1, Math.min(1, input[i])); + // Convert to 16-bit PCM output.setInt16(offset, s < 0 ? 
s * 0x8000 : s * 0x7FFF, true); } } @@ -558,40 +542,80 @@ function createWavBlob(audioData, sampleRate) { } } - // Create WAV file with header - function encodeWAV(samples) { - const buffer = new ArrayBuffer(44 + samples.length * 2); + try { + // Create WAV file with header - careful with buffer sizes + const buffer = new ArrayBuffer(44 + audioData.length * 2); const view = new DataView(buffer); - // RIFF chunk descriptor + // RIFF identifier writeString(view, 0, 'RIFF'); - view.setUint32(4, 36 + samples.length * 2, true); + + // File length (will be filled later) + view.setUint32(4, 36 + audioData.length * 2, true); + + // WAVE identifier writeString(view, 8, 'WAVE'); - // fmt sub-chunk + // fmt chunk identifier writeString(view, 12, 'fmt '); + + // fmt chunk length view.setUint32(16, 16, true); - view.setUint16(20, 1, true); // PCM format - view.setUint16(22, 1, true); // Mono channel + + // Sample format (1 is PCM) + view.setUint16(20, 1, true); + + // Mono channel + view.setUint16(22, 1, true); + + // Sample rate view.setUint32(24, sampleRate, true); - view.setUint32(28, sampleRate * 2, true); // Byte rate - view.setUint16(32, 2, true); // Block align - view.setUint16(34, 16, true); // Bits per sample - // data sub-chunk + // Byte rate (sample rate * block align) + view.setUint32(28, sampleRate * 2, true); + + // Block align (channels * bytes per sample) + view.setUint16(32, 2, true); + + // Bits per sample + view.setUint16(34, 16, true); + + // data chunk identifier writeString(view, 36, 'data'); - view.setUint32(40, samples.length * 2, true); - floatTo16BitPCM(view, 44, samples); - return buffer; + // data chunk length + view.setUint32(40, audioData.length * 2, true); + + // Write the PCM samples + floatTo16BitPCM(view, 44, audioData); + + // Create and return blob + return new Blob([view], { type: 'audio/wav' }); + } catch (err) { + console.error('Error in createWavBlob:', err); + + // Create a minimal valid WAV file with silence as fallback + const fallbackSamples = new Float32Array(1024).fill(0); + const fallbackBuffer = new ArrayBuffer(44 + fallbackSamples.length * 2); + const fallbackView = new DataView(fallbackBuffer); + + writeString(fallbackView, 0, 'RIFF'); + fallbackView.setUint32(4, 36 + fallbackSamples.length * 2, true); + writeString(fallbackView, 8, 'WAVE'); + writeString(fallbackView, 12, 'fmt '); + fallbackView.setUint32(16, 16, true); + fallbackView.setUint16(20, 1, true); + fallbackView.setUint16(22, 1, true); + fallbackView.setUint32(24, sampleRate, true); + fallbackView.setUint32(28, sampleRate * 2, true); + fallbackView.setUint16(32, 2, true); + fallbackView.setUint16(34, 16, true); + writeString(fallbackView, 36, 'data'); + fallbackView.setUint32(40, fallbackSamples.length * 2, true); + floatTo16BitPCM(fallbackView, 44, fallbackSamples); + + return new Blob([fallbackView], { type: 'audio/wav' }); } - - // Convert audio data to TypedArray if it's a regular Array - const samples = Array.isArray(audioData) ? 
new Float32Array(audioData) : audioData; - - // Create WAV blob - const wavBuffer = encodeWAV(samples); - return new Blob([wavBuffer], { type: 'audio/wav' }); } // Draw audio visualizer From 230117a0225b9df857810defbcfa9487a3bf6755 Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sun, 30 Mar 2025 00:14:47 -0400 Subject: [PATCH 12/16] Demo Update 4 --- Backend/server.py | 150 +++++++++++++++++++++++++++++++++------------- 1 file changed, 107 insertions(+), 43 deletions(-) diff --git a/Backend/server.py b/Backend/server.py index b638e99..a6b70a3 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -16,56 +16,91 @@ import gc from collections import deque from threading import Lock -# Add these lines right after your imports -import torch -import os +# Add this at the top of your file, replacing your current CUDA setup -# Handle CUDA issues -os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit to first GPU only -torch.backends.cudnn.benchmark = True - -# Set CUDA settings to avoid TF32 warnings -torch.backends.cuda.matmul.allow_tf32 = True -torch.backends.cudnn.allow_tf32 = True - -# Set compute type based on available hardware -if torch.cuda.is_available(): - device = "cuda" - compute_type = "float16" # Faster for CUDA -else: +# CUDA setup with robust error handling +try: + # Handle CUDA issues + os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit to first GPU only + + # Try enabling TF32 precision + try: + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + except: + pass # Ignore if not supported + + # Check if CUDA is available + if torch.cuda.is_available(): + try: + # Test CUDA functionality + x = torch.rand(10, device="cuda") + y = x + x + del x, y + device = "cuda" + compute_type = "float16" + print("CUDA is fully functional") + except Exception as cuda_error: + print(f"CUDA is available but not working correctly: {str(cuda_error)}") + device = "cpu" + compute_type = "int8" + else: + device = "cpu" + compute_type = "int8" +except Exception as e: + print(f"Error setting up CUDA: {str(e)}") device = "cpu" - compute_type = "int8" # Better for CPU + compute_type = "int8" print(f"Using device: {device} with compute type: {compute_type}") -# Select device -if torch.cuda.is_available(): - device = "cuda" -else: - device = "cpu" -print(f"Using device: {device}") +# Initialize the Sesame CSM model with robust error handling +try: + print(f"Loading Sesame CSM model on {device}...") + generator = load_csm_1b(device=device) + print("Sesame CSM model loaded successfully") +except Exception as model_error: + print(f"Error loading Sesame CSM on {device}: {str(model_error)}") + if device == "cuda": + # Try on CPU as fallback + try: + print("Trying to load Sesame CSM on CPU instead...") + device = "cpu" # Update global device setting + generator = load_csm_1b(device="cpu") + print("Sesame CSM model loaded on CPU successfully") + except Exception as cpu_error: + print(f"Fatal error - could not load Sesame CSM model: {str(cpu_error)}") + raise RuntimeError("Failed to load speech synthesis model") + else: + # Already tried CPU and it failed + raise RuntimeError("Failed to load speech synthesis model on any device") -# Initialize the model -generator = load_csm_1b(device=device) - -# Initialize WhisperX for ASR +# Initialize WhisperX for ASR with robust error handling print("Loading WhisperX model...") try: - # Try to load a smaller model for faster response times - asr_model = whisperx.load_model("small", device, compute_type=compute_type) - print("WhisperX 'small' model 
loaded successfully") + # First try the smallest model ("tiny") to avoid memory issues + asr_model = whisperx.load_model("tiny", device, compute_type=compute_type) + print("WhisperX 'tiny' model loaded successfully") + + # If tiny worked and we have CUDA, try upgrading to small + if device == "cuda": + try: + asr_model = whisperx.load_model("small", device, compute_type=compute_type) + print("WhisperX 'small' model loaded successfully") + except Exception as upgrade_error: + print(f"Staying with 'tiny' model: {str(upgrade_error)}") except Exception as e: - print(f"Error loading 'small' model: {str(e)}") + print(f"Error loading models on {device}: {str(e)}") + print("Falling back to CPU model") try: - # Fall back to tiny model if small fails - asr_model = whisperx.load_model("tiny", device, compute_type=compute_type) - print("WhisperX 'tiny' model loaded as fallback") - except Exception as e2: - print(f"Error loading fallback model: {str(e2)}") - print("Trying CPU model as last resort") - # Last resort - try CPU + # Force CPU as last resort + device = "cpu" + compute_type = "int8" asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") print("WhisperX loaded on CPU as last resort") + except Exception as cpu_error: + print(f"Fatal error - could not load any model: {str(cpu_error)}") + raise RuntimeError("No ASR model could be loaded. Please check your CUDA installation.") # Silence detection parameters SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization @@ -226,7 +261,7 @@ def encode_audio_data(audio_tensor: torch.Tensor) -> str: def transcribe_audio(audio_tensor: torch.Tensor) -> str: - """Transcribe audio using WhisperX""" + """Transcribe audio using WhisperX with robust error handling""" try: # Save the tensor to a temporary file temp_path = os.path.join(base_dir, "temp_audio.wav") @@ -234,9 +269,38 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str: print(f"Transcribing audio file: {temp_path} (size: {os.path.getsize(temp_path)} bytes)") - # Load and transcribe the audio - audio = whisperx.load_audio(temp_path) - result = asr_model.transcribe(audio, batch_size=16) + # Load the audio file using whisperx's function + try: + audio = whisperx.load_audio(temp_path) + except Exception as audio_load_error: + print(f"WhisperX load_audio failed: {str(audio_load_error)}") + # Fall back to manual loading + import soundfile as sf + audio, sr = sf.read(temp_path) + if sr != 16000: # WhisperX expects 16kHz audio + from scipy import signal + audio = signal.resample(audio, int(len(audio) * 16000 / sr)) + + # Transcribe with error handling for CUDA issues + try: + # Try with original device + result = asr_model.transcribe(audio, batch_size=8) + except RuntimeError as cuda_error: + if "CUDA" in str(cuda_error) or "libcudnn" in str(cuda_error): + print(f"CUDA error in transcription, falling back to CPU: {str(cuda_error)}") + + # Try to load a CPU model as fallback + try: + global asr_model + # Move model to CPU and try again + asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") + result = asr_model.transcribe(audio, batch_size=1) + except Exception as e: + print(f"CPU fallback also failed: {str(e)}") + return "I'm having trouble processing audio right now." 
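# A minimal sketch of the "retry on CPU when CUDA fails" pattern above, factored
# into a reusable helper; the helper name and the reload callback are illustrative,
# not part of this patch.
def run_with_cpu_fallback(transcribe, audio, reload_cpu_model):
    """Run transcribe(audio); on a CUDA/cuDNN RuntimeError, reload a CPU model and retry."""
    try:
        return transcribe(audio)
    except RuntimeError as e:
        if "CUDA" not in str(e) and "libcudnn" not in str(e):
            raise  # not a CUDA problem, let it propagate
        cpu_model = reload_cpu_model()  # e.g. whisperx.load_model("tiny", "cpu", compute_type="int8")
        return cpu_model.transcribe(audio, batch_size=1)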
+ else: + # Re-raise if it's not a CUDA error + raise # Clean up if os.path.exists(temp_path): @@ -257,7 +321,7 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str: traceback.print_exc() if os.path.exists("temp_audio.wav"): os.remove("temp_audio.wav") - return "" + return "I heard something but couldn't understand it." def generate_response(text: str, conversation_history: List[Segment]) -> str: From bb5e0c4765f010d4bd313d1b4d7198e43c764ac5 Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sun, 30 Mar 2025 00:17:39 -0400 Subject: [PATCH 13/16] Demo Fixes 1 --- Backend/server.py | 60 +++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/Backend/server.py b/Backend/server.py index a6b70a3..d0dee80 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -75,32 +75,51 @@ except Exception as model_error: # Already tried CPU and it failed raise RuntimeError("Failed to load speech synthesis model on any device") +# Replace the WhisperX model loading section + # Initialize WhisperX for ASR with robust error handling print("Loading WhisperX model...") +asr_model = None # Initialize to None first to avoid scope issues + try: - # First try the smallest model ("tiny") to avoid memory issues - asr_model = whisperx.load_model("tiny", device, compute_type=compute_type) - print("WhisperX 'tiny' model loaded successfully") + # Always start with the tiny model on CPU for stability + asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") + print("WhisperX 'tiny' model loaded on CPU successfully") - # If tiny worked and we have CUDA, try upgrading to small + # If CPU works, try CUDA if available if device == "cuda": try: - asr_model = whisperx.load_model("small", device, compute_type=compute_type) - print("WhisperX 'small' model loaded successfully") - except Exception as upgrade_error: - print(f"Staying with 'tiny' model: {str(upgrade_error)}") + print("Trying to load WhisperX on CUDA...") + cuda_model = whisperx.load_model("tiny", "cuda", compute_type="float16") + # Test the model to ensure it works + test_audio = torch.zeros(16000) # 1 second of silence at 16kHz + _ = cuda_model.transcribe(test_audio.numpy(), batch_size=1) + # If we get here, CUDA works + asr_model = cuda_model + print("WhisperX model moved to CUDA successfully") + + # Try to upgrade to small model on CUDA + try: + small_model = whisperx.load_model("small", "cuda", compute_type="float16") + # Test it + _ = small_model.transcribe(test_audio.numpy(), batch_size=1) + asr_model = small_model + print("WhisperX 'small' model loaded on CUDA successfully") + except Exception as upgrade_error: + print(f"Staying with 'tiny' model on CUDA: {str(upgrade_error)}") + except Exception as cuda_error: + print(f"CUDA loading failed, staying with CPU model: {str(cuda_error)}") except Exception as e: - print(f"Error loading models on {device}: {str(e)}") - print("Falling back to CPU model") - try: - # Force CPU as last resort - device = "cpu" - compute_type = "int8" - asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") - print("WhisperX loaded on CPU as last resort") - except Exception as cpu_error: - print(f"Fatal error - could not load any model: {str(cpu_error)}") - raise RuntimeError("No ASR model could be loaded. 
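# A compact variant of the tiered model loading above (best model first, then
# smaller/CPU fallbacks); the helper name is illustrative and the model names and
# compute types mirror the ones used in this patch.
import whisperx

def load_best_asr(device: str):
    candidates = [("small", "cuda", "float16"), ("tiny", "cuda", "float16")] if device == "cuda" else []
    candidates.append(("tiny", "cpu", "int8"))  # always keep a CPU fallback
    for name, dev, compute_type in candidates:
        try:
            model = whisperx.load_model(name, dev, compute_type=compute_type)
            print(f"WhisperX '{name}' loaded on {dev}")
            return model
        except Exception as e:
            print(f"Could not load WhisperX '{name}' on {dev}: {e}")
    raise RuntimeError("No ASR model could be loaded")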
Please check your CUDA installation.") + print(f"Error loading WhisperX model: {str(e)}") + # Create a minimal dummy model as last resort + class DummyModel: + def __init__(self): + self.device = "cpu" + def transcribe(self, *args, **kwargs): + return {"segments": [{"text": "Speech recognition currently unavailable."}]} + + asr_model = DummyModel() + print("WARNING: Using dummy transcription model - ASR functionality limited") # Silence detection parameters SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization @@ -262,6 +281,8 @@ def encode_audio_data(audio_tensor: torch.Tensor) -> str: def transcribe_audio(audio_tensor: torch.Tensor) -> str: """Transcribe audio using WhisperX with robust error handling""" + global asr_model # Declare global at the beginning of the function + try: # Save the tensor to a temporary file temp_path = os.path.join(base_dir, "temp_audio.wav") @@ -291,7 +312,6 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str: # Try to load a CPU model as fallback try: - global asr_model # Move model to CPU and try again asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") result = asr_model.transcribe(audio, batch_size=1) From adbc3c89d69a04f542955ccb2bf963b2dc4b366a Mon Sep 17 00:00:00 2001 From: Surya Vemulapalli Date: Sun, 30 Mar 2025 00:20:15 -0400 Subject: [PATCH 14/16] Added mongoose --- React/bun.lock | 39 +++++++++++++++++++++++++ React/package.json | 1 + React/src/app/page.tsx | 66 +++++++++++++++++++++++++++++++++++------- 3 files changed, 96 insertions(+), 10 deletions(-) diff --git a/React/bun.lock b/React/bun.lock index dca1020..4b4fd3e 100644 --- a/React/bun.lock +++ b/React/bun.lock @@ -5,6 +5,7 @@ "name": "my-app", "dependencies": { "@auth0/nextjs-auth0": "^4.3.0", + "mongoose": "^8.13.1", "next": "15.2.4", "react": "^19.1.0", "react-dom": "^19.1.0", @@ -68,6 +69,8 @@ "@img/sharp-win32-x64": ["@img/sharp-win32-x64@0.33.5", "", { "os": "win32", "cpu": "x64" }, "sha512-MpY/o8/8kj+EcnxwvrP4aTJSWw/aZ7JIGR4aBeZkZw5B7/Jn+tY9/VNwtcoGmdT7GfggGIU4kygOMSbYnOrAbg=="], + "@mongodb-js/saslprep": ["@mongodb-js/saslprep@1.2.0", "", { "dependencies": { "sparse-bitfield": "^3.0.3" } }, "sha512-+ywrb0AqkfaYuhHs6LxKWgqbh3I72EpEgESCw37o+9qPx9WTCkgDm2B+eMrwehGtHBWHFU4GXvnSCNiFhhausg=="], + "@next/env": ["@next/env@15.2.4", "", {}, "sha512-+SFtMgoiYP3WoSswuNmxJOCwi06TdWE733D+WPjpXIe4LXGULwEaofiiAy6kbS0+XjM5xF5n3lKuBwN2SnqD9g=="], "@next/swc-darwin-arm64": ["@next/swc-darwin-arm64@15.2.4", "", { "os": "darwin", "cpu": "arm64" }, "sha512-1AnMfs655ipJEDC/FHkSr0r3lXBgpqKo4K1kiwfUf3iE68rDFXZ1TtHdMvf7D0hMItgDZ7Vuq3JgNMbt/+3bYw=="], @@ -130,6 +133,10 @@ "@types/react-dom": ["@types/react-dom@19.0.4", "", { "peerDependencies": { "@types/react": "^19.0.0" } }, "sha512-4fSQ8vWFkg+TGhePfUzVmat3eC14TXYSsiiDSLI0dVLsrm9gZFABjPy/Qu6TKgl1tq1Bu1yDsuQgY3A3DOjCcg=="], + "@types/webidl-conversions": ["@types/webidl-conversions@7.0.3", "", {}, "sha512-CiJJvcRtIgzadHCYXw7dqEnMNRjhGZlYK05Mj9OyktqV8uVT8fD2BFOB7S1uwBE3Kj2Z+4UyPmFw/Ixgw/LAlA=="], + + "@types/whatwg-url": ["@types/whatwg-url@11.0.5", "", { "dependencies": { "@types/webidl-conversions": "*" } }, "sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ=="], + "@zag-js/accordion": ["@zag-js/accordion@1.7.0", "", { "dependencies": { "@zag-js/anatomy": "1.7.0", "@zag-js/core": "1.7.0", "@zag-js/dom-query": "1.7.0", "@zag-js/types": "1.7.0", "@zag-js/utils": "1.7.0" } }, "sha512-LNJOjLTW2KwrToXBrXIbNIAiISA94n0AdWp14H8RrskdokywmEGiC0GgWTGEJ7DNA6TGP6Ae5o9rJ4fHSmCsDQ=="], 
"@zag-js/anatomy": ["@zag-js/anatomy@1.7.0", "", {}, "sha512-fkRgH6vPCwykmRdV38uAJeTtJc8tayAnURfoovHAtB9bK0goagPbpdcYTNyGn8msul0h+KBloOtnw4obvX0nPw=="], @@ -182,6 +189,8 @@ "@zag-js/utils": ["@zag-js/utils@1.7.0", "", {}, "sha512-yIxvH5V27a1WuLgCxHX7qpdtFo8vTJaZLafBpSNfVYG4B8FaxTE+P7JAcpmAzs3UyXura/WfAY2eVWWVBpk9ZA=="], + "bson": ["bson@6.10.3", "", {}, "sha512-MTxGsqgYTwfshYWTRdmZRC+M7FnG1b4y7RO7p2k3X24Wq0yv1m77Wsj0BzlPzd/IowgESfsruQCUToa7vbOpPQ=="], + "busboy": ["busboy@1.6.0", "", { "dependencies": { "streamsearch": "^1.1.0" } }, "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA=="], "caniuse-lite": ["caniuse-lite@1.0.30001707", "", {}, "sha512-3qtRjw/HQSMlDWf+X79N206fepf4SOOU6SQLMaq/0KkZLmSjPxAkBOQQ+FxbHKfHmYLZFfdWsO3KA90ceHPSnw=="], @@ -198,6 +207,8 @@ "csstype": ["csstype@3.1.3", "", {}, "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw=="], + "debug": ["debug@4.4.0", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-6WTZ/IxCY/T6BALoZHaE4ctp9xm+Z5kY/pzYaCHRFeyVhojxlrm+46y68HA6hr0TcwEssoxNiDEUJQjfPZ/RYA=="], + "dequal": ["dequal@2.0.3", "", {}, "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA=="], "detect-libc": ["detect-libc@2.0.3", "", {}, "sha512-bwy0MGW55bG41VqxxypOsdSdGqLwXPI/focwgTYCFMbdUiBAxLg9CFzG08sz2aqzknwiX7Hkl0bQENjg8iLByw=="], @@ -212,6 +223,8 @@ "jose": ["jose@5.10.0", "", {}, "sha512-s+3Al/p9g32Iq+oqXxkW//7jk2Vig6FF1CFqzVXoTUXt2qz89YWbL+OwS17NFYEvxC35n0FKeGO2LGYSxeM2Gg=="], + "kareem": ["kareem@2.6.3", "", {}, "sha512-C3iHfuGUXK2u8/ipq9LfjFfXFxAZMQJJq7vLS45r3D9Y2xQ/m4S8zaR4zMLFWh9AsNPXmcFfUDhTEO8UIC/V6Q=="], + "lightningcss": ["lightningcss@1.29.2", "", { "dependencies": { "detect-libc": "^2.0.3" }, "optionalDependencies": { "lightningcss-darwin-arm64": "1.29.2", "lightningcss-darwin-x64": "1.29.2", "lightningcss-freebsd-x64": "1.29.2", "lightningcss-linux-arm-gnueabihf": "1.29.2", "lightningcss-linux-arm64-gnu": "1.29.2", "lightningcss-linux-arm64-musl": "1.29.2", "lightningcss-linux-x64-gnu": "1.29.2", "lightningcss-linux-x64-musl": "1.29.2", "lightningcss-win32-arm64-msvc": "1.29.2", "lightningcss-win32-x64-msvc": "1.29.2" } }, "sha512-6b6gd/RUXKaw5keVdSEtqFVdzWnU5jMxTUjA2bVcMNPLwSQ08Sv/UodBVtETLCn7k4S1Ibxwh7k68IwLZPgKaA=="], "lightningcss-darwin-arm64": ["lightningcss-darwin-arm64@1.29.2", "", { "os": "darwin", "cpu": "arm64" }, "sha512-cK/eMabSViKn/PG8U/a7aCorpeKLMlK0bQeNHmdb7qUnBkNPnL+oV5DjJUo0kqWsJUapZsM4jCfYItbqBDvlcA=="], @@ -234,6 +247,20 @@ "lightningcss-win32-x64-msvc": ["lightningcss-win32-x64-msvc@1.29.2", "", { "os": "win32", "cpu": "x64" }, "sha512-EdIUW3B2vLuHmv7urfzMI/h2fmlnOQBk1xlsDxkN1tCWKjNFjfLhGxYk8C8mzpSfr+A6jFFIi8fU6LbQGsRWjA=="], + "memory-pager": ["memory-pager@1.5.0", "", {}, "sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg=="], + + "mongodb": ["mongodb@6.15.0", "", { "dependencies": { "@mongodb-js/saslprep": "^1.1.9", "bson": "^6.10.3", "mongodb-connection-string-url": "^3.0.0" }, "peerDependencies": { "@aws-sdk/credential-providers": "^3.188.0", "@mongodb-js/zstd": "^1.1.0 || ^2.0.0", "gcp-metadata": "^5.2.0", "kerberos": "^2.0.1", "mongodb-client-encryption": ">=6.0.0 <7", "snappy": "^7.2.2", "socks": "^2.7.1" }, "optionalPeers": ["@aws-sdk/credential-providers", "@mongodb-js/zstd", "gcp-metadata", "kerberos", "mongodb-client-encryption", "snappy", "socks"] }, "sha512-ifBhQ0rRzHDzqp9jAQP6OwHSH7dbYIQjD3SbJs9YYk9AikKEettW/9s/tbSFDTpXcRbF+u1aLrhHxDFaYtZpFQ=="], + 
+ "mongodb-connection-string-url": ["mongodb-connection-string-url@3.0.2", "", { "dependencies": { "@types/whatwg-url": "^11.0.2", "whatwg-url": "^14.1.0 || ^13.0.0" } }, "sha512-rMO7CGo/9BFwyZABcKAWL8UJwH/Kc2x0g72uhDWzG48URRax5TCIcJ7Rc3RZqffZzO/Gwff/jyKwCU9TN8gehA=="], + + "mongoose": ["mongoose@8.13.1", "", { "dependencies": { "bson": "^6.10.3", "kareem": "2.6.3", "mongodb": "~6.15.0", "mpath": "0.9.0", "mquery": "5.0.0", "ms": "2.1.3", "sift": "17.1.3" } }, "sha512-sRqlXI+6jhr9/KicCOjet1VVPONFsOxTrh14tfueX5y3GJ2ihswc5ewUUojuwdSS/5koGXLIPmGivDSApVXflA=="], + + "mpath": ["mpath@0.9.0", "", {}, "sha512-ikJRQTk8hw5DEoFVxHG1Gn9T/xcjtdnOKIU1JTmGjZZlg9LST2mBLmcX3/ICIbgJydT2GOc15RnNy5mHmzfSew=="], + + "mquery": ["mquery@5.0.0", "", { "dependencies": { "debug": "4.x" } }, "sha512-iQMncpmEK8R8ncT8HJGsGc9Dsp8xcgYMVSbs5jgnm1lFHTZqMJTUWTDx1LBO8+mK3tPNZWFLBghQEIOULSTHZg=="], + + "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], + "nanoid": ["nanoid@3.3.11", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w=="], "next": ["next@15.2.4", "", { "dependencies": { "@next/env": "15.2.4", "@swc/counter": "0.1.3", "@swc/helpers": "0.5.15", "busboy": "1.6.0", "caniuse-lite": "^1.0.30001579", "postcss": "8.4.31", "styled-jsx": "5.1.6" }, "optionalDependencies": { "@next/swc-darwin-arm64": "15.2.4", "@next/swc-darwin-x64": "15.2.4", "@next/swc-linux-arm64-gnu": "15.2.4", "@next/swc-linux-arm64-musl": "15.2.4", "@next/swc-linux-x64-gnu": "15.2.4", "@next/swc-linux-x64-musl": "15.2.4", "@next/swc-win32-arm64-msvc": "15.2.4", "@next/swc-win32-x64-msvc": "15.2.4", "sharp": "^0.33.5" }, "peerDependencies": { "@opentelemetry/api": "^1.1.0", "@playwright/test": "^1.41.2", "babel-plugin-react-compiler": "*", "react": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", "react-dom": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", "sass": "^1.3.0" }, "optionalPeers": ["@opentelemetry/api", "@playwright/test", "babel-plugin-react-compiler", "sass"], "bin": { "next": "dist/bin/next" } }, "sha512-VwL+LAaPSxEkd3lU2xWbgEOtrM8oedmyhBqaVNmgKB+GvZlCy9rgaEc+y2on0wv+l0oSFqLtYD6dcC1eAedUaQ=="], @@ -246,6 +273,8 @@ "proxy-compare": ["proxy-compare@3.0.1", "", {}, "sha512-V9plBAt3qjMlS1+nC8771KNf6oJ12gExvaxnNzN/9yVRLdTv/lc+oJlnSzrdYDAvBfTStPCoiaCOTmTs0adv7Q=="], + "punycode": ["punycode@2.3.1", "", {}, "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg=="], + "react": ["react@19.1.0", "", {}, "sha512-FS+XFBNvn3GTAWq26joslQgWNoFu08F4kl0J4CgdNKADkdSGXQyTCnKteIAJy96Br6YbpEU1LSzV5dYtjMkMDg=="], "react-dom": ["react-dom@19.1.0", "", { "dependencies": { "scheduler": "^0.26.0" }, "peerDependencies": { "react": "^19.1.0" } }, "sha512-Xs1hdnE+DyKgeHJeJznQmYMIBG3TKIHJJT95Q58nHLSrElKlGQqDTR2HQ9fx5CN/Gk6Vh/kupBTDLU11/nDk/g=="], @@ -256,10 +285,14 @@ "sharp": ["sharp@0.33.5", "", { "dependencies": { "color": "^4.2.3", "detect-libc": "^2.0.3", "semver": "^7.6.3" }, "optionalDependencies": { "@img/sharp-darwin-arm64": "0.33.5", "@img/sharp-darwin-x64": "0.33.5", "@img/sharp-libvips-darwin-arm64": "1.0.4", "@img/sharp-libvips-darwin-x64": "1.0.4", "@img/sharp-libvips-linux-arm": "1.0.5", "@img/sharp-libvips-linux-arm64": "1.0.4", "@img/sharp-libvips-linux-s390x": "1.0.4", "@img/sharp-libvips-linux-x64": "1.0.4", "@img/sharp-libvips-linuxmusl-arm64": "1.0.4", "@img/sharp-libvips-linuxmusl-x64": "1.0.4", "@img/sharp-linux-arm": "0.33.5", 
"@img/sharp-linux-arm64": "0.33.5", "@img/sharp-linux-s390x": "0.33.5", "@img/sharp-linux-x64": "0.33.5", "@img/sharp-linuxmusl-arm64": "0.33.5", "@img/sharp-linuxmusl-x64": "0.33.5", "@img/sharp-wasm32": "0.33.5", "@img/sharp-win32-ia32": "0.33.5", "@img/sharp-win32-x64": "0.33.5" } }, "sha512-haPVm1EkS9pgvHrQ/F3Xy+hgcuMV0Wm9vfIBSiwZ05k+xgb0PkBQpGsAA/oWdDobNaZTH5ppvHtzCFbnSEwHVw=="], + "sift": ["sift@17.1.3", "", {}, "sha512-Rtlj66/b0ICeFzYTuNvX/EF1igRbbnGSvEyT79McoZa/DeGhMyC5pWKOEsZKnpkqtSeovd5FL/bjHWC3CIIvCQ=="], + "simple-swizzle": ["simple-swizzle@0.2.2", "", { "dependencies": { "is-arrayish": "^0.3.1" } }, "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg=="], "source-map-js": ["source-map-js@1.2.1", "", {}, "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA=="], + "sparse-bitfield": ["sparse-bitfield@3.0.3", "", { "dependencies": { "memory-pager": "^1.0.2" } }, "sha512-kvzhi7vqKTfkh0PZU+2D2PIllw2ymqJKujUcyPMd9Y75Nv4nPbGJZXNhxsgdQab2BmlDct1YnfQCguEvHr7VsQ=="], + "streamsearch": ["streamsearch@1.1.0", "", {}, "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg=="], "styled-jsx": ["styled-jsx@5.1.6", "", { "dependencies": { "client-only": "0.0.1" }, "peerDependencies": { "react": ">= 16.8.0 || 17.x.x || ^18.0.0-0 || ^19.0.0-0" } }, "sha512-qSVyDTeMotdvQYoHWLNGwRFJHC+i+ZvdBRYosOFgC+Wg1vx4frN2/RG/NA7SYqqvKNLf39P2LSRA2pu6n0XYZA=="], @@ -270,6 +303,8 @@ "tapable": ["tapable@2.2.1", "", {}, "sha512-GNzQvQTOIP6RyTfE2Qxb8ZVlNmw0n88vp1szwWRimP02mnTsx3Wtn5qRdqY9w2XduFNUgvOwhNnQsjwCp+kqaQ=="], + "tr46": ["tr46@5.1.0", "", { "dependencies": { "punycode": "^2.3.1" } }, "sha512-IUWnUK7ADYR5Sl1fZlO1INDUhVhatWl7BtJWsIhwJ0UAK7ilzzIa8uIqOO/aYVWHZPJkKbEL+362wrzoeRF7bw=="], + "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="], "typescript": ["typescript@5.8.2", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-aJn6wq13/afZp/jT9QZmwEjDqqvSGp1VT5GVg+f/t6/oVyrgXM6BY1h9BRh/O5p3PlUPAe+WuiEZOmb/49RqoQ=="], @@ -278,6 +313,10 @@ "use-sync-external-store": ["use-sync-external-store@1.5.0", "", { "peerDependencies": { "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" } }, "sha512-Rb46I4cGGVBmjamjphe8L/UnvJD+uPPtTkNvX5mZgqdbavhI4EbgIWJiIHXJ8bc/i9EQGPRh4DwEURJ552Do0A=="], + "webidl-conversions": ["webidl-conversions@7.0.0", "", {}, "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g=="], + + "whatwg-url": ["whatwg-url@14.2.0", "", { "dependencies": { "tr46": "^5.1.0", "webidl-conversions": "^7.0.0" } }, "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw=="], + "next/postcss": ["postcss@8.4.31", "", { "dependencies": { "nanoid": "^3.3.6", "picocolors": "^1.0.0", "source-map-js": "^1.0.2" } }, "sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ=="], } } diff --git a/React/package.json b/React/package.json index a3f4b26..0d75192 100644 --- a/React/package.json +++ b/React/package.json @@ -10,6 +10,7 @@ }, "dependencies": { "@auth0/nextjs-auth0": "^4.3.0", + "mongoose": "^8.13.1", "next": "15.2.4", "react": "^19.1.0", "react-dom": "^19.1.0" diff --git a/React/src/app/page.tsx b/React/src/app/page.tsx index fcd37b0..2a20b5d 100644 --- a/React/src/app/page.tsx +++ b/React/src/app/page.tsx @@ -40,14 +40,29 @@ export default async function Home() { 
type="submit">Set codeword {/* form for adding contacts */} -
e.preventDefault()}> + e.preventDefault()}> setContacts(e.target.value.split(","))} - placeholder="contacts (comma separated)" + placeholder="Write down an emergency contact" className="border border-gray-300 rounded-md p-2" /> + setContacts(e.target.value.split(","))} + placeholder="Write down an emergency contact" + className="border border-gray-300 rounded-md p-2" + /> + setContacts(e.target.value.split(","))} + placeholder="Write down an emergency contact" + className="border border-gray-300 rounded-md p-2" + /> +
@@ -76,14 +91,45 @@ export default async function Home() { type="submit">Set codeword {/* form for adding contacts */} -
e.preventDefault()}> - setContacts(e.target.value.split(","))} - placeholder="contacts (comma separated)" - className="border border-gray-300 rounded-md p-2" - /> + e.preventDefault()}> + setContacts(e.target.value.split(","))} + placeholder="Write down an emergency contact" + className="border border-gray-300 rounded-md p-2" + /> + setContacts(e.target.value.split(","))} + placeholder="Write down an emergency contact" + className="border border-gray-300 rounded-md p-2" + /> + setContacts(e.target.value.split(","))} + placeholder="Write down an emergency contact" + className="border border-gray-300 rounded-md p-2" + /> + setContacts(e.target.value.split(","))} + placeholder="Write down an emergency contact" + className="text-input border border-gray-300 rounded-md p-2" + /> + +
From 6152e300c000793d3f5682dda2ac1431fc03a12e Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sun, 30 Mar 2025 00:24:26 -0400 Subject: [PATCH 15/16] Demo Update 6 --- Backend/server.py | 659 +++++++++++++++++++++++----------------------- 1 file changed, 335 insertions(+), 324 deletions(-) diff --git a/Backend/server.py b/Backend/server.py index d0dee80..8ba56b4 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -1,9 +1,13 @@ import os import base64 import json +import time +import math +import gc +import logging +import numpy as np import torch import torchaudio -import numpy as np import whisperx from io import BytesIO from typing import List, Dict, Any, Optional @@ -11,290 +15,314 @@ from flask import Flask, request, send_from_directory, Response from flask_cors import CORS from flask_socketio import SocketIO, emit, disconnect from generator import load_csm_1b, Segment -import time -import gc from collections import deque from threading import Lock -# Add this at the top of your file, replacing your current CUDA setup +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger("sesame-server") -# CUDA setup with robust error handling -try: - # Handle CUDA issues - os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit to first GPU only +# CUDA Environment Setup +def setup_cuda_environment(): + """Set up CUDA environment with proper error handling""" + # Search for CUDA libraries in common locations + cuda_lib_dirs = [ + "/usr/local/cuda/lib64", + "/usr/lib/x86_64-linux-gnu", + "/usr/local/cuda/extras/CUPTI/lib64" + ] - # Try enabling TF32 precision - try: - torch.backends.cuda.matmul.allow_tf32 = True - torch.backends.cudnn.allow_tf32 = True - except: - pass # Ignore if not supported + # Add directories to LD_LIBRARY_PATH if they exist + current_ld_path = os.environ.get('LD_LIBRARY_PATH', '') + for cuda_dir in cuda_lib_dirs: + if os.path.exists(cuda_dir) and cuda_dir not in current_ld_path: + if current_ld_path: + os.environ['LD_LIBRARY_PATH'] = f"{current_ld_path}:{cuda_dir}" + else: + os.environ['LD_LIBRARY_PATH'] = cuda_dir + current_ld_path = os.environ['LD_LIBRARY_PATH'] - # Check if CUDA is available - if torch.cuda.is_available(): - try: - # Test CUDA functionality - x = torch.rand(10, device="cuda") - y = x + x - del x, y - device = "cuda" - compute_type = "float16" - print("CUDA is fully functional") - except Exception as cuda_error: - print(f"CUDA is available but not working correctly: {str(cuda_error)}") - device = "cpu" - compute_type = "int8" - else: - device = "cpu" - compute_type = "int8" -except Exception as e: - print(f"Error setting up CUDA: {str(e)}") + logger.info(f"LD_LIBRARY_PATH set to: {os.environ.get('LD_LIBRARY_PATH', 'not set')}") + + # Determine best compute device device = "cpu" compute_type = "int8" - -print(f"Using device: {device} with compute type: {compute_type}") - -# Initialize the Sesame CSM model with robust error handling -try: - print(f"Loading Sesame CSM model on {device}...") - generator = load_csm_1b(device=device) - print("Sesame CSM model loaded successfully") -except Exception as model_error: - print(f"Error loading Sesame CSM on {device}: {str(model_error)}") - if device == "cuda": - # Try on CPU as fallback - try: - print("Trying to load Sesame CSM on CPU instead...") - device = "cpu" # Update global device setting - generator = load_csm_1b(device="cpu") - print("Sesame CSM model loaded on CPU successfully") - except Exception as 
cpu_error: - print(f"Fatal error - could not load Sesame CSM model: {str(cpu_error)}") - raise RuntimeError("Failed to load speech synthesis model") - else: - # Already tried CPU and it failed - raise RuntimeError("Failed to load speech synthesis model on any device") - -# Replace the WhisperX model loading section - -# Initialize WhisperX for ASR with robust error handling -print("Loading WhisperX model...") -asr_model = None # Initialize to None first to avoid scope issues - -try: - # Always start with the tiny model on CPU for stability - asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") - print("WhisperX 'tiny' model loaded on CPU successfully") - # If CPU works, try CUDA if available - if device == "cuda": + try: + # Set CUDA preferences + os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit to first GPU only + + # Try enabling TF32 precision if available try: - print("Trying to load WhisperX on CUDA...") - cuda_model = whisperx.load_model("tiny", "cuda", compute_type="float16") - # Test the model to ensure it works - test_audio = torch.zeros(16000) # 1 second of silence at 16kHz - _ = cuda_model.transcribe(test_audio.numpy(), batch_size=1) - # If we get here, CUDA works - asr_model = cuda_model - print("WhisperX model moved to CUDA successfully") - - # Try to upgrade to small model on CUDA + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = True + except Exception as e: + logger.warning(f"Could not set advanced CUDA options: {e}") + + # Test if CUDA is functional + if torch.cuda.is_available(): try: - small_model = whisperx.load_model("small", "cuda", compute_type="float16") - # Test it - _ = small_model.transcribe(test_audio.numpy(), batch_size=1) - asr_model = small_model - print("WhisperX 'small' model loaded on CUDA successfully") - except Exception as upgrade_error: - print(f"Staying with 'tiny' model on CUDA: {str(upgrade_error)}") - except Exception as cuda_error: - print(f"CUDA loading failed, staying with CPU model: {str(cuda_error)}") -except Exception as e: - print(f"Error loading WhisperX model: {str(e)}") - # Create a minimal dummy model as last resort - class DummyModel: - def __init__(self): - self.device = "cpu" - def transcribe(self, *args, **kwargs): - return {"segments": [{"text": "Speech recognition currently unavailable."}]} + # Test basic CUDA operations + x = torch.rand(10, device="cuda") + y = x + x + del x, y + torch.cuda.empty_cache() + device = "cuda" + compute_type = "float16" + logger.info("CUDA is fully functional") + except Exception as e: + logger.warning(f"CUDA available but not working correctly: {e}") + device = "cpu" + else: + logger.info("CUDA is not available, using CPU") + except Exception as e: + logger.error(f"Error setting up computing environment: {e}") - asr_model = DummyModel() - print("WARNING: Using dummy transcription model - ASR functionality limited") + return device, compute_type -# Silence detection parameters -SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization -SILENCE_DURATION_SEC = 1.0 # How long silence must persist +# Set up the compute environment +device, compute_type = setup_cuda_environment() -# Define the base directory +# Constants and Configuration +SILENCE_THRESHOLD = 0.01 +SILENCE_DURATION_SEC = 0.75 +MAX_BUFFER_SIZE = 30 # Maximum chunks to buffer before processing +CHUNK_SIZE_MS = 500 # Size of audio chunks when streaming responses + +# Define the base directory and static files 
directory base_dir = os.path.dirname(os.path.abspath(__file__)) static_dir = os.path.join(base_dir, "static") os.makedirs(static_dir, exist_ok=True) -# Setup Flask +# Model Loading Functions +def load_speech_models(): + """Load all required speech models with fallbacks""" + # Load speech generation model (Sesame CSM) + try: + logger.info(f"Loading Sesame CSM model on {device}...") + generator = load_csm_1b(device=device) + logger.info("Sesame CSM model loaded successfully") + except Exception as e: + logger.error(f"Error loading Sesame CSM on {device}: {e}") + if device == "cuda": + try: + logger.info("Trying to load Sesame CSM on CPU instead...") + generator = load_csm_1b(device="cpu") + logger.info("Sesame CSM model loaded on CPU successfully") + except Exception as cpu_error: + logger.critical(f"Failed to load speech synthesis model: {cpu_error}") + raise RuntimeError("Failed to load speech synthesis model") + else: + raise RuntimeError("Failed to load speech synthesis model on any device") + + # Load ASR model (WhisperX) + try: + logger.info("Loading WhisperX model...") + # Start with the tiny model on CPU for reliable initialization + asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") + logger.info("WhisperX 'tiny' model loaded on CPU successfully") + + # Try upgrading to GPU if available + if device == "cuda": + try: + logger.info("Trying to load WhisperX on CUDA...") + # Test with a tiny model first + test_audio = torch.zeros(16000) # 1 second of silence + + cuda_model = whisperx.load_model("tiny", "cuda", compute_type="float16") + # Test the model with real inference + _ = cuda_model.transcribe(test_audio.numpy(), batch_size=1) + asr_model = cuda_model + logger.info("WhisperX model running on CUDA successfully") + + # Try to upgrade to small model + try: + small_model = whisperx.load_model("small", "cuda", compute_type="float16") + _ = small_model.transcribe(test_audio.numpy(), batch_size=1) + asr_model = small_model + logger.info("WhisperX 'small' model loaded on CUDA successfully") + except Exception as e: + logger.warning(f"Staying with 'tiny' model on CUDA: {e}") + except Exception as e: + logger.warning(f"CUDA loading failed, staying with CPU model: {e}") + except Exception as e: + logger.error(f"Error loading WhisperX model: {e}") + # Create a minimal dummy model as last resort + class DummyModel: + def __init__(self): + self.device = "cpu" + def transcribe(self, *args, **kwargs): + return {"segments": [{"text": "Speech recognition currently unavailable."}]} + + asr_model = DummyModel() + logger.warning("Using dummy transcription model - ASR functionality limited") + + return generator, asr_model + +# Load speech models +generator, asr_model = load_speech_models() + +# Set up Flask and Socket.IO app = Flask(__name__) CORS(app) socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet') # Socket connection management -thread = None thread_lock = Lock() active_clients = {} # Map client_id to client context -# Helper function to convert audio data +# Audio Utility Functions def decode_audio_data(audio_data: str) -> torch.Tensor: """Decode base64 audio data to a torch tensor with improved error handling""" try: # Skip empty audio data if not audio_data or len(audio_data) < 100: - print("Empty or too short audio data received") + logger.warning("Empty or too short audio data received") return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence # Extract the actual base64 content if ',' in audio_data: - # Handle data URL format 
(data:audio/wav;base64,...) audio_data = audio_data.split(',')[1] # Decode base64 audio data try: binary_data = base64.b64decode(audio_data) - print(f"Decoded base64 data: {len(binary_data)} bytes") + logger.debug(f"Decoded base64 data: {len(binary_data)} bytes") # Check if we have enough data for a valid WAV if len(binary_data) < 44: # WAV header is 44 bytes - print("Data too small to be a valid WAV file") + logger.warning("Data too small to be a valid WAV file") return torch.zeros(generator.sample_rate // 2) except Exception as e: - print(f"Base64 decoding error: {str(e)}") + logger.error(f"Base64 decoding error: {e}") return torch.zeros(generator.sample_rate // 2) - # Save for debugging - debug_path = os.path.join(base_dir, "debug_incoming.wav") - with open(debug_path, 'wb') as f: - f.write(binary_data) - print(f"Saved debug file: {debug_path}") + # Multiple approaches to handle audio data + audio_tensor = None + sample_rate = None - # Approach 1: Load directly with torchaudio + # Approach 1: Direct loading with torchaudio try: with BytesIO(binary_data) as temp_file: - temp_file.seek(0) # Ensure we're at the start of the buffer + temp_file.seek(0) audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav") - print(f"Direct loading success: shape={audio_tensor.shape}, rate={sample_rate}Hz") + logger.debug(f"Loaded audio: shape={audio_tensor.shape}, rate={sample_rate}Hz") - # Check if audio is valid + # Validate tensor if audio_tensor.numel() == 0 or torch.isnan(audio_tensor).any(): - raise ValueError("Empty or invalid audio tensor detected") + raise ValueError("Invalid audio tensor") except Exception as e: - print(f"Direct loading failed: {str(e)}") + logger.warning(f"Direct loading failed: {e}") - # Approach 2: Try to fix/normalize the WAV data + # Approach 2: Using wave module and numpy try: - # Sometimes WAV headers can be malformed, attempt to fix - temp_path = os.path.join(base_dir, "temp_fixing.wav") + temp_path = os.path.join(base_dir, f"temp_{time.time()}.wav") with open(temp_path, 'wb') as f: f.write(binary_data) - # Use a simpler numpy approach as backup - import numpy as np import wave + with wave.open(temp_path, 'rb') as wf: + n_channels = wf.getnchannels() + sample_width = wf.getsampwidth() + sample_rate = wf.getframerate() + n_frames = wf.getnframes() + frames = wf.readframes(n_frames) + + # Convert to numpy array + if sample_width == 2: # 16-bit audio + data = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0 + elif sample_width == 1: # 8-bit audio + data = np.frombuffer(frames, dtype=np.uint8).astype(np.float32) / 128.0 - 1.0 + else: + raise ValueError(f"Unsupported sample width: {sample_width}") + + # Convert to mono if needed + if n_channels > 1: + data = data.reshape(-1, n_channels) + data = data.mean(axis=1) + + # Convert to torch tensor + audio_tensor = torch.from_numpy(data) + logger.info(f"Loaded audio using wave: shape={audio_tensor.shape}") - try: - with wave.open(temp_path, 'rb') as wf: - n_channels = wf.getnchannels() - sample_width = wf.getsampwidth() - sample_rate = wf.getframerate() - n_frames = wf.getnframes() - - # Read the frames - frames = wf.readframes(n_frames) - print(f"Wave reading: channels={n_channels}, rate={sample_rate}Hz, frames={n_frames}") - - # Convert to numpy and then to torch - if sample_width == 2: # 16-bit audio - data = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0 - elif sample_width == 1: # 8-bit audio - data = np.frombuffer(frames, dtype=np.uint8).astype(np.float32) / 128.0 - 1.0 - else: 
- raise ValueError(f"Unsupported sample width: {sample_width}") - - # Convert to mono if needed - if n_channels > 1: - data = data.reshape(-1, n_channels) - data = data.mean(axis=1) - - # Convert to torch tensor - audio_tensor = torch.from_numpy(data) - print(f"Successfully converted with numpy: shape={audio_tensor.shape}") - except Exception as wave_error: - print(f"Wave processing failed: {str(wave_error)}") - # Try with torchaudio as last resort - audio_tensor, sample_rate = torchaudio.load(temp_path, format="wav") - - # Clean up + # Clean up temp file if os.path.exists(temp_path): os.remove(temp_path) + except Exception as e2: - print(f"All WAV loading methods failed: {str(e2)}") - print("Returning silence as fallback") + logger.error(f"All audio loading methods failed: {e2}") return torch.zeros(generator.sample_rate // 2) - # Ensure audio is the right shape (mono) + # Format corrections + if audio_tensor is None: + return torch.zeros(generator.sample_rate // 2) + + # Ensure audio is mono if len(audio_tensor.shape) > 1 and audio_tensor.shape[0] > 1: audio_tensor = torch.mean(audio_tensor, dim=0) - # Ensure we have a 1D tensor + # Ensure 1D tensor audio_tensor = audio_tensor.squeeze() # Resample if needed if sample_rate != generator.sample_rate: try: - print(f"Resampling from {sample_rate}Hz to {generator.sample_rate}Hz") + logger.debug(f"Resampling from {sample_rate}Hz to {generator.sample_rate}Hz") resampler = torchaudio.transforms.Resample( orig_freq=sample_rate, new_freq=generator.sample_rate ) audio_tensor = resampler(audio_tensor) except Exception as e: - print(f"Resampling error: {str(e)}") - # If resampling fails, just return the original audio - # The model can often handle different sample rates + logger.warning(f"Resampling error: {e}") # Normalize audio to avoid issues if torch.abs(audio_tensor).max() > 0: audio_tensor = audio_tensor / torch.abs(audio_tensor).max() - print(f"Final audio tensor: shape={audio_tensor.shape}, min={audio_tensor.min().item():.4f}, max={audio_tensor.max().item():.4f}") return audio_tensor except Exception as e: - print(f"Unhandled error in decode_audio_data: {str(e)}") - # Return a small silent audio segment as fallback - return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence - + logger.error(f"Unhandled error in decode_audio_data: {e}") + return torch.zeros(generator.sample_rate // 2) def encode_audio_data(audio_tensor: torch.Tensor) -> str: """Encode torch tensor audio to base64 string""" - buf = BytesIO() - torchaudio.save(buf, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav") - buf.seek(0) - audio_base64 = base64.b64encode(buf.read()).decode('utf-8') - return f"data:audio/wav;base64,{audio_base64}" - + try: + buf = BytesIO() + torchaudio.save(buf, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav") + buf.seek(0) + audio_base64 = base64.b64encode(buf.read()).decode('utf-8') + return f"data:audio/wav;base64,{audio_base64}" + except Exception as e: + logger.error(f"Error encoding audio: {e}") + # Return a minimal silent audio file + silence = torch.zeros(generator.sample_rate // 2).unsqueeze(0) + buf = BytesIO() + torchaudio.save(buf, silence, generator.sample_rate, format="wav") + buf.seek(0) + return f"data:audio/wav;base64,{base64.b64encode(buf.read()).decode('utf-8')}" def transcribe_audio(audio_tensor: torch.Tensor) -> str: """Transcribe audio using WhisperX with robust error handling""" - global asr_model # Declare global at the beginning of the function + global asr_model try: # Save 
the tensor to a temporary file - temp_path = os.path.join(base_dir, "temp_audio.wav") + temp_path = os.path.join(base_dir, f"temp_audio_{time.time()}.wav") torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate) - print(f"Transcribing audio file: {temp_path} (size: {os.path.getsize(temp_path)} bytes)") + logger.info(f"Transcribing audio file: {os.path.getsize(temp_path)} bytes") - # Load the audio file using whisperx's function + # Load the audio for WhisperX try: audio = whisperx.load_audio(temp_path) - except Exception as audio_load_error: - print(f"WhisperX load_audio failed: {str(audio_load_error)}") + except Exception as e: + logger.warning(f"WhisperX load_audio failed: {e}") # Fall back to manual loading import soundfile as sf audio, sr = sf.read(temp_path) @@ -302,59 +330,55 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str: from scipy import signal audio = signal.resample(audio, int(len(audio) * 16000 / sr)) - # Transcribe with error handling for CUDA issues + # Transcribe with error handling try: - # Try with original device - result = asr_model.transcribe(audio, batch_size=8) - except RuntimeError as cuda_error: - if "CUDA" in str(cuda_error) or "libcudnn" in str(cuda_error): - print(f"CUDA error in transcription, falling back to CPU: {str(cuda_error)}") - - # Try to load a CPU model as fallback + result = asr_model.transcribe(audio, batch_size=4) + except RuntimeError as e: + if "CUDA" in str(e) or "libcudnn" in str(e): + logger.warning(f"CUDA error in transcription, falling back to CPU: {e}") try: - # Move model to CPU and try again - asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") - result = asr_model.transcribe(audio, batch_size=1) - except Exception as e: - print(f"CPU fallback also failed: {str(e)}") + # Try CPU model + cpu_model = whisperx.load_model("tiny", "cpu", compute_type="int8") + result = cpu_model.transcribe(audio, batch_size=1) + # Update the global model if the original one is broken + asr_model = cpu_model + except Exception as cpu_e: + logger.error(f"CPU fallback failed: {cpu_e}") return "I'm having trouble processing audio right now." else: - # Re-raise if it's not a CUDA error raise + finally: + # Clean up + if os.path.exists(temp_path): + os.remove(temp_path) - # Clean up + # Extract text from segments + if result["segments"] and len(result["segments"]) > 0: + transcription = " ".join([segment["text"] for segment in result["segments"]]) + logger.info(f"Transcription: '{transcription.strip()}'") + return transcription.strip() + + return "" + except Exception as e: + logger.error(f"Error in transcription: {e}") if os.path.exists(temp_path): os.remove(temp_path) - - # Get the transcription text - if result["segments"] and len(result["segments"]) > 0: - # Combine all segments - transcription = " ".join([segment["text"] for segment in result["segments"]]) - print(f"Transcription successful: '{transcription.strip()}'") - return transcription.strip() - else: - print("Transcription returned no segments") - return "" - except Exception as e: - print(f"Error in transcription: {str(e)}") - import traceback - traceback.print_exc() - if os.path.exists("temp_audio.wav"): - os.remove("temp_audio.wav") return "I heard something but couldn't understand it." 
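# A quick smoke test for the audio helpers above, assuming this file is importable
# as `server` without starting the web server (model loading still runs at import
# time, so this is slow).
import torch
import server

silence = torch.zeros(server.generator.sample_rate)   # 1 second of silence
data_url = server.encode_audio_data(silence)          # tensor -> base64 WAV data URL
roundtrip = server.decode_audio_data(data_url)        # data URL -> 1-D tensor
assert roundtrip.dim() == 1 and roundtrip.numel() > 0
print(repr(server.transcribe_audio(silence)))         # likely "" (or a fallback message) for silence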
- def generate_response(text: str, conversation_history: List[Segment]) -> str: """Generate a contextual response based on the transcribed text""" - # Simple response logic - can be replaced with a more sophisticated LLM in the future + # Simple response logic - can be replaced with a more sophisticated LLM responses = { - "hello": "Hello there! How are you doing today?", + "hello": "Hello there! How can I help you today?", + "hi": "Hi there! What can I do for you?", "how are you": "I'm doing well, thanks for asking! How about you?", "what is your name": "I'm Sesame, your voice assistant. How can I help you?", + "who are you": "I'm Sesame, an AI voice assistant. I'm here to chat with you!", "bye": "Goodbye! It was nice chatting with you.", "thank you": "You're welcome! Is there anything else I can help with?", "weather": "I don't have real-time weather data, but I hope it's nice where you are!", "help": "I can chat with you using natural voice. Just speak normally and I'll respond.", + "what can you do": "I can have a conversation with you, answer questions, and provide assistance with various topics.", } text_lower = text.lower() @@ -372,7 +396,7 @@ def generate_response(text: str, conversation_history: List[Segment]) -> str: else: return f"I understand you said '{text}'. That's interesting! Can you tell me more about that?" -# Flask routes for serving static content +# Flask Routes @app.route('/') def index(): return send_from_directory(base_dir, 'index.html') @@ -391,11 +415,11 @@ def voice_chat_js(): def serve_static(path): return send_from_directory(static_dir, path) -# Socket.IO event handlers +# Socket.IO Event Handlers @socketio.on('connect') def handle_connect(): client_id = request.sid - print(f"Client connected: {client_id}") + logger.info(f"Client connected: {client_id}") # Initialize client context active_clients[client_id] = { @@ -414,7 +438,7 @@ def handle_disconnect(): client_id = request.sid if client_id in active_clients: del active_clients[client_id] - print(f"Client disconnected: {client_id}") + logger.info(f"Client disconnected: {client_id}") @socketio.on('generate') def handle_generate(data): @@ -427,7 +451,7 @@ def handle_generate(data): text = data.get('text', '') speaker_id = data.get('speaker', 0) - print(f"Generating audio for: '{text}' with speaker {speaker_id}") + logger.info(f"Generating audio for: '{text}' with speaker {speaker_id}") # Generate audio response audio_tensor = generator.generate( @@ -446,11 +470,12 @@ def handle_generate(data): audio_base64 = encode_audio_data(audio_tensor) emit('audio_response', { 'type': 'audio_response', - 'audio': audio_base64 + 'audio': audio_base64, + 'text': text }) except Exception as e: - print(f"Error generating audio: {str(e)}") + logger.error(f"Error generating audio: {e}") emit('error', { 'type': 'error', 'message': f"Error generating audio: {str(e)}" @@ -482,7 +507,7 @@ def handle_add_to_context(data): }) except Exception as e: - print(f"Error adding to context: {str(e)}") + logger.error(f"Error adding to context: {e}") emit('error', { 'type': 'error', 'message': f"Error processing audio: {str(e)}" @@ -512,6 +537,11 @@ def handle_stream_audio(data): speaker_id = data.get('speaker', 0) audio_data = data.get('audio', '') + # Skip if no audio data (might be just a connection test) + if not audio_data: + logger.debug("Empty audio data received, ignoring") + return + # Convert received audio to tensor audio_chunk = decode_audio_data(audio_data) @@ -522,7 +552,7 @@ def handle_stream_audio(data): 
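On the wire, the 'generate' handler above is just a Socket.IO event carrying text and a speaker id, answered by an 'audio_response' event whose audio field is the data URL built by encode_audio_data. A minimal test client sketch, assuming the python-socketio client package and the default port 5000:

import base64

import socketio  # pip install "python-socketio[client]"

sio = socketio.Client()

@sio.on("audio_response")
def on_audio_response(data):
    # data["audio"] is a "data:audio/wav;base64,..." URL
    wav_bytes = base64.b64decode(data["audio"].split(",", 1)[1])
    with open("reply.wav", "wb") as f:
        f.write(wav_bytes)
    print(f"Received {len(wav_bytes)} bytes for: {data.get('text')!r}")
    sio.disconnect()

@sio.on("error")
def on_error(data):
    print("Server error:", data.get("message"))
    sio.disconnect()

sio.connect("http://localhost:5000")
sio.emit("generate", {"text": "Hello from the test client", "speaker": 0})
sio.wait()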
client['energy_window'].clear() client['is_silence'] = False client['last_active_time'] = time.time() - print(f"[{client_id}] Streaming started with speaker ID: {speaker_id}") + logger.info(f"[{client_id[:8]}] Streaming started with speaker ID: {speaker_id}") emit('streaming_status', { 'type': 'streaming_status', 'status': 'started' @@ -553,52 +583,74 @@ def handle_stream_audio(data): if client['is_silence'] and silence_elapsed >= SILENCE_DURATION_SEC and len(client['streaming_buffer']) > 0: # User has stopped talking - process the collected audio - print(f"[{client_id}] Processing audio after {silence_elapsed:.2f}s of silence") + logger.info(f"[{client_id[:8]}] Processing audio after {silence_elapsed:.2f}s of silence") + process_complete_utterance(client_id, client, speaker_id) + + # If buffer gets too large without silence, process it anyway + elif len(client['streaming_buffer']) >= MAX_BUFFER_SIZE: + logger.info(f"[{client_id[:8]}] Processing long audio segment without silence") + process_complete_utterance(client_id, client, speaker_id, is_incomplete=True) - full_audio = torch.cat(client['streaming_buffer'], dim=0) + # Keep half of the buffer for context (sliding window approach) + half_point = len(client['streaming_buffer']) // 2 + client['streaming_buffer'] = client['streaming_buffer'][half_point:] - # Process with WhisperX speech-to-text - print(f"[{client_id}] Starting transcription with WhisperX...") - transcribed_text = transcribe_audio(full_audio) + except Exception as e: + import traceback + traceback.print_exc() + logger.error(f"Error processing streaming audio: {e}") + emit('error', { + 'type': 'error', + 'message': f"Error processing streaming audio: {str(e)}" + }) + +def process_complete_utterance(client_id, client, speaker_id, is_incomplete=False): + """Process a complete utterance (after silence or buffer limit)""" + try: + # Combine audio chunks + full_audio = torch.cat(client['streaming_buffer'], dim=0) + + # Process with speech-to-text + logger.info(f"[{client_id[:8]}] Starting transcription...") + transcribed_text = transcribe_audio(full_audio) + + # Add suffix for incomplete utterances + if is_incomplete: + transcribed_text += " (processing continued speech...)" + + # Log the transcription + logger.info(f"[{client_id[:8]}] Transcribed: '{transcribed_text}'") + + # Handle the transcription result + if transcribed_text: + # Add user message to context + user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) + client['context_segments'].append(user_segment) - # Log the transcription - print(f"[{client_id}] Transcribed text: '{transcribed_text}'") + # Send the transcribed text to client + emit('transcription', { + 'type': 'transcription', + 'text': transcribed_text + }, room=client_id) - # Handle the transcription result - if transcribed_text: - # Add user message to context - user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) - client['context_segments'].append(user_segment) - - # Send the transcribed text to client - emit('transcription', { - 'type': 'transcription', - 'text': transcribed_text - }) - + # Only generate a response if this is a complete utterance + if not is_incomplete: # Generate a contextual response response_text = generate_response(transcribed_text, client['context_segments']) - print(f"[{client_id}] Generating audio response: '{response_text}'") + logger.info(f"[{client_id[:8]}] Generating response: '{response_text}'") # Let the client know we're processing emit('processing_status', { 
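The silence-based endpointing driven by SILENCE_THRESHOLD and SILENCE_DURATION_SEC reads as a small state machine over chunk energies; a sketch with illustrative thresholds (the real constants sit near the top of server.py):

import time
from collections import deque

import torch

class UtteranceEndpointer:
    """Track recent chunk energies and report when the speaker has gone quiet."""

    def __init__(self, silence_threshold=0.01, silence_duration_sec=1.0, window=10):
        self.silence_threshold = silence_threshold
        self.silence_duration_sec = silence_duration_sec
        self.energy_window = deque(maxlen=window)
        self.last_active_time = time.time()

    def feed(self, chunk: torch.Tensor) -> bool:
        """Add one streamed chunk; return True once the utterance looks finished."""
        self.energy_window.append(torch.mean(torch.abs(chunk)).item())
        avg_energy = sum(self.energy_window) / len(self.energy_window)
        if avg_energy > self.silence_threshold:
            self.last_active_time = time.time()      # still talking
            return False
        # Quiet chunk: finished only after a sustained stretch of silence
        return (time.time() - self.last_active_time) >= self.silence_duration_sec

Each streamed chunk would be appended to the buffer and fed in; once feed() returns True, the buffer is concatenated and handed to process_complete_utterance.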
'type': 'processing_status', 'status': 'generating_audio', 'message': 'Generating audio response...' - }) + }, room=client_id) # Generate audio for the response try: # Use a different speaker than the user ai_speaker_id = 1 if speaker_id == 0 else 0 - # Start audio generation with streaming (chunk by chunk) - audio_chunks = [] - - # This version tries to stream the audio generation in smaller chunks - # Note: CSM model doesn't natively support incremental generation, - # so we're simulating it here for a more responsive UI experience - # Generate the full response audio_tensor = generator.generate( text=response_text, @@ -621,60 +673,37 @@ def handle_stream_audio(data): 'type': 'audio_response', 'text': response_text, 'audio': audio_base64 - }) + }, room=client_id) - print(f"[{client_id}] Audio response sent: {len(audio_base64)} bytes") + logger.info(f"[{client_id[:8]}] Audio response sent") - except Exception as gen_error: - print(f"Error generating audio response: {str(gen_error)}") + except Exception as e: + logger.error(f"Error generating audio response: {e}") emit('error', { 'type': 'error', 'message': "Sorry, there was an error generating the audio response." - }) - else: - # If transcription failed, send a generic response - emit('error', { - 'type': 'error', - 'message': "Sorry, I couldn't understand what you said. Could you try again?" - }) - - # Clear buffer and reset silence detection + }, room=client_id) + else: + # If transcription failed, send a notification + emit('error', { + 'type': 'error', + 'message': "Sorry, I couldn't understand what you said. Could you try again?" + }, room=client_id) + + # Only clear buffer for complete utterances + if not is_incomplete: + # Reset state client['streaming_buffer'] = [] client['energy_window'].clear() client['is_silence'] = False client['last_active_time'] = time.time() - - # If buffer gets too large without silence, process it anyway - elif len(client['streaming_buffer']) >= 30: # ~6 seconds of audio at 5 chunks/sec - print(f"[{client_id}] Processing long audio segment without silence") - full_audio = torch.cat(client['streaming_buffer'], dim=0) - - # Process with WhisperX speech-to-text - transcribed_text = transcribe_audio(full_audio) - - if transcribed_text: - client['context_segments'].append( - Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) - ) - - # Send the transcribed text to client - emit('transcription', { - 'type': 'transcription', - 'text': transcribed_text + " (processing continued speech...)" - }) - - # Keep half of the buffer for context (sliding window approach) - half_point = len(client['streaming_buffer']) // 2 - client['streaming_buffer'] = client['streaming_buffer'][half_point:] except Exception as e: - import traceback - traceback.print_exc() - print(f"Error processing streaming audio: {str(e)}") + logger.error(f"Error processing utterance: {e}") emit('error', { 'type': 'error', - 'message': f"Error processing streaming audio: {str(e)}" - }) + 'message': f"Error processing audio: {str(e)}" + }, room=client_id) @socketio.on('stop_streaming') def handle_stop_streaming(data): @@ -687,21 +716,8 @@ def handle_stop_streaming(data): if client['streaming_buffer'] and len(client['streaming_buffer']) > 5: # Process any remaining audio in the buffer - full_audio = torch.cat(client['streaming_buffer'], dim=0) - - # Process with WhisperX speech-to-text - transcribed_text = transcribe_audio(full_audio) - - if transcribed_text: - client['context_segments'].append( - Segment(text=transcribed_text, 
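Stripped of the Socket.IO plumbing, the response path is a single generator.generate call followed by the same WAV encoding as above. A minimal offline sketch using the keyword arguments visible in the handlers (max_audio_length_ms is an assumed knob, not confirmed by this diff):

import torch
import torchaudio

from generator import load_csm_1b, Segment  # Segment is what a non-empty context holds

device = "cuda" if torch.cuda.is_available() else "cpu"
generator = load_csm_1b(device=device)

audio = generator.generate(
    text="Hello there! How can I help you today?",
    speaker=1,                    # a different speaker id than the user
    context=[],                   # real calls pass the accumulated Segment list
    max_audio_length_ms=10_000,   # assumed parameter, kept short for a quick test
)

torchaudio.save("reply.wav", audio.unsqueeze(0).cpu(), generator.sample_rate)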
speaker=data.get("speaker", 0), audio=full_audio) - ) - - # Send the transcribed text to client - emit('transcription', { - 'type': 'transcription', - 'text': transcribed_text - }) + logger.info(f"[{client_id[:8]}] Processing final audio buffer on stop") + process_complete_utterance(client_id, client, data.get("speaker", 0)) client['streaming_buffer'] = [] emit('streaming_status', { @@ -709,18 +725,18 @@ def handle_stop_streaming(data): 'status': 'stopped' }) -def stream_audio_to_client(client_id, audio_tensor, text, speaker_id, chunk_size_ms=500): +def stream_audio_to_client(client_id, audio_tensor, text, speaker_id, chunk_size_ms=CHUNK_SIZE_MS): """Stream audio to client in chunks to simulate real-time generation""" try: if client_id not in active_clients: - print(f"Client {client_id} not found for streaming") + logger.warning(f"Client {client_id} not found for streaming") return # Calculate chunk size in samples chunk_size = int(generator.sample_rate * chunk_size_ms / 1000) total_chunks = math.ceil(audio_tensor.size(0) / chunk_size) - print(f"Streaming audio in {total_chunks} chunks of {chunk_size_ms}ms each") + logger.info(f"Streaming audio in {total_chunks} chunks of {chunk_size_ms}ms each") # Send initial response with text but no audio yet socketio.emit('audio_response_start', { @@ -758,29 +774,24 @@ def stream_audio_to_client(client_id, audio_tensor, text, speaker_id, chunk_size 'text': text }, room=client_id) - print(f"Audio streaming complete: {total_chunks} chunks sent") + logger.info(f"Audio streaming complete: {total_chunks} chunks sent") except Exception as e: - print(f"Error streaming audio to client: {str(e)}") + logger.error(f"Error streaming audio to client: {e}") import traceback traceback.print_exc() +# Main server start if __name__ == "__main__": print(f"\n{'='*60}") - print(f"🔊 Sesame AI Voice Chat Server (Flask Implementation)") + print(f"🔊 Sesame AI Voice Chat Server") print(f"{'='*60}") print(f"📡 Server Information:") print(f" - Local URL: http://localhost:5000") print(f" - Network URL: http://:5000") - print(f" - WebSocket: ws://:5000/socket.io") - print(f"{'='*60}") - print(f"💡 To make this server public:") - print(f" 1. Ensure port 5000 is open in your firewall") - print(f" 2. Set up port forwarding on your router to port 5000") - print(f" 3. Or use a service like ngrok with: ngrok http 5000") print(f"{'='*60}") print(f"🌐 Device: {device.upper()}") - print(f"🧠 Models loaded: Sesame CSM + WhisperX ({asr_model.device})") + print(f"🧠 Models: Sesame CSM + WhisperX ASR") print(f"🔧 Serving from: {os.path.join(base_dir, 'index.html')}") print(f"{'='*60}") print(f"Ready to receive connections! 
Press Ctrl+C to stop the server.\n") From 8592257cdc5c073bdf88b2dd0311ad0c2af0b957 Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sun, 30 Mar 2025 00:33:14 -0400 Subject: [PATCH 16/16] Demo Update 7 --- Backend/server.py | 254 ++++++++++++++-------------------------------- 1 file changed, 77 insertions(+), 177 deletions(-) diff --git a/Backend/server.py b/Backend/server.py index 8ba56b4..8f4e278 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -8,7 +8,6 @@ import logging import numpy as np import torch import torchaudio -import whisperx from io import BytesIO from typing import List, Dict, Any, Optional from flask import Flask, request, send_from_directory, Response @@ -25,68 +24,24 @@ logging.basicConfig( ) logger = logging.getLogger("sesame-server") -# CUDA Environment Setup -def setup_cuda_environment(): - """Set up CUDA environment with proper error handling""" - # Search for CUDA libraries in common locations - cuda_lib_dirs = [ - "/usr/local/cuda/lib64", - "/usr/lib/x86_64-linux-gnu", - "/usr/local/cuda/extras/CUPTI/lib64" - ] - - # Add directories to LD_LIBRARY_PATH if they exist - current_ld_path = os.environ.get('LD_LIBRARY_PATH', '') - for cuda_dir in cuda_lib_dirs: - if os.path.exists(cuda_dir) and cuda_dir not in current_ld_path: - if current_ld_path: - os.environ['LD_LIBRARY_PATH'] = f"{current_ld_path}:{cuda_dir}" - else: - os.environ['LD_LIBRARY_PATH'] = cuda_dir - current_ld_path = os.environ['LD_LIBRARY_PATH'] - - logger.info(f"LD_LIBRARY_PATH set to: {os.environ.get('LD_LIBRARY_PATH', 'not set')}") - - # Determine best compute device - device = "cpu" - compute_type = "int8" - +# Determine best compute device +if torch.backends.mps.is_available(): + device = "mps" +elif torch.cuda.is_available(): try: - # Set CUDA preferences - os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit to first GPU only - - # Try enabling TF32 precision if available - try: - torch.backends.cuda.matmul.allow_tf32 = True - torch.backends.cudnn.allow_tf32 = True - torch.backends.cudnn.enabled = True - torch.backends.cudnn.benchmark = True - except Exception as e: - logger.warning(f"Could not set advanced CUDA options: {e}") - - # Test if CUDA is functional - if torch.cuda.is_available(): - try: - # Test basic CUDA operations - x = torch.rand(10, device="cuda") - y = x + x - del x, y - torch.cuda.empty_cache() - device = "cuda" - compute_type = "float16" - logger.info("CUDA is fully functional") - except Exception as e: - logger.warning(f"CUDA available but not working correctly: {e}") - device = "cpu" - else: - logger.info("CUDA is not available, using CPU") + # Test CUDA functionality + torch.rand(10, device="cuda") + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + torch.backends.cudnn.benchmark = True + device = "cuda" + logger.info("CUDA is fully functional") except Exception as e: - logger.error(f"Error setting up computing environment: {e}") - - return device, compute_type - -# Set up the compute environment -device, compute_type = setup_cuda_environment() + logger.warning(f"CUDA available but not working correctly: {e}") + device = "cpu" +else: + device = "cpu" + logger.info("Using CPU") # Constants and Configuration SILENCE_THRESHOLD = 0.01 @@ -99,9 +54,37 @@ base_dir = os.path.dirname(os.path.abspath(__file__)) static_dir = os.path.join(base_dir, "static") os.makedirs(static_dir, exist_ok=True) +# Define a simple energy-based speech detector +class SpeechDetector: + def __init__(self): + self.min_speech_energy = 0.01 + self.speech_window = 0.2 
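The device probe above matters because torch.cuda.is_available() can report True on machines whose driver or cudnn install still fails at kernel launch; factored into a helper, the same check looks roughly like this:

import logging

import torch

logger = logging.getLogger("sesame-server")

def pick_device() -> str:
    """Prefer MPS, then CUDA (only if a real kernel launch succeeds), else CPU."""
    if torch.backends.mps.is_available():
        return "mps"
    if torch.cuda.is_available():
        try:
            x = torch.rand(10, device="cuda")
            _ = (x + x).sum().item()      # force a kernel launch and a device sync
            torch.cuda.empty_cache()
            return "cuda"
        except Exception as e:
            logger.warning("CUDA advertised but unusable, falling back to CPU: %s", e)
    return "cpu"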
# seconds + + def detect_speech(self, audio_tensor, sample_rate): + # Calculate frame size based on window size + frame_size = int(sample_rate * self.speech_window) + + # If audio is shorter than frame size, use the entire audio + if audio_tensor.shape[0] < frame_size: + frames = [audio_tensor] + else: + # Split audio into frames + frames = [audio_tensor[i:i+frame_size] for i in range(0, len(audio_tensor), frame_size)] + + # Calculate energy per frame + energies = [torch.mean(frame**2).item() for frame in frames] + + # Determine if there's speech based on energy threshold + has_speech = any(e > self.min_speech_energy for e in energies) + + return has_speech + +speech_detector = SpeechDetector() +logger.info("Initialized simple speech detector") + # Model Loading Functions def load_speech_models(): - """Load all required speech models with fallbacks""" + """Load speech generation model""" # Load speech generation model (Sesame CSM) try: logger.info(f"Loading Sesame CSM model on {device}...") @@ -120,52 +103,10 @@ def load_speech_models(): else: raise RuntimeError("Failed to load speech synthesis model on any device") - # Load ASR model (WhisperX) - try: - logger.info("Loading WhisperX model...") - # Start with the tiny model on CPU for reliable initialization - asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") - logger.info("WhisperX 'tiny' model loaded on CPU successfully") - - # Try upgrading to GPU if available - if device == "cuda": - try: - logger.info("Trying to load WhisperX on CUDA...") - # Test with a tiny model first - test_audio = torch.zeros(16000) # 1 second of silence - - cuda_model = whisperx.load_model("tiny", "cuda", compute_type="float16") - # Test the model with real inference - _ = cuda_model.transcribe(test_audio.numpy(), batch_size=1) - asr_model = cuda_model - logger.info("WhisperX model running on CUDA successfully") - - # Try to upgrade to small model - try: - small_model = whisperx.load_model("small", "cuda", compute_type="float16") - _ = small_model.transcribe(test_audio.numpy(), batch_size=1) - asr_model = small_model - logger.info("WhisperX 'small' model loaded on CUDA successfully") - except Exception as e: - logger.warning(f"Staying with 'tiny' model on CUDA: {e}") - except Exception as e: - logger.warning(f"CUDA loading failed, staying with CPU model: {e}") - except Exception as e: - logger.error(f"Error loading WhisperX model: {e}") - # Create a minimal dummy model as last resort - class DummyModel: - def __init__(self): - self.device = "cpu" - def transcribe(self, *args, **kwargs): - return {"segments": [{"text": "Speech recognition currently unavailable."}]} - - asr_model = DummyModel() - logger.warning("Using dummy transcription model - ASR functionality limited") - - return generator, asr_model + return generator -# Load speech models -generator, asr_model = load_speech_models() +# Load speech model +generator = load_speech_models() # Set up Flask and Socket.IO app = Flask(__name__) @@ -307,63 +248,23 @@ def encode_audio_data(audio_tensor: torch.Tensor) -> str: buf.seek(0) return f"data:audio/wav;base64,{base64.b64encode(buf.read()).decode('utf-8')}" -def transcribe_audio(audio_tensor: torch.Tensor) -> str: - """Transcribe audio using WhisperX with robust error handling""" - global asr_model +def process_speech(audio_tensor: torch.Tensor, client_id: str) -> str: + """Process speech and return a simple response""" + # In this simplified version, we'll just check if there's sound + # and provide basic responses instead of doing actual 
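A quick sanity check of the detector is to feed it pure silence and a synthetic tone whose per-frame energy clears the 0.01 threshold; a sketch that assumes the SpeechDetector class defined above is importable or pasted alongside:

import math

import torch

sr = 24_000                                            # stand-in for generator.sample_rate
silence = torch.zeros(sr)                              # 1 s of silence
t = torch.arange(sr, dtype=torch.float32) / sr
tone = 0.2 * torch.sin(2 * math.pi * 220.0 * t)        # mean frame energy ~0.02

detector = SpeechDetector()
print(detector.detect_speech(silence, sr))  # False: every frame energy is 0.0
print(detector.detect_speech(tone, sr))     # True: 0.02 > min_speech_energy (0.01)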
speech recognition - try: - # Save the tensor to a temporary file - temp_path = os.path.join(base_dir, f"temp_audio_{time.time()}.wav") - torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate) + if speech_detector and speech_detector.detect_speech(audio_tensor, generator.sample_rate): + # Generate a response based on audio energy + energy = torch.mean(torch.abs(audio_tensor)).item() - logger.info(f"Transcribing audio file: {os.path.getsize(temp_path)} bytes") - - # Load the audio for WhisperX - try: - audio = whisperx.load_audio(temp_path) - except Exception as e: - logger.warning(f"WhisperX load_audio failed: {e}") - # Fall back to manual loading - import soundfile as sf - audio, sr = sf.read(temp_path) - if sr != 16000: # WhisperX expects 16kHz audio - from scipy import signal - audio = signal.resample(audio, int(len(audio) * 16000 / sr)) - - # Transcribe with error handling - try: - result = asr_model.transcribe(audio, batch_size=4) - except RuntimeError as e: - if "CUDA" in str(e) or "libcudnn" in str(e): - logger.warning(f"CUDA error in transcription, falling back to CPU: {e}") - try: - # Try CPU model - cpu_model = whisperx.load_model("tiny", "cpu", compute_type="int8") - result = cpu_model.transcribe(audio, batch_size=1) - # Update the global model if the original one is broken - asr_model = cpu_model - except Exception as cpu_e: - logger.error(f"CPU fallback failed: {cpu_e}") - return "I'm having trouble processing audio right now." - else: - raise - finally: - # Clean up - if os.path.exists(temp_path): - os.remove(temp_path) - - # Extract text from segments - if result["segments"] and len(result["segments"]) > 0: - transcription = " ".join([segment["text"] for segment in result["segments"]]) - logger.info(f"Transcription: '{transcription.strip()}'") - return transcription.strip() - - return "" - except Exception as e: - logger.error(f"Error in transcription: {e}") - if os.path.exists(temp_path): - os.remove(temp_path) - return "I heard something but couldn't understand it." + if energy > 0.1: # Louder speech + return "I heard you speaking clearly. How can I help you today?" + elif energy > 0.05: # Moderate speech + return "I heard you say something. Could you please repeat that?" + else: # Soft speech + return "I detected some speech, but it was quite soft. Could you speak up a bit?" + else: + return "I didn't detect any speech. Could you please try again?" def generate_response(text: str, conversation_history: List[Segment]) -> str: """Generate a contextual response based on the transcribed text""" @@ -394,7 +295,7 @@ def generate_response(text: str, conversation_history: List[Segment]) -> str: elif len(text) < 10: return "Thanks for your message. Could you elaborate a bit more?" else: - return f"I understand you said '{text}'. That's interesting! Can you tell me more about that?" + return f"I heard you speaking. That's interesting! Can you tell me more about that?" 
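generate_response stays a plain keyword lookup over the lowercased text, which keeps it trivial to unit-test or later swap for an LLM call; a trimmed, self-contained version of that matcher:

RESPONSES = {
    "hello": "Hello there! How can I help you today?",
    "bye": "Goodbye! It was nice chatting with you.",
    "help": "I can chat with you using natural voice. Just speak normally and I'll respond.",
}

def keyword_response(text: str) -> str:
    text_lower = text.lower()
    for key, response in RESPONSES.items():
        if key in text_lower:       # first matching keyword wins
            return response
    if not text:
        return "I didn't catch that. Could you please repeat?"
    if len(text) < 10:
        return "Thanks for your message. Could you elaborate a bit more?"
    return "I heard you speaking. That's interesting! Can you tell me more about that?"

assert keyword_response("Well, hello there") == RESPONSES["hello"]
assert keyword_response("") == "I didn't catch that. Could you please repeat?"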
# Flask Routes @app.route('/') @@ -610,33 +511,32 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=Fals # Combine audio chunks full_audio = torch.cat(client['streaming_buffer'], dim=0) - # Process with speech-to-text - logger.info(f"[{client_id[:8]}] Starting transcription...") - transcribed_text = transcribe_audio(full_audio) + # Process audio to generate a response (no speech recognition) + generated_text = process_speech(full_audio, client_id) # Add suffix for incomplete utterances if is_incomplete: - transcribed_text += " (processing continued speech...)" + generated_text += " (processing continued speech...)" - # Log the transcription - logger.info(f"[{client_id[:8]}] Transcribed: '{transcribed_text}'") + # Log the generated text + logger.info(f"[{client_id[:8]}] Generated text: '{generated_text}'") - # Handle the transcription result - if transcribed_text: + # Handle the result + if generated_text: # Add user message to context - user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) + user_segment = Segment(text=generated_text, speaker=speaker_id, audio=full_audio) client['context_segments'].append(user_segment) - # Send the transcribed text to client + # Send the text to client emit('transcription', { 'type': 'transcription', - 'text': transcribed_text + 'text': generated_text }, room=client_id) # Only generate a response if this is a complete utterance if not is_incomplete: # Generate a contextual response - response_text = generate_response(transcribed_text, client['context_segments']) + response_text = generate_response(generated_text, client['context_segments']) logger.info(f"[{client_id[:8]}] Generating response: '{response_text}'") # Let the client know we're processing @@ -684,7 +584,7 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=Fals 'message': "Sorry, there was an error generating the audio response." }, room=client_id) else: - # If transcription failed, send a notification + # If processing failed, send a notification emit('error', { 'type': 'error', 'message': "Sorry, I couldn't understand what you said. Could you try again?" @@ -791,7 +691,7 @@ if __name__ == "__main__": print(f" - Network URL: http://:5000") print(f"{'='*60}") print(f"🌐 Device: {device.upper()}") - print(f"🧠 Models: Sesame CSM + WhisperX ASR") + print(f"🧠 Models: Sesame CSM (TTS only)") print(f"🔧 Serving from: {os.path.join(base_dir, 'index.html')}") print(f"{'='*60}") print(f"Ready to receive connections! Press Ctrl+C to stop the server.\n")
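Conversation state throughout these handlers is just a growing list of Segment(text, speaker, audio) objects passed back into generator.generate as context. A compact sketch of that bookkeeping, assuming the same Segment constructor and generate keywords used above (the canned reply text is illustrative):

import torch

from generator import load_csm_1b, Segment

generator = load_csm_1b(device="cpu")
context = []                              # grows across turns; the real server caps it

def ai_turn(user_text: str, user_audio: torch.Tensor, user_speaker: int = 0) -> torch.Tensor:
    # Record the user's turn first so the model hears it as context
    context.append(Segment(text=user_text, speaker=user_speaker, audio=user_audio))
    ai_speaker = 1 if user_speaker == 0 else 0
    reply_text = "Thanks for your message. Could you elaborate a bit more?"
    reply_audio = generator.generate(text=reply_text, speaker=ai_speaker, context=context)
    context.append(Segment(text=reply_text, speaker=ai_speaker, audio=reply_audio))
    return reply_audio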