Server and Client Side update
@@ -1,9 +1,13 @@
<!-- /Backend/index.html -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Sesame AI Voice Chat</title>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
<!-- Socket.IO client library -->
<script src="https://cdn.socket.io/4.6.0/socket.io.min.js"></script>
<style>
body {
font-family: 'Arial', sans-serif;
@@ -11,6 +15,12 @@
margin: 0 auto;
padding: 20px;
background-color: #f9f9f9;
color: #333;
}
h1 {
text-align: center;
margin-bottom: 20px;
color: #1a73e8;
}
.conversation {
border: 1px solid #ddd;
@@ -21,6 +31,7 @@
margin-bottom: 20px;
background-color: white;
box-shadow: 0 2px 10px rgba(0,0,0,0.05);
scroll-behavior: smooth;
}
.message {
margin-bottom: 15px;
@@ -28,6 +39,7 @@
border-radius: 12px;
max-width: 80%;
line-height: 1.4;
animation: message-appear 0.3s ease-out;
}
.user {
background-color: #e3f2fd;
@@ -55,6 +67,7 @@
gap: 15px;
justify-content: center;
align-items: center;
margin-bottom: 15px;
}
button {
padding: 12px 24px;
@@ -66,11 +79,20 @@
font-weight: bold;
transition: all 0.2s ease;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
}
button:hover {
background-color: #45a049;
box-shadow: 0 4px 8px rgba(0,0,0,0.15);
}
button:disabled {
background-color: #cccccc;
cursor: not-allowed;
opacity: 0.7;
}
.recording {
background-color: #f44336;
animation: pulse 1.5s infinite;
@@ -94,6 +116,10 @@
50% { opacity: 0.7; }
100% { opacity: 1; }
}
@keyframes message-appear {
from { opacity: 0; transform: translateY(10px); }
to { opacity: 1; transform: translateY(0); }
}
.status-indicator {
display: flex;
align-items: center;
@@ -106,6 +132,7 @@
height: 10px;
border-radius: 50%;
background-color: #ccc;
transition: background-color 0.3s ease;
}
.status-dot.active {
background-color: #4CAF50;
@@ -117,6 +144,7 @@
audio {
width: 100%;
margin-top: 5px;
border-radius: 8px;
}
.visualizer-container {
width: 100%;
@@ -126,14 +154,13 @@
margin-bottom: 15px;
overflow: hidden;
position: relative;
box-shadow: inset 0 1px 3px rgba(0,0,0,0.1);
}

.audio-visualizer {
width: 100%;
height: 100%;
display: block;
}

.visualizer-label {
position: absolute;
top: 50%;
@@ -145,6 +172,21 @@
opacity: 0.7;
text-align: center;
width: 100%;
transition: opacity 0.3s ease;
}
.conversation::-webkit-scrollbar {
width: 8px;
}
.conversation::-webkit-scrollbar-track {
background: #f1f1f1;
border-radius: 10px;
}
.conversation::-webkit-scrollbar-thumb {
background: #ccc;
border-radius: 10px;
}
.conversation::-webkit-scrollbar-thumb:hover {
background: #aaa;
}
</style>
</head>
@@ -162,8 +204,8 @@
<option value="0">Speaker 0</option>
<option value="1">Speaker 1</option>
</select>
<button id="streamButton">Start Conversation</button>
<button id="clearButton">Clear Chat</button>
<button id="streamButton"><i class="fas fa-microphone"></i> Start Conversation</button>
<button id="clearButton"><i class="fas fa-trash"></i> Clear Chat</button>
</div>

<div class="status-indicator">
@@ -173,7 +215,7 @@

<script>
// Variables
let ws;
let socket;
let audioContext;
let streamProcessor;
let isStreaming = false;
@@ -184,14 +226,13 @@
const CLIENT_SILENCE_THRESHOLD = 0.01;
const CLIENT_SILENCE_DURATION_MS = 1000; // 1 second

// Add these variables with your existing ones
// Visualizer variables
let analyser;
let visualizerCanvas;
let canvasContext;
let visualizerBufferLength;
let visualizerDataArray;
let visualizerAnimationFrame;
const visualizerLabel = document.getElementById('visualizerLabel');

// DOM elements
const conversationEl = document.getElementById('conversation');
@@ -200,93 +241,150 @@
const clearButton = document.getElementById('clearButton');
const statusDot = document.getElementById('statusDot');
const statusText = document.getElementById('statusText');
const visualizerLabel = document.getElementById('visualizerLabel');

// Initialize on page load
window.addEventListener('load', () => {
connectWebSocket();
// Initialize audio context
setupAudioContext();

// Setup visualization
setupVisualizer();

// Event listeners
// Connect to Socket.IO server
connectSocketIO();

// Add event listeners
streamButton.addEventListener('click', toggleStreaming);
clearButton.addEventListener('click', clearConversation);
});

// Setup audio context for streaming
// Setup audio context
function setupAudioContext() {
try {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
console.log('Audio context setup completed');
console.log('Audio context initialized');
} catch (err) {
console.error('Error setting up audio context:', err);
addSystemMessage(`Audio context error: ${err.message}`);
streamButton.disabled = true;
}
}

// Connect to WebSocket server
function connectWebSocket() {
const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
const wsUrl = `${wsProtocol}//${window.location.hostname}:8000/ws`;
// Setup the audio visualizer
function setupVisualizer() {
visualizerCanvas = document.getElementById('audioVisualizer');
canvasContext = visualizerCanvas.getContext('2d');

ws = new WebSocket(wsUrl);
// Set canvas size to match container
function resizeCanvas() {
const container = visualizerCanvas.parentElement;
visualizerCanvas.width = container.clientWidth;
visualizerCanvas.height = container.clientHeight;
}

ws.onopen = () => {
console.log('WebSocket connected');
// Call initially and on window resize
resizeCanvas();
window.addEventListener('resize', resizeCanvas);

// Create placeholder data array
visualizerBufferLength = 128;
visualizerDataArray = new Uint8Array(visualizerBufferLength);
}

// Connect to Socket.IO server
function connectSocketIO() {
// Use the server URL with or without a specific port
const serverUrl = window.location.origin;

console.log(`Connecting to Socket.IO server at ${serverUrl}`);
socket = io(serverUrl, {
reconnectionDelay: 1000,
reconnectionDelayMax: 5000,
reconnectionAttempts: Infinity
});

// Socket.IO event handlers
socket.on('connect', () => {
console.log('Connected to Socket.IO server');
statusDot.classList.add('active');
statusText.textContent = 'Connected';
addSystemMessage('Connected to server');
};
streamButton.disabled = false;
});

ws.onmessage = (event) => {
const response = JSON.parse(event.data);
console.log('Received:', response);

if (response.type === 'audio_response') {
// Play audio response
const audio = new Audio(response.audio);
audio.play();

// Add message to conversation
addAIMessage(response.text || 'AI response', response.audio);

// Reset to speaking state after AI response
if (isStreaming) {
streamButton.textContent = 'Listening...';
streamButton.style.backgroundColor = '#f44336'; // Back to red
streamButton.classList.add('recording');
isSpeaking = false; // Reset speaking state
}
} else if (response.type === 'error') {
addSystemMessage(`Error: ${response.message}`);
} else if (response.type === 'context_updated') {
addSystemMessage(response.message);
} else if (response.type === 'streaming_status') {
addSystemMessage(`Streaming ${response.status}`);
} else if (response.type === 'transcription') {
addUserTranscription(response.text);
}
};

ws.onclose = () => {
console.log('WebSocket disconnected');
socket.on('disconnect', () => {
console.log('Disconnected from Socket.IO server');
statusDot.classList.remove('active');
statusText.textContent = 'Disconnected';
addSystemMessage('Disconnected from server. Reconnecting...');
setTimeout(connectWebSocket, 3000);
};
addSystemMessage('Disconnected from server');
streamButton.disabled = true;

ws.onerror = (error) => {
console.error('WebSocket error:', error);
// Stop streaming if active
if (isStreaming) {
stopStreaming(false); // false = don't send to server
}
});

socket.on('status', (data) => {
console.log('Status update:', data);
addSystemMessage(data.message);
});

socket.on('error', (data) => {
console.error('Server error:', data);
addSystemMessage(`Error: ${data.message}`);
});

socket.on('audio_response', (data) => {
console.log('Received audio response');

// Play audio response
const audio = new Audio(data.audio);
audio.play();

// Add message to conversation
addAIMessage(data.text || 'AI response', data.audio);

// Reset UI state after AI response
if (isStreaming) {
streamButton.textContent = 'Listening...';
streamButton.innerHTML = '<i class="fas fa-microphone"></i> Listening...';
streamButton.style.backgroundColor = '#f44336';
streamButton.classList.add('recording');
streamButton.classList.remove('processing');
isSpeaking = false; // Reset speaking state
}
});

socket.on('transcription', (data) => {
console.log('Received transcription:', data);
addUserTranscription(data.text);
});

socket.on('context_updated', (data) => {
console.log('Context updated:', data);
addSystemMessage(data.message);
});

socket.on('streaming_status', (data) => {
console.log('Streaming status:', data);
addSystemMessage(`Streaming ${data.status}`);
});

socket.on('connect_error', (error) => {
console.error('Connection error:', error);
statusDot.classList.remove('active');
statusText.textContent = 'Error';
addSystemMessage('Connection error');
};
statusText.textContent = 'Connection Error';
addSystemMessage('Failed to connect to server');
streamButton.disabled = true;
});
}

// Toggle streaming
function toggleStreaming() {
if (isStreaming) {
stopStreaming();
stopStreaming(true); // true = send to server
} else {
startStreaming();
}
@@ -295,49 +393,52 @@
// Start streaming
async function startStreaming() {
try {
// Request microphone access
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const speaker = parseInt(speakerSelectEl.value);

// Update state
isStreaming = true;
isSpeaking = false;
energyWindow = [];

streamButton.textContent = 'Listening...';
// Update UI
streamButton.innerHTML = '<i class="fas fa-microphone"></i> Listening...';
streamButton.classList.add('recording');

// Create audio processor node
// Setup audio analysis
const source = audioContext.createMediaStreamSource(stream);

// Set up analyser for visualization with better settings
// Setup analyzer for visualization
analyser = audioContext.createAnalyser();
analyser.fftSize = 256;
analyser.smoothingTimeConstant = 0.8; // Add smoothing for nicer visualization
analyser.smoothingTimeConstant = 0.8;
analyser.minDecibels = -90;
analyser.maxDecibels = -10;

visualizerBufferLength = analyser.frequencyBinCount;
visualizerDataArray = new Uint8Array(visualizerBufferLength);

// Connect source to analyzer first
// Connect source to analyzer
source.connect(analyser);

// Hide the label when visualization is active
// Hide visualizer label
visualizerLabel.style.opacity = '0';

// Start drawing the visualization
// Start visualization
if (visualizerAnimationFrame) {
cancelAnimationFrame(visualizerAnimationFrame);
}
drawVisualizer();

// Set up processor for audio processing
// Setup audio processor
streamProcessor = audioContext.createScriptProcessor(4096, 1, 1);

// Connect nodes
// Connect audio nodes
source.connect(streamProcessor);
streamProcessor.connect(audioContext.destination);

// Process and send audio data
// Process audio
streamProcessor.onaudioprocess = function(e) {
const audioData = e.inputBuffer.getChannelData(0);

@@ -349,10 +450,10 @@
const avgEnergy = calculateAverageEnergy();
const isSilent = avgEnergy < CLIENT_SILENCE_THRESHOLD;

// Handle silence/speech transitions for visual feedback
// Handle silence/speech transitions
handleSpeechState(isSilent);

// Continue processing audio regardless of silence state
// Process and send audio
const downsampled = downsampleBuffer(audioData, audioContext.sampleRate, 24000);
sendAudioChunk(downsampled, speaker);
};
@@ -363,8 +464,71 @@
console.error('Error starting audio stream:', err);
addSystemMessage(`Microphone error: ${err.message}`);
isStreaming = false;
streamButton.textContent = 'Start Conversation';
streamButton.classList.remove('recording');
streamButton.innerHTML = '<i class="fas fa-microphone"></i> Start Conversation';
streamButton.classList.remove('recording', 'processing');
}
}

// Stop streaming
function stopStreaming(sendToServer = true) {
// Disconnect audio nodes
if (streamProcessor) {
streamProcessor.disconnect();
streamProcessor = null;
}

if (analyser) {
analyser.disconnect();
analyser = null;
}

// Stop visualization
if (visualizerAnimationFrame) {
cancelAnimationFrame(visualizerAnimationFrame);
visualizerAnimationFrame = null;
}

// Clear canvas
if (canvasContext) {
canvasContext.clearRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);
visualizerLabel.style.opacity = '0.7';
}

// Clear silence timer
if (silenceTimer) {
clearTimeout(silenceTimer);
silenceTimer = null;
}

// Reset state
isStreaming = false;
isSpeaking = false;
energyWindow = [];

// Update UI
streamButton.innerHTML = '<i class="fas fa-microphone"></i> Start Conversation';
streamButton.classList.remove('recording', 'processing');
streamButton.style.backgroundColor = '';

addSystemMessage('Conversation paused');

// Notify server
if (sendToServer && socket.connected) {
socket.emit('stop_streaming', {
speaker: parseInt(speakerSelectEl.value)
});
}
}

// Clear conversation
function clearConversation() {
// Clear UI
conversationEl.innerHTML = '';
addSystemMessage('Conversation cleared');

// Notify server
if (socket.connected) {
socket.emit('clear_context');
}
}

@@ -377,7 +541,7 @@
return sum / buffer.length;
}

// Update the sliding energy window
// Update energy window
function updateEnergyWindow(energy) {
energyWindow.push(energy);
if (energyWindow.length > ENERGY_WINDOW_SIZE) {
@@ -385,20 +549,20 @@
}
}

// Calculate average energy from the window
// Calculate average energy
function calculateAverageEnergy() {
if (energyWindow.length === 0) return 0;
return energyWindow.reduce((sum, val) => sum + val, 0) / energyWindow.length;
}

// Handle speech state changes and visual feedback
// Handle speech state changes
function handleSpeechState(isSilent) {
if (isSpeaking && isSilent) {
// Transition from speaking to silence
if (!silenceTimer) {
silenceTimer = setTimeout(() => {
// Silence persisted long enough
streamButton.textContent = 'Processing...';
streamButton.innerHTML = '<i class="fas fa-cog fa-spin"></i> Processing...';
streamButton.classList.remove('recording');
streamButton.classList.add('processing');
addSystemMessage('Detected pause in speech, processing response...');
@@ -407,24 +571,24 @@
} else if (!isSpeaking && !isSilent) {
// Transition from silence to speaking
isSpeaking = true;
streamButton.textContent = 'Listening...';
streamButton.innerHTML = '<i class="fas fa-microphone"></i> Listening...';
streamButton.classList.add('recording');
streamButton.classList.remove('processing');

// Clear any pending silence timer
// Clear silence timer
if (silenceTimer) {
clearTimeout(silenceTimer);
silenceTimer = null;
}
} else if (isSpeaking && !isSilent) {
// Still speaking, reset any silence timer
// Still speaking, reset silence timer
if (silenceTimer) {
clearTimeout(silenceTimer);
silenceTimer = null;
}
}

// Update speaking state
// Update speaking state for non-silent audio
if (!isSilent) {
isSpeaking = true;
}
@@ -432,83 +596,93 @@

// Send audio chunk to server
function sendAudioChunk(audioData, speaker) {
if (!socket || !socket.connected) {
console.warn('Cannot send audio: socket not connected');
return;
}

const wavData = createWavBlob(audioData, 24000);
const reader = new FileReader();

reader.onloadend = function() {
const base64data = reader.result;

// Send to server
ws.send(JSON.stringify({
action: 'stream_audio',
// Send to server using Socket.IO
socket.emit('stream_audio', {
speaker: speaker,
audio: base64data
}));
});
};

reader.readAsDataURL(wavData);
}

// Stop streaming
function stopStreaming() {
if (streamProcessor) {
streamProcessor.disconnect();
streamProcessor = null;
// Visualization function
function drawVisualizer() {
if (!canvasContext) {
console.error("Canvas context not available");
return;
}

if (analyser) {
analyser.disconnect();
analyser = null;
visualizerAnimationFrame = requestAnimationFrame(drawVisualizer);

// Get frequency data if available
if (isStreaming && analyser) {
try {
analyser.getByteFrequencyData(visualizerDataArray);
} catch (e) {
console.error("Error getting frequency data:", e);
}
} else {
// Fade out when not streaming
for (let i = 0; i < visualizerDataArray.length; i++) {
visualizerDataArray[i] = Math.max(0, visualizerDataArray[i] - 5);
}
}

// Stop the visualization
if (visualizerAnimationFrame) {
cancelAnimationFrame(visualizerAnimationFrame);
visualizerAnimationFrame = null;
// Clear canvas
canvasContext.fillStyle = 'rgba(245, 245, 245, 0.2)';
canvasContext.fillRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);

// Draw bars
const width = visualizerCanvas.width;
const height = visualizerCanvas.height;
const barCount = Math.min(visualizerBufferLength, 64);
const barWidth = width / barCount - 1;

for (let i = 0; i < barCount; i++) {
const index = Math.floor(i * visualizerBufferLength / barCount);
const value = visualizerDataArray[index];

const barHeight = (value / 255) * height;
const x = i * (barWidth + 1);

// Color based on frequency
const hue = 200 + (i / barCount * 60);
const saturation = 90 - (value / 255 * 30);
const lightness = 40 + (value / 255 * 30);

// Draw bar
canvasContext.fillStyle = `hsl(${hue}, ${saturation}%, ${lightness}%)`;
canvasContext.fillRect(x, height - barHeight, barWidth, barHeight);

// Add reflection effect
const gradientHeight = Math.min(10, barHeight / 3);
const gradient = canvasContext.createLinearGradient(
0, height - barHeight,
0, height - barHeight + gradientHeight
);
gradient.addColorStop(0, 'rgba(255, 255, 255, 0.3)');
gradient.addColorStop(1, 'rgba(255, 255, 255, 0)');
canvasContext.fillStyle = gradient;
canvasContext.fillRect(x, height - barHeight, barWidth, gradientHeight);
}

// Clear the canvas
if (canvasContext) {
canvasContext.clearRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);
visualizerLabel.style.opacity = '0.7';
}

// Clear any pending silence timer
if (silenceTimer) {
clearTimeout(silenceTimer);
silenceTimer = null;
}

isStreaming = false;
isSpeaking = false;
energyWindow = [];

streamButton.textContent = 'Start Conversation';
streamButton.classList.remove('recording', 'processing');
streamButton.style.backgroundColor = ''; // Reset to default

addSystemMessage('Conversation paused');

// Send stop streaming signal to server
ws.send(JSON.stringify({
action: 'stop_streaming',
speaker: parseInt(speakerSelectEl.value)
}));
// Show/hide the label
visualizerLabel.style.opacity = isStreaming ? '0' : '0.7';
}

// Clear conversation
function clearConversation() {
// Clear conversation history
ws.send(JSON.stringify({
action: 'clear_context'
}));

// Clear the UI
conversationEl.innerHTML = '';
addSystemMessage('Conversation cleared');
}

// Downsample audio buffer to target sample rate
// Downsample audio buffer
function downsampleBuffer(buffer, sampleRate, targetSampleRate) {
if (targetSampleRate === sampleRate) {
return buffer;
@@ -538,7 +712,7 @@
return result;
}

// Create WAV blob from Float32Array
// Create WAV blob
function createWavBlob(samples, sampleRate) {
const buffer = new ArrayBuffer(44 + samples.length * 2);
const view = new DataView(buffer);
@@ -562,8 +736,7 @@
writeString(view, 36, 'data');
view.setUint32(40, samples.length * 2, true);

// Write the PCM samples
const volume = 0.5;
// Write PCM samples
for (let i = 0; i < samples.length; i++) {
const sample = Math.max(-1, Math.min(1, samples[i]));
view.setInt16(44 + i * 2, sample < 0 ? sample * 0x8000 : sample * 0x7FFF, true);
@@ -572,19 +745,19 @@
return new Blob([buffer], { type: 'audio/wav' });
}

// Write string to DataView
function writeString(view, offset, string) {
for (let i = 0; i < string.length; i++) {
view.setUint8(offset + i, string.charCodeAt(i));
}
}

// Message display functions
// Add user transcription
function addUserTranscription(text) {
// Find if there's already a pending user message
// Find or create user message
let pendingMessage = document.querySelector('.message.user.pending');

if (!pendingMessage) {
// Create a new message
pendingMessage = document.createElement('div');
pendingMessage.classList.add('message', 'user', 'pending');
conversationEl.appendChild(pendingMessage);
@@ -595,6 +768,7 @@
conversationEl.scrollTop = conversationEl.scrollHeight;
}

// Add AI message
function addAIMessage(text, audioSrc) {
const messageEl = document.createElement('div');
messageEl.classList.add('message', 'ai');
@@ -614,6 +788,7 @@
conversationEl.scrollTop = conversationEl.scrollHeight;
}

// Add system message
function addSystemMessage(text) {
const messageEl = document.createElement('div');
messageEl.classList.add('message', 'system');
@@ -621,98 +796,6 @@
conversationEl.appendChild(messageEl);
conversationEl.scrollTop = conversationEl.scrollHeight;
}

// Setup the audio visualizer
function setupVisualizer() {
visualizerCanvas = document.getElementById('audioVisualizer');
canvasContext = visualizerCanvas.getContext('2d');

// Set canvas size to match container
function resizeCanvas() {
const container = visualizerCanvas.parentElement;
visualizerCanvas.width = container.clientWidth;
visualizerCanvas.height = container.clientHeight;
}

// Call initially and on window resize
resizeCanvas();
window.addEventListener('resize', resizeCanvas);

// Create placeholder data array (will be used before streaming starts)
visualizerBufferLength = 128; // Default size
visualizerDataArray = new Uint8Array(visualizerBufferLength);
}

// Add the visualization drawing function
function drawVisualizer() {
// Ensure we have the canvas context
if (!canvasContext) {
console.error("Canvas context not available");
return;
}

visualizerAnimationFrame = requestAnimationFrame(drawVisualizer);

// If we're streaming and have an analyzer, get the frequency data
if (isStreaming && analyser) {
try {
analyser.getByteFrequencyData(visualizerDataArray);
} catch (e) {
console.error("Error getting frequency data:", e);
}
} else {
// If not streaming, gradually reduce all values to create a fade-out effect
for (let i = 0; i < visualizerDataArray.length; i++) {
visualizerDataArray[i] = Math.max(0, visualizerDataArray[i] - 5);
}
}

// Clear the canvas with a very slight background
canvasContext.fillStyle = 'rgba(245, 245, 245, 0.2)';
canvasContext.fillRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);

// Calculate bar width based on canvas size and buffer length
const width = visualizerCanvas.width;
const height = visualizerCanvas.height;
const barCount = Math.min(visualizerBufferLength, 64); // Limit bars for performance
const barWidth = width / barCount - 1; // Leave 1px gap

// Draw bars
for (let i = 0; i < barCount; i++) {
// Use a logarithmic scale for better visualization of lower frequencies
const index = Math.floor(i * visualizerBufferLength / barCount);
const value = visualizerDataArray[index];

// Scale height (values typically range from 0-255)
const barHeight = (value / 255) * height;

// Position x coordinate
const x = i * (barWidth + 1);

// Calculate gradient color based on frequency
const hue = 200 + (i / barCount * 60); // Blue to light-blue/cyan spectrum
const saturation = 90 - (value / 255 * 30); // More saturated for louder sounds
const lightness = 40 + (value / 255 * 30); // Brighter for louder sounds

// Draw the bar
canvasContext.fillStyle = `hsl(${hue}, ${saturation}%, ${lightness}%)`;
canvasContext.fillRect(x, height - barHeight, barWidth, barHeight);

// Add a subtle reflection
const gradientHeight = Math.min(10, barHeight / 3);
const gradient = canvasContext.createLinearGradient(
0, height - barHeight,
0, height - barHeight + gradientHeight
);
gradient.addColorStop(0, 'rgba(255, 255, 255, 0.3)');
gradient.addColorStop(1, 'rgba(255, 255, 255, 0)');
canvasContext.fillStyle = gradient;
canvasContext.fillRect(x, height - barHeight, barWidth, gradientHeight);
}

// Only show the label when not streaming
visualizerLabel.style.opacity = isStreaming ? '0' : '0.7';
}
</script>
</body>
</html>
@@ -1,24 +1,20 @@
import os
import base64
import json
import asyncio
import torch
import torchaudio
import numpy as np
import io
import whisperx
from io import BytesIO
from typing import List, Dict, Any, Optional
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
from fastapi.responses import HTMLResponse, FileResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from flask import Flask, request, send_from_directory, Response
from flask_cors import CORS
from flask_socketio import SocketIO, emit, disconnect
from generator import load_csm_1b, Segment
import uvicorn
import time
import gc
from collections import deque
from threading import Lock

# Select device
if torch.cuda.is_available():
@@ -36,73 +32,39 @@ print("Loading WhisperX model...")
asr_model = whisperx.load_model("medium", device, compute_type="float16")
print("WhisperX model loaded!")

app = FastAPI()

# Add CORS middleware to allow cross-origin requests
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],  # Allow all origins in development
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Silence detection parameters
SILENCE_THRESHOLD = 0.01  # Adjust based on your audio normalization
SILENCE_DURATION_SEC = 1.0  # How long silence must persist

# Define the base directory
base_dir = os.path.dirname(os.path.abspath(__file__))

# Mount a static files directory if you have any static assets like CSS or JS
static_dir = os.path.join(base_dir, "static")
os.makedirs(static_dir, exist_ok=True)  # Create the directory if it doesn't exist
app.mount("/static", StaticFiles(directory=static_dir), name="static")
os.makedirs(static_dir, exist_ok=True)

# Define route to serve index.html as the main page
@app.get("/", response_class=HTMLResponse)
async def get_index():
try:
with open(os.path.join(base_dir, "index.html"), "r") as f:
return HTMLResponse(content=f.read())
except FileNotFoundError:
return HTMLResponse(content="<html><body><h1>Error: index.html not found</h1></body></html>")
# Setup Flask
app = Flask(__name__)
CORS(app)
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')

# Add a favicon endpoint (optional, but good to have)
@app.get("/favicon.ico")
async def get_favicon():
favicon_path = os.path.join(static_dir, "favicon.ico")
if os.path.exists(favicon_path):
return FileResponse(favicon_path)
else:
return HTMLResponse(status_code=204)  # No content

# Connection manager to handle multiple clients
class ConnectionManager:
def __init__(self):
self.active_connections: List[WebSocket] = []

async def connect(self, websocket: WebSocket):
await websocket.accept()
self.active_connections.append(websocket)

def disconnect(self, websocket: WebSocket):
self.active_connections.remove(websocket)

manager = ConnectionManager()

# Silence detection parameters
SILENCE_THRESHOLD = 0.01  # Adjust based on your audio normalization
SILENCE_DURATION_SEC = 1.0  # How long silence must persist to be considered "stopped talking"
# Socket connection management
thread = None
thread_lock = Lock()
active_clients = {}  # Map client_id to client context

# Helper function to convert audio data
async def decode_audio_data(audio_data: str) -> torch.Tensor:
def decode_audio_data(audio_data: str) -> torch.Tensor:
"""Decode base64 audio data to a torch tensor"""
try:
# Extract the actual base64 content
if ',' in audio_data:
audio_data = audio_data.split(',')[1]

# Decode base64 audio data
binary_data = base64.b64decode(audio_data.split(',')[1] if ',' in audio_data else audio_data)
binary_data = base64.b64decode(audio_data)

# Save to a temporary WAV file first
temp_file = BytesIO(binary_data)

# Load audio from binary data, explicitly specifying the format
audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")
# Load audio from binary data
with BytesIO(binary_data) as temp_file:
audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")

# Resample if needed
if sample_rate != generator.sample_rate:
@@ -121,7 +83,7 @@ async def decode_audio_data(audio_data: str) -> torch.Tensor:
return torch.zeros(generator.sample_rate // 2)  # 0.5 seconds of silence


async def encode_audio_data(audio_tensor: torch.Tensor) -> str:
def encode_audio_data(audio_tensor: torch.Tensor) -> str:
"""Encode torch tensor audio to base64 string"""
buf = BytesIO()
torchaudio.save(buf, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav")
@@ -130,40 +92,36 @@ async def encode_audio_data(audio_tensor: torch.Tensor) -> str:
return f"data:audio/wav;base64,{audio_base64}"


async def transcribe_audio(audio_tensor: torch.Tensor) -> str:
def transcribe_audio(audio_tensor: torch.Tensor) -> str:
"""Transcribe audio using WhisperX"""
try:
# Save the tensor to a temporary file
temp_file = BytesIO()
torchaudio.save(temp_file, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav")
temp_file.seek(0)

# Create a temporary file on disk (WhisperX requires a file path)
temp_path = "temp_audio.wav"
with open(temp_path, "wb") as f:
f.write(temp_file.read())
temp_path = os.path.join(base_dir, "temp_audio.wav")
torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate)

# Load and transcribe the audio
audio = whisperx.load_audio(temp_path)
result = asr_model.transcribe(audio, batch_size=16)

# Clean up
os.remove(temp_path)
if os.path.exists(temp_path):
os.remove(temp_path)

# Get the transcription text
if result["segments"] and len(result["segments"]) > 0:
# Combine all segments
transcription = " ".join([segment["text"] for segment in result["segments"]])
print(f"Transcription: {transcription}")
return transcription.strip()
else:
return ""
except Exception as e:
print(f"Error in transcription: {str(e)}")
if os.path.exists("temp_audio.wav"):
os.remove("temp_audio.wav")
return ""


async def generate_response(text: str, conversation_history: List[Segment]) -> str:
def generate_response(text: str, conversation_history: List[Segment]) -> str:
"""Generate a contextual response based on the transcribed text"""
# Simple response logic - can be replaced with a more sophisticated LLM in the future
responses = {
@@ -191,311 +149,319 @@ async def generate_response(text: str, conversation_history: List[Segment]) -> s
else:
return f"I understand you said '{text}'. That's interesting! Can you tell me more about that?"

# Flask routes for serving static content
@app.route('/')
def index():
return send_from_directory(base_dir, 'index.html')

@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
await manager.connect(websocket)
context_segments = []  # Store conversation context
streaming_buffer = []  # Buffer for streaming audio chunks
is_streaming = False
@app.route('/favicon.ico')
def favicon():
if os.path.exists(os.path.join(static_dir, 'favicon.ico')):
return send_from_directory(static_dir, 'favicon.ico')
return Response(status=204)

# Variables for silence detection
last_active_time = time.time()
is_silence = False
energy_window = deque(maxlen=10)  # For tracking recent audio energy
@app.route('/static/<path:path>')
def serve_static(path):
return send_from_directory(static_dir, path)

# Socket.IO event handlers
@socketio.on('connect')
def handle_connect():
client_id = request.sid
print(f"Client connected: {client_id}")

# Initialize client context
active_clients[client_id] = {
'context_segments': [],
'streaming_buffer': [],
'is_streaming': False,
'is_silence': False,
'last_active_time': time.time(),
'energy_window': deque(maxlen=10)
}

emit('status', {'type': 'connected', 'message': 'Connected to server'})

@socketio.on('disconnect')
def handle_disconnect():
client_id = request.sid
if client_id in active_clients:
del active_clients[client_id]
print(f"Client disconnected: {client_id}")

@socketio.on('generate')
def handle_generate(data):
client_id = request.sid
if client_id not in active_clients:
emit('error', {'message': 'Client not registered'})
return

try:
while True:
# Receive JSON data from client
data = await websocket.receive_text()
request = json.loads(data)
text = data.get('text', '')
speaker_id = data.get('speaker', 0)

action = request.get("action")
print(f"Generating audio for: '{text}' with speaker {speaker_id}")

if action == "generate":
try:
text = request.get("text", "")
speaker_id = request.get("speaker", 0)
# Generate audio response
audio_tensor = generator.generate(
text=text,
speaker=speaker_id,
context=active_clients[client_id]['context_segments'],
max_audio_length_ms=10_000,
)

# Generate audio response
print(f"Generating audio for: '{text}' with speaker {speaker_id}")
audio_tensor = generator.generate(
text=text,
speaker=speaker_id,
context=context_segments,
max_audio_length_ms=10_000,
)
# Add to conversation context
active_clients[client_id]['context_segments'].append(
Segment(text=text, speaker=speaker_id, audio=audio_tensor)
)

# Add to conversation context
context_segments.append(Segment(text=text, speaker=speaker_id, audio=audio_tensor))
# Convert audio to base64 and send back to client
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'audio': audio_base64
})

# Convert audio to base64 and send back to client
audio_base64 = await encode_audio_data(audio_tensor)
await websocket.send_json({
"type": "audio_response",
"audio": audio_base64
})
except Exception as e:
print(f"Error generating audio: {str(e)}")
await websocket.send_json({
"type": "error",
"message": f"Error generating audio: {str(e)}"
})

elif action == "add_to_context":
try:
text = request.get("text", "")
speaker_id = request.get("speaker", 0)
audio_data = request.get("audio", "")

# Convert received audio to tensor
audio_tensor = await decode_audio_data(audio_data)

# Add to conversation context
context_segments.append(Segment(text=text, speaker=speaker_id, audio=audio_tensor))

await websocket.send_json({
"type": "context_updated",
"message": "Audio added to context"
})
except Exception as e:
print(f"Error adding to context: {str(e)}")
await websocket.send_json({
"type": "error",
"message": f"Error processing audio: {str(e)}"
})

elif action == "clear_context":
context_segments = []
await websocket.send_json({
"type": "context_updated",
"message": "Context cleared"
})

elif action == "stream_audio":
try:
speaker_id = request.get("speaker", 0)
audio_data = request.get("audio", "")

# Convert received audio to tensor
audio_chunk = await decode_audio_data(audio_data)

# Start streaming mode if not already started
if not is_streaming:
is_streaming = True
streaming_buffer = []
energy_window.clear()
is_silence = False
last_active_time = time.time()
print(f"Streaming started with speaker ID: {speaker_id}")
await websocket.send_json({
"type": "streaming_status",
"status": "started"
})

# Calculate audio energy for silence detection
chunk_energy = torch.mean(torch.abs(audio_chunk)).item()
energy_window.append(chunk_energy)
avg_energy = sum(energy_window) / len(energy_window)

# Debug audio levels
if len(energy_window) >= 5:  # Only start printing after we have enough samples
if avg_energy > SILENCE_THRESHOLD:
print(f"[AUDIO] Active sound detected - Energy: {avg_energy:.6f} (threshold: {SILENCE_THRESHOLD})")
else:
print(f"[AUDIO] Silence detected - Energy: {avg_energy:.6f} (threshold: {SILENCE_THRESHOLD})")

# Check if audio is silent
current_silence = avg_energy < SILENCE_THRESHOLD

# Track silence transition
if not is_silence and current_silence:
# Transition to silence
is_silence = True
last_active_time = time.time()
print("[STREAM] Transition to silence detected")
elif is_silence and not current_silence:
# User started talking again
is_silence = False
print("[STREAM] User resumed speaking")

# Add chunk to buffer regardless of silence state
streaming_buffer.append(audio_chunk)

# Debug buffer size periodically
if len(streaming_buffer) % 10 == 0:
print(f"[BUFFER] Current size: {len(streaming_buffer)} chunks, ~{len(streaming_buffer)/5:.1f} seconds")

# Check if silence has persisted long enough to consider "stopped talking"
silence_elapsed = time.time() - last_active_time

if is_silence and silence_elapsed >= SILENCE_DURATION_SEC and len(streaming_buffer) > 0:
# User has stopped talking - process the collected audio
print(f"[STREAM] Processing audio after {silence_elapsed:.2f}s of silence")
print(f"[STREAM] Processing {len(streaming_buffer)} audio chunks (~{len(streaming_buffer)/5:.1f} seconds)")

full_audio = torch.cat(streaming_buffer, dim=0)

# Log audio statistics
audio_duration = len(full_audio) / generator.sample_rate
audio_min = torch.min(full_audio).item()
audio_max = torch.max(full_audio).item()
audio_mean = torch.mean(full_audio).item()
print(f"[AUDIO] Processed audio - Duration: {audio_duration:.2f}s, Min: {audio_min:.4f}, Max: {audio_max:.4f}, Mean: {audio_mean:.4f}")

# Process with WhisperX speech-to-text
print("[ASR] Starting transcription with WhisperX...")
transcribed_text = await transcribe_audio(full_audio)

# Log the transcription
print(f"[ASR] Transcribed text: '{transcribed_text}'")

# Add to conversation context
if transcribed_text:
print(f"[DIALOG] Adding user utterance to context: '{transcribed_text}'")
user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
context_segments.append(user_segment)

# Generate a contextual response
print("[DIALOG] Generating response...")
response_text = await generate_response(transcribed_text, context_segments)
print(f"[DIALOG] Response text: '{response_text}'")

# Send the transcribed text to client
await websocket.send_json({
"type": "transcription",
"text": transcribed_text
})

# Generate audio for the response
print("[TTS] Generating speech for response...")
audio_tensor = generator.generate(
text=response_text,
speaker=1 if speaker_id == 0 else 0,  # Use opposite speaker
context=context_segments,
max_audio_length_ms=10_000,
)
print(f"[TTS] Generated audio length: {len(audio_tensor)/generator.sample_rate:.2f}s")

# Add response to context
ai_segment = Segment(
text=response_text,
speaker=1 if speaker_id == 0 else 0,
audio=audio_tensor
)
context_segments.append(ai_segment)
print(f"[DIALOG] Context now has {len(context_segments)} segments")

# Convert audio to base64 and send back to client
audio_base64 = await encode_audio_data(audio_tensor)
print("[STREAM] Sending audio response to client")
await websocket.send_json({
"type": "audio_response",
"text": response_text,
"audio": audio_base64
})
else:
print("[ASR] Transcription failed or returned empty text")
# If transcription failed, send a generic response
await websocket.send_json({
"type": "error",
"message": "Sorry, I couldn't understand what you said. Could you try again?"
})

# Clear buffer and reset silence detection
streaming_buffer = []
energy_window.clear()
is_silence = False
last_active_time = time.time()
print("[STREAM] Buffer cleared, ready for next utterance")

# If buffer gets too large without silence, process it anyway
# This prevents memory issues with very long streams
elif len(streaming_buffer) >= 30:  # ~6 seconds of audio at 5 chunks/sec
print("[BUFFER] Maximum buffer size reached, processing audio")
full_audio = torch.cat(streaming_buffer, dim=0)

# Process with WhisperX speech-to-text
print("[ASR] Starting forced transcription of long audio...")
transcribed_text = await transcribe_audio(full_audio)

if transcribed_text:
print(f"[ASR] Transcribed long audio: '{transcribed_text}'")
context_segments.append(Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio))

# Send the transcribed text to client
await websocket.send_json({
"type": "transcription",
"text": transcribed_text + " (processing continued speech...)"
})
else:
print("[ASR] No transcription from long audio")

streaming_buffer = []
print("[BUFFER] Buffer cleared due to size limit")

except Exception as e:
print(f"[ERROR] Processing streaming audio: {str(e)}")
# Print traceback for more detailed error information
import traceback
traceback.print_exc()
await websocket.send_json({
"type": "error",
"message": f"Error processing streaming audio: {str(e)}"
})

elif action == "stop_streaming":
is_streaming = False
if streaming_buffer and len(streaming_buffer) > 5:  # Only process if there's meaningful audio
# Process any remaining audio in the buffer
full_audio = torch.cat(streaming_buffer, dim=0)

# Process with WhisperX speech-to-text
transcribed_text = await transcribe_audio(full_audio)

if transcribed_text:
context_segments.append(Segment(text=transcribed_text, speaker=request.get("speaker", 0), audio=full_audio))

# Send the transcribed text to client
await websocket.send_json({
"type": "transcription",
"text": transcribed_text
})

streaming_buffer = []
await websocket.send_json({
"type": "streaming_status",
"status": "stopped"
})

except WebSocketDisconnect:
manager.disconnect(websocket)
print("Client disconnected")
except Exception as e:
print(f"Error: {str(e)}")
try:
await websocket.send_json({
"type": "error",
"message": str(e)
})
except:
pass
manager.disconnect(websocket)
print(f"Error generating audio: {str(e)}")
emit('error', {
'type': 'error',
'message': f"Error generating audio: {str(e)}"
})

@socketio.on('add_to_context')
def handle_add_to_context(data):
client_id = request.sid
if client_id not in active_clients:
emit('error', {'message': 'Client not registered'})
return

try:
text = data.get('text', '')
speaker_id = data.get('speaker', 0)
audio_data = data.get('audio', '')

# Convert received audio to tensor
audio_tensor = decode_audio_data(audio_data)

# Add to conversation context
active_clients[client_id]['context_segments'].append(
Segment(text=text, speaker=speaker_id, audio=audio_tensor)
)

emit('context_updated', {
'type': 'context_updated',
'message': 'Audio added to context'
})

except Exception as e:
print(f"Error adding to context: {str(e)}")
emit('error', {
'type': 'error',
'message': f"Error processing audio: {str(e)}"
})

@socketio.on('clear_context')
def handle_clear_context():
client_id = request.sid
if client_id in active_clients:
active_clients[client_id]['context_segments'] = []

emit('context_updated', {
'type': 'context_updated',
'message': 'Context cleared'
})

@socketio.on('stream_audio')
def handle_stream_audio(data):
client_id = request.sid
if client_id not in active_clients:
emit('error', {'message': 'Client not registered'})
return

client = active_clients[client_id]

try:
speaker_id = data.get('speaker', 0)
audio_data = data.get('audio', '')

# Convert received audio to tensor
audio_chunk = decode_audio_data(audio_data)

# Start streaming mode if not already started
if not client['is_streaming']:
client['is_streaming'] = True
client['streaming_buffer'] = []
client['energy_window'].clear()
client['is_silence'] = False
client['last_active_time'] = time.time()
print(f"[{client_id}] Streaming started with speaker ID: {speaker_id}")
emit('streaming_status', {
'type': 'streaming_status',
'status': 'started'
})

# Calculate audio energy for silence detection
chunk_energy = torch.mean(torch.abs(audio_chunk)).item()
client['energy_window'].append(chunk_energy)
avg_energy = sum(client['energy_window']) / len(client['energy_window'])

# Check if audio is silent
current_silence = avg_energy < SILENCE_THRESHOLD

# Track silence transition
if not client['is_silence'] and current_silence:
# Transition to silence
client['is_silence'] = True
client['last_active_time'] = time.time()
elif client['is_silence'] and not current_silence:
# User started talking again
client['is_silence'] = False

# Add chunk to buffer regardless of silence state
client['streaming_buffer'].append(audio_chunk)

# Check if silence has persisted long enough to consider "stopped talking"
silence_elapsed = time.time() - client['last_active_time']

if client['is_silence'] and silence_elapsed >= SILENCE_DURATION_SEC and len(client['streaming_buffer']) > 0:
# User has stopped talking - process the collected audio
print(f"[{client_id}] Processing audio after {silence_elapsed:.2f}s of silence")

full_audio = torch.cat(client['streaming_buffer'], dim=0)

# Process with WhisperX speech-to-text
print(f"[{client_id}] Starting transcription with WhisperX...")
transcribed_text = transcribe_audio(full_audio)

# Log the transcription
print(f"[{client_id}] Transcribed text: '{transcribed_text}'")

# Add to conversation context
if transcribed_text:
user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
client['context_segments'].append(user_segment)

# Generate a contextual response
response_text = generate_response(transcribed_text, client['context_segments'])

# Send the transcribed text to client
emit('transcription', {
'type': 'transcription',
'text': transcribed_text
})

# Generate audio for the response
audio_tensor = generator.generate(
text=response_text,
speaker=1 if speaker_id == 0 else 0,  # Use opposite speaker
context=client['context_segments'],
max_audio_length_ms=10_000,
)

# Add response to context
ai_segment = Segment(
text=response_text,
speaker=1 if speaker_id == 0 else 0,
audio=audio_tensor
)
client['context_segments'].append(ai_segment)

# Convert audio to base64 and send back to client
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'text': response_text,
'audio': audio_base64
})
else:
# If transcription failed, send a generic response
emit('error', {
'type': 'error',
'message': "Sorry, I couldn't understand what you said. Could you try again?"
})

# Clear buffer and reset silence detection
client['streaming_buffer'] = []
client['energy_window'].clear()
client['is_silence'] = False
client['last_active_time'] = time.time()

# If buffer gets too large without silence, process it anyway
elif len(client['streaming_buffer']) >= 30:  # ~6 seconds of audio at 5 chunks/sec
full_audio = torch.cat(client['streaming_buffer'], dim=0)

# Process with WhisperX speech-to-text
transcribed_text = transcribe_audio(full_audio)

if transcribed_text:
client['context_segments'].append(
Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
)

# Send the transcribed text to client
emit('transcription', {
'type': 'transcription',
'text': transcribed_text + " (processing continued speech...)"
})

client['streaming_buffer'] = []

except Exception as e:
import traceback
traceback.print_exc()
print(f"Error processing streaming audio: {str(e)}")
emit('error', {
'type': 'error',
'message': f"Error processing streaming audio: {str(e)}"
})

@socketio.on('stop_streaming')
def handle_stop_streaming(data):
client_id = request.sid
if client_id not in active_clients:
return

client = active_clients[client_id]
client['is_streaming'] = False

if client['streaming_buffer'] and len(client['streaming_buffer']) > 5:
# Process any remaining audio in the buffer
full_audio = torch.cat(client['streaming_buffer'], dim=0)

# Process with WhisperX speech-to-text
transcribed_text = transcribe_audio(full_audio)

if transcribed_text:
client['context_segments'].append(
Segment(text=transcribed_text, speaker=data.get("speaker", 0), audio=full_audio)
)

# Send the transcribed text to client
emit('transcription', {
'type': 'transcription',
'text': transcribed_text
})

client['streaming_buffer'] = []
emit('streaming_status', {
'type': 'streaming_status',
'status': 'stopped'
})

# Update the __main__ block with a comprehensive server startup message
if __name__ == "__main__":
print(f"\n{'='*60}")
print(f"🔊 Sesame AI Voice Chat Server")
print(f"🔊 Sesame AI Voice Chat Server (Flask Implementation)")
print(f"{'='*60}")
print(f"📡 Server Information:")
print(f"   - Local URL: http://localhost:8000")
print(f"   - Network URL: http://<your-ip-address>:8000")
print(f"   - WebSocket: ws://<your-ip-address>:8000/ws")
print(f"   - Local URL: http://localhost:5000")
print(f"   - Network URL: http://<your-ip-address>:5000")
print(f"   - WebSocket: ws://<your-ip-address>:5000/socket.io")
print(f"{'='*60}")
print(f"💡 To make this server public:")
print(f"   1. Ensure port 8000 is open in your firewall")
print(f"   2. Set up port forwarding on your router to port 8000")
print(f"   3. Or use a service like ngrok with: ngrok http 8000")
print(f"   1. Ensure port 5000 is open in your firewall")
print(f"   2. Set up port forwarding on your router to port 5000")
print(f"   3. Or use a service like ngrok with: ngrok http 5000")
print(f"{'='*60}")
print(f"🌐 Device: {device.upper()}")
print(f"🧠 Models loaded: Sesame CSM + WhisperX ({asr_model.device})")
@@ -503,5 +469,4 @@ if __name__ == "__main__":
print(f"{'='*60}")
print(f"Ready to receive connections! Press Ctrl+C to stop the server.\n")

# Start the server
uvicorn.run(app, host="0.0.0.0", port=8000)
socketio.run(app, host="0.0.0.0", port=5000, debug=False)