idler-wheel
2025-03-30 09:27:10 -04:00
8 changed files with 1356 additions and 632 deletions


@@ -3,454 +3,266 @@
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Voice Assistant - CSM & Whisper</title>
<script src="https://cdn.socket.io/4.6.0/socket.io.min.js"></script>
<title>AI Voice Chat</title>
<style>
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
margin: 0;
padding: 0;
background-color: #f5f5f5;
color: #333;
}
.container {
max-width: 800px;
margin: 0 auto;
padding: 20px;
background-color: #f5f7fa;
color: #333;
}
h1 {
color: #2c3e50;
header {
text-align: center;
margin-bottom: 30px;
}
#conversation {
height: 400px;
border: 1px solid #ddd;
border-radius: 10px;
padding: 20px;
margin-bottom: 20px;
overflow-y: auto;
background-color: white;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
h1 {
color: #2c3e50;
}
.message-container {
display: flex;
flex-direction: column;
margin-bottom: 15px;
}
.user-message-container {
align-items: flex-end;
}
.bot-message-container {
align-items: flex-start;
}
.message {
max-width: 80%;
padding: 12px;
border-radius: 18px;
position: relative;
word-break: break-word;
}
.user-message {
background-color: #dcf8c6;
color: #000;
border-bottom-right-radius: 4px;
}
.bot-message {
background-color: #f1f0f0;
color: #000;
border-bottom-left-radius: 4px;
}
.message-label {
font-size: 0.8em;
margin-bottom: 4px;
color: #657786;
}
#controls {
display: flex;
gap: 10px;
justify-content: center;
margin-bottom: 15px;
}
button {
padding: 12px 24px;
font-size: 16px;
cursor: pointer;
border-radius: 50px;
border: none;
outline: none;
transition: all 0.3s ease;
}
#recordButton {
background-color: #4CAF50;
.status-bar {
background-color: #2c3e50;
color: white;
width: 200px;
box-shadow: 0 4px 8px rgba(76, 175, 80, 0.3);
padding: 10px;
border-radius: 5px;
margin-bottom: 20px;
display: flex;
justify-content: space-between;
align-items: center;
}
#recordButton:hover {
background-color: #45a049;
transform: translateY(-2px);
.status-indicator {
display: flex;
align-items: center;
}
.status-dot {
height: 10px;
width: 10px;
border-radius: 50%;
margin-right: 8px;
}
.status-dot.connected { background-color: #2ecc71; }
.status-dot.connecting { background-color: #f39c12; }
.status-dot.disconnected { background-color: #e74c3c; }
#recordButton.recording {
background-color: #f44336;
.conversation {
background-color: white;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
height: 400px;
padding: 20px;
overflow-y: auto;
margin-bottom: 20px;
}
.message {
margin-bottom: 15px;
padding: 10px 15px;
border-radius: 18px;
max-width: 80%;
word-wrap: break-word;
}
.user-message {
background-color: #e3f2fd;
margin-left: auto;
border-bottom-right-radius: 5px;
}
.ai-message {
background-color: #f0f0f0;
margin-right: auto;
border-bottom-left-radius: 5px;
}
.controls {
display: flex;
justify-content: center;
gap: 15px;
margin-bottom: 20px;
}
button {
background-color: #2c3e50;
color: white;
border: none;
padding: 12px 24px;
border-radius: 25px;
cursor: pointer;
font-size: 16px;
transition: all 0.2s;
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
}
button:hover {
background-color: #1a252f;
}
button:disabled {
background-color: #95a5a6;
cursor: not-allowed;
}
.button-icon {
width: 20px;
height: 20px;
}
.mic-animation {
width: 60px;
height: 60px;
border-radius: 50%;
background-color: rgba(231, 76, 60, 0.2);
display: flex;
align-items: center;
justify-content: center;
animation: pulse 1.5s infinite;
box-shadow: 0 4px 8px rgba(244, 67, 54, 0.3);
margin: 0 auto 15px;
}
@keyframes pulse {
0% {
transform: scale(1);
transform: scale(0.95);
box-shadow: 0 0 0 0 rgba(231, 76, 60, 0.5);
}
50% {
transform: scale(1.05);
70% {
transform: scale(1);
box-shadow: 0 0 0 15px rgba(231, 76, 60, 0);
}
100% {
transform: scale(1);
transform: scale(0.95);
box-shadow: 0 0 0 0 rgba(231, 76, 60, 0);
}
}
#status {
.settings {
margin-top: 20px;
padding: 15px;
background-color: white;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.settings h3 {
margin-top: 0;
color: #2c3e50;
border-bottom: 1px solid #eee;
padding-bottom: 10px;
}
.settings-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 15px;
}
.setting-item {
padding: 10px;
background-color: #f9f9f9;
border-radius: 5px;
}
.audio-visualizer {
height: 50px;
width: 100%;
background-color: #f0f0f0;
margin-top: 10px;
border-radius: 5px;
overflow: hidden;
}
.info-message {
text-align: center;
margin-top: 15px;
color: #7f8c8d;
margin: 10px 0;
font-style: italic;
color: #657786;
}
.audio-wave {
display: flex;
justify-content: center;
align-items: center;
height: 40px;
gap: 3px;
.loading {
text-align: center;
margin: 20px 0;
}
.audio-wave span {
display: block;
width: 3px;
height: 100%;
background-color: #4CAF50;
animation: wave 1.5s infinite ease-in-out;
border-radius: 6px;
}
.audio-wave span:nth-child(2) {
animation-delay: 0.2s;
}
.audio-wave span:nth-child(3) {
animation-delay: 0.4s;
}
.audio-wave span:nth-child(4) {
animation-delay: 0.6s;
}
.audio-wave span:nth-child(5) {
animation-delay: 0.8s;
}
@keyframes wave {
0%, 100% {
height: 8px;
}
50% {
.spinner {
border: 4px solid rgba(0, 0, 0, 0.1);
border-radius: 50%;
border-top: 4px solid #2c3e50;
width: 30px;
height: 30px;
animation: spin 1s linear infinite;
margin: 0 auto 10px;
}
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
.hidden {
display: none;
}
.transcription-info {
font-size: 0.8em;
color: #888;
margin-top: 4px;
text-align: right;
footer {
text-align: center;
margin-top: 30px;
padding: 20px;
color: #7f8c8d;
font-size: 14px;
}
</style>
</head>
<body>
<h1>Voice Assistant with CSM & Whisper</h1>
<div id="conversation"></div>
<div class="container">
<header>
<h1>AI Voice Assistant</h1>
</header>
<div id="controls">
<button id="recordButton">Hold to Speak</button>
<div class="status-bar">
<div class="status-indicator">
<div class="status-dot disconnected" id="connection-dot"></div>
<span id="connection-status">Disconnected</span>
</div>
<div id="runtime-info">
<span id="models-status"></span>
</div>
</div>
<div id="audioWave" class="audio-wave hidden">
<span></span>
<span></span>
<span></span>
<span></span>
<span></span>
<div class="conversation" id="conversation">
<div class="info-message">Your conversation will appear here.</div>
</div>
<div id="status">Connecting to server...</div>
<div id="mic-animation" class="mic-animation" style="display: none;">
<svg width="24" height="24" viewBox="0 0 24 24" fill="white">
<path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm5.91-3c-.49 0-.9.36-.98.85C16.52 14.2 14.47 16 12 16s-4.52-1.8-4.93-4.15c-.08-.49-.49-.85-.98-.85-.61 0-1.09.54-1 1.14.49 3 2.89 5.35 5.91 5.78V20c0 .55.45 1 1 1s1-.45 1-1v-2.08c3.02-.43 5.42-2.78 5.91-5.78.1-.6-.39-1.14-1-1.14z"></path>
</svg>
</div>
<script>
const socket = io();
const recordButton = document.getElementById('recordButton');
const conversation = document.getElementById('conversation');
const status = document.getElementById('status');
const audioWave = document.getElementById('audioWave');
<div class="controls">
<button id="start-button" disabled>
<svg class="button-icon" viewBox="0 0 24 24" fill="white">
<path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm5.91-3c-.49 0-.9.36-.98.85C16.52 14.2 14.47 16 12 16s-4.52-1.8-4.93-4.15c-.08-.49-.49-.85-.98-.85-.61 0-1.09.54-1 1.14.49 3 2.89 5.35 5.91 5.78V20c0 .55.45 1 1 1s1-.45 1-1v-2.08c3.02-.43 5.42-2.78 5.91-5.78.1-.6-.39-1.14-1-1.14z"></path>
</svg>
Start Listening
</button>
<button id="interrupt-button" disabled>
<svg class="button-icon" viewBox="0 0 24 24" fill="white">
<path d="M12 2C6.48 2 2 6.48 2 12s4.48 10 10 10 10-4.48 10-10S17.52 2 12 2zm1 15h-2v-2h2v2zm0-4h-2V7h2v6z"></path>
</svg>
Interrupt
</button>
</div>
let mediaRecorder;
let audioChunks = [];
let isRecording = false;
let audioSendInterval;
let sessionActive = false;
<div id="loading" class="loading" style="display: none;">
<div class="spinner"></div>
<p id="loading-text">Processing your speech...</p>
</div>
// Initialize audio context
const audioContext = new (window.AudioContext || window.webkitAudioContext)();
<div class="settings">
<h3>Status</h3>
<div class="settings-grid">
<div class="setting-item">
<div><strong>Whisper Model:</strong> <span id="whisper-status">Loading...</span></div>
</div>
<div class="setting-item">
<div><strong>CSM Audio Model:</strong> <span id="csm-status">Loading...</span></div>
</div>
<div class="setting-item">
<div><strong>LLM Model:</strong> <span id="llm-status">Loading...</span></div>
</div>
<div class="setting-item">
<div><strong>WebRTC:</strong> <span id="webrtc-status">Not Connected</span></div>
</div>
</div>
</div>
</div>
// Connect to server
socket.on('connect', () => {
status.textContent = 'Connected to server';
sessionActive = true;
});
<footer>
<p>AI Voice Assistant | Using Fast Whisper, Llama 3.2, and CSM Audio Models</p>
</footer>
socket.on('disconnect', () => {
status.textContent = 'Disconnected from server';
sessionActive = false;
});
socket.on('ready', (data) => {
status.textContent = data.message;
setupAudioRecording();
});
socket.on('transcription', (data) => {
addMessage('user', data.text);
status.textContent = 'Assistant is thinking...';
});
socket.on('audio_response', (data) => {
// Play audio
status.textContent = 'Playing response...';
const audio = new Audio('data:audio/wav;base64,' + data.audio);
audio.onended = () => {
status.textContent = 'Ready to record';
};
audio.onerror = () => {
status.textContent = 'Error playing audio';
console.error('Error playing audio response');
};
audio.play().catch(err => {
status.textContent = 'Error playing audio: ' + err.message;
console.error('Error playing audio:', err);
});
// Display text
addMessage('bot', data.text);
});
socket.on('error', (data) => {
status.textContent = 'Error: ' + data.message;
console.error('Server error:', data.message);
});
function setupAudioRecording() {
// Check if browser supports required APIs
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
status.textContent = 'Your browser does not support audio recording';
return;
}
// Get user media
navigator.mediaDevices.getUserMedia({ audio: true })
.then(stream => {
// Setup recording with better audio quality
const options = {
mimeType: 'audio/webm',
audioBitsPerSecond: 128000
};
try {
mediaRecorder = new MediaRecorder(stream, options);
} catch (e) {
// Fallback if the specified options aren't supported
mediaRecorder = new MediaRecorder(stream);
}
mediaRecorder.ondataavailable = event => {
if (event.data.size > 0) {
audioChunks.push(event.data);
}
};
mediaRecorder.onstop = () => {
processRecording();
};
// Create audio analyzer for visualization
const source = audioContext.createMediaStreamSource(stream);
const analyzer = audioContext.createAnalyser();
analyzer.fftSize = 2048;
source.connect(analyzer);
// Setup button handlers with better touch handling
recordButton.addEventListener('mousedown', startRecording);
recordButton.addEventListener('touchstart', (e) => {
e.preventDefault(); // Prevent default touch behavior
startRecording();
});
recordButton.addEventListener('mouseup', stopRecording);
recordButton.addEventListener('touchend', (e) => {
e.preventDefault();
stopRecording();
});
recordButton.addEventListener('mouseleave', stopRecording);
status.textContent = 'Ready to record';
})
.catch(err => {
status.textContent = 'Error accessing microphone: ' + err.message;
console.error('Error accessing microphone:', err);
});
}
function startRecording() {
if (!isRecording && sessionActive) {
audioChunks = [];
mediaRecorder.start(100); // Collect data in 100ms chunks
recordButton.classList.add('recording');
recordButton.textContent = 'Release to Stop';
status.textContent = 'Recording...';
audioWave.classList.remove('hidden');
isRecording = true;
socket.emit('start_speaking');
// Start sending audio chunks periodically
audioSendInterval = setInterval(() => {
if (mediaRecorder.state === 'recording') {
mediaRecorder.requestData(); // Force ondataavailable to fire
}
}, 300); // Send every 300ms
}
}
function stopRecording() {
if (isRecording) {
clearInterval(audioSendInterval);
mediaRecorder.stop();
recordButton.classList.remove('recording');
recordButton.textContent = 'Hold to Speak';
status.textContent = 'Processing speech...';
audioWave.classList.add('hidden');
isRecording = false;
}
}
function processRecording() {
if (audioChunks.length === 0) {
status.textContent = 'No audio recorded';
return;
}
const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
// Convert to ArrayBuffer for processing
const fileReader = new FileReader();
fileReader.onloadend = () => {
try {
const arrayBuffer = fileReader.result;
// Convert to Float32Array - this works better with WebAudio API
const audioData = convertToFloat32(arrayBuffer);
// Convert to base64 for sending
const base64String = arrayBufferToBase64(audioData.buffer);
socket.emit('audio_chunk', { audio: base64String });
// Signal end of speech
socket.emit('stop_speaking');
} catch (e) {
console.error('Error processing audio:', e);
status.textContent = 'Error processing audio';
}
};
fileReader.onerror = () => {
status.textContent = 'Error reading audio data';
};
fileReader.readAsArrayBuffer(audioBlob);
}
function convertToFloat32(arrayBuffer) {
// Get raw audio data as Int16 (common format for audio)
const int16Array = new Int16Array(arrayBuffer);
// Convert to Float32 (normalize between -1 and 1)
const float32Array = new Float32Array(int16Array.length);
for (let i = 0; i < int16Array.length; i++) {
float32Array[i] = int16Array[i] / 32768.0;
}
return float32Array;
}
function addMessage(sender, text) {
const containerDiv = document.createElement('div');
containerDiv.className = sender === 'user' ? 'message-container user-message-container' : 'message-container bot-message-container';
const labelDiv = document.createElement('div');
labelDiv.className = 'message-label';
labelDiv.textContent = sender === 'user' ? 'You' : 'Assistant';
containerDiv.appendChild(labelDiv);
const messageDiv = document.createElement('div');
messageDiv.className = sender === 'user' ? 'message user-message' : 'message bot-message';
messageDiv.textContent = text;
containerDiv.appendChild(messageDiv);
if (sender === 'user') {
const infoDiv = document.createElement('div');
infoDiv.className = 'transcription-info';
infoDiv.textContent = 'Transcribed with Whisper';
containerDiv.appendChild(infoDiv);
}
conversation.appendChild(containerDiv);
conversation.scrollTop = conversation.scrollHeight;
}
function arrayBufferToBase64(buffer) {
let binary = '';
const bytes = new Uint8Array(buffer);
const len = bytes.byteLength;
for (let i = 0; i < len; i++) {
binary += String.fromCharCode(bytes[i]);
}
return window.btoa(binary);
}
// Handle page visibility change to avoid issues with background tabs
document.addEventListener('visibilitychange', () => {
if (document.hidden && isRecording) {
stopRecording();
}
});
// Clean disconnection when page is closed
window.addEventListener('beforeunload', () => {
if (socket && socket.connected) {
socket.disconnect();
}
});
</script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/socket.io/4.6.1/socket.io.min.js"></script>
<script src="./voice-chat.js"></script>
</body>
</html>


@@ -8,15 +8,14 @@ import numpy as np
from flask import Flask, render_template, request
from flask_socketio import SocketIO, emit
from transformers import AutoModelForCausalLM, AutoTokenizer
from collections import deque
import threading
import queue
import requests
import huggingface_hub
from generator import load_csm_1b, Segment
# Force CPU mode regardless of what's available
# This bypasses the CUDA/cuDNN library requirements
os.environ["CUDA_VISIBLE_DEVICES"] = "" # Hide all CUDA devices
torch.backends.cudnn.enabled = False # Disable cuDNN
from collections import deque
import json
import webrtcvad # For voice activity detection
# Configure environment with longer timeouts
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600" # 10 minutes timeout for downloads
@@ -27,28 +26,92 @@ os.makedirs("models", exist_ok=True)
app = Flask(__name__)
app.config['SECRET_KEY'] = 'your-secret-key'
socketio = SocketIO(app, cors_allowed_origins="*")
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')
# Force CPU regardless of what hardware is available
device = "cuda" if torch.cuda.is_available() else "cpu"
whisper_compute_type = "int8"
print(f"Forcing CPU mode for all models")
# Explicitly check for CUDA and print detailed info
print("\n=== CUDA Information ===")
if torch.cuda.is_available():
print(f"CUDA is available")
print(f"CUDA version: {torch.version.cuda}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
print("CUDA is not available")
# Check for cuDNN
try:
import ctypes
ctypes.CDLL("libcudnn_ops_infer.so.8")
print("cuDNN is available")
except:
print("cuDNN is not available (libcudnn_ops_infer.so.8 not found)")
# Determine compute device
try:
if torch.cuda.is_available():
device = "cuda"
whisper_compute_type = "float16"
print("🟢 CUDA is available and initialized successfully")
elif torch.backends.mps.is_available():
device = "mps"
whisper_compute_type = "float32"
print("🟢 MPS is available (Apple Silicon)")
else:
device = "cpu"
whisper_compute_type = "int8"
print("🟡 Using CPU (CUDA/MPS not available)")
except Exception as e:
print(f"🔴 Error initializing CUDA: {e}")
print("🔴 Falling back to CPU")
device = "cpu"
whisper_compute_type = "int8"
print(f"Using device: {device}")
# Initialize models with proper error handling
whisper_model = None
csm_generator = None
llm_model = None
llm_tokenizer = None
vad = None
# Constants
SAMPLE_RATE = 16000 # For VAD
VAD_FRAME_SIZE = 480 # 30ms at 16kHz for VAD
VAD_MODE = 3 # Aggressiveness 0-3; 3 filters out non-speech most aggressively
AUDIO_CHUNK_SIZE = 2400 # 100ms chunks when streaming AI voice
# Audio sample rates
CLIENT_SAMPLE_RATE = 44100 # Browser WebAudio default
WHISPER_SAMPLE_RATE = 16000 # Whisper expects 16kHz
# Session data structures
user_sessions = {} # session_id -> complete session data
# WebRTC ICE servers (STUN/TURN servers for NAT traversal)
ICE_SERVERS = [
{"urls": "stun:stun.l.google.com:19302"},
{"urls": "stun:stun1.l.google.com:19302"}
]
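# Sanity-check sketch for the framing constants above (not part of the original
# commit): webrtcvad only accepts 10/20/30 ms frames of 16-bit mono PCM, and the
# AI voice is streamed in 100 ms chunks at the CSM model's assumed 24 kHz output rate.
assert VAD_FRAME_SIZE == int(SAMPLE_RATE * 0.030)   # 480 samples == 30 ms at 16 kHz
assert AUDIO_CHUNK_SIZE == int(24000 * 0.100)       # 2400 samples == 100 ms at 24 kHz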
def load_models():
global whisper_model, csm_generator, llm_model, llm_tokenizer
"""Load all necessary models"""
global whisper_model, csm_generator, llm_model, llm_tokenizer, vad
# Initialize Voice Activity Detector
try:
vad = webrtcvad.Vad(VAD_MODE)
print("Voice Activity Detector initialized")
except Exception as e:
print(f"Error initializing VAD: {e}")
vad = None
# Initialize Faster-Whisper for transcription
try:
print("Loading Whisper model on CPU...")
# Import here to avoid immediate import errors if package is missing
print("Loading Whisper model...")
from faster_whisper import WhisperModel
whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8", download_root="./models/whisper")
whisper_model = WhisperModel("base", device=device, compute_type=whisper_compute_type, download_root="./models/whisper")
print("Whisper model loaded successfully")
except Exception as e:
print(f"Error loading Whisper model: {e}")
@@ -56,8 +119,8 @@ def load_models():
# Initialize CSM model for audio generation
try:
print("Loading CSM model on CPU...")
csm_generator = load_csm_1b(device="cpu")
print("Loading CSM model...")
csm_generator = load_csm_1b(device=device)
print("CSM model loaded successfully")
except Exception as e:
print(f"Error loading CSM model: {e}")
@@ -65,13 +128,14 @@ def load_models():
# Initialize Llama 3.2 model for response generation
try:
print("Loading Llama 3.2 model on CPU...")
llm_model_id = "meta-llama/Llama-3.2-1B" # Choose appropriate size based on resources
print("Loading Llama 3.2 model...")
llm_model_id = "meta-llama/Llama-3.2-1B"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id, cache_dir="./models/llama")
dtype = torch.bfloat16 if device != "cpu" else torch.float32
llm_model = AutoModelForCausalLM.from_pretrained(
llm_model_id,
torch_dtype=torch.float32, # Use float32 on CPU
device_map="cpu",
torch_dtype=dtype,
device_map=device,
cache_dir="./models/llama",
low_cpu_mem_usage=True
)
@@ -80,168 +144,344 @@ def load_models():
print(f"Error loading Llama 3.2 model: {e}")
print("Will use a fallback response generation method")
# Store conversation context
conversation_context = {} # session_id -> context
@app.route('/')
def index():
"""Serve the main interface"""
return render_template('index.html')
@app.route('/voice-chat.js')
def voice_chat_js():
"""Serve the JavaScript for voice chat"""
return app.send_static_file('voice-chat.js')
@socketio.on('connect')
def handle_connect():
print(f"Client connected: {request.sid}")
conversation_context[request.sid] = {
"""Handle new client connection"""
session_id = request.sid
print(f"Client connected: {session_id}")
# Initialize session data
user_sessions[session_id] = {
# Conversation context
'segments': [],
'speakers': [0, 1], # 0 = user, 1 = bot
'audio_buffer': deque(maxlen=10), # Store recent audio chunks
'is_speaking': False,
'silence_start': None
'conversation_history': [],
'is_turn_active': False,
# Audio buffers and state
'vad_buffer': deque(maxlen=30), # recent audio chunks kept for VAD lookback
'audio_buffer': bytearray(),
'is_user_speaking': False,
'last_vad_active': time.time(),
'silence_duration': 0,
'speech_frames': 0,
# AI state
'is_ai_speaking': False,
'should_interrupt_ai': False,
'ai_stream_queue': queue.Queue(),
# WebRTC status
'webrtc_connected': False,
'webrtc_peer_id': None,
# Processing flags
'is_processing': False,
'pending_user_audio': None
}
emit('ready', {'message': 'Connection established'})
# Send config to client
emit('session_ready', {
'whisper_available': whisper_model is not None,
'csm_available': csm_generator is not None,
'llm_available': llm_model is not None,
'client_sample_rate': CLIENT_SAMPLE_RATE,
'server_sample_rate': getattr(csm_generator, 'sample_rate', 24000) if csm_generator else 24000,
'ice_servers': ICE_SERVERS
})
@socketio.on('disconnect')
def handle_disconnect():
print(f"Client disconnected: {request.sid}")
if request.sid in conversation_context:
del conversation_context[request.sid]
"""Handle client disconnection"""
session_id = request.sid
print(f"Client disconnected: {session_id}")
@socketio.on('start_speaking')
def handle_start_speaking():
if request.sid in conversation_context:
conversation_context[request.sid]['is_speaking'] = True
conversation_context[request.sid]['audio_buffer'].clear()
print(f"User {request.sid} started speaking")
# Clean up resources
if session_id in user_sessions:
# Signal any running threads to stop
user_sessions[session_id]['should_interrupt_ai'] = True
@socketio.on('audio_chunk')
def handle_audio_chunk(data):
if request.sid not in conversation_context:
# Clean up resources
del user_sessions[session_id]
@socketio.on('webrtc_signal')
def handle_webrtc_signal(data):
"""Handle WebRTC signaling for P2P connection establishment"""
session_id = request.sid
if session_id not in user_sessions:
return
context = conversation_context[request.sid]
# Simply relay the signal to the client
# In a multi-user app, we would route this to the correct peer
emit('webrtc_signal', data)
@socketio.on('webrtc_connected')
def handle_webrtc_connected(data):
"""Client notifies that WebRTC connection is established"""
session_id = request.sid
if session_id not in user_sessions:
return
user_sessions[session_id]['webrtc_connected'] = True
print(f"WebRTC connected for session {session_id}")
emit('ready_for_speech', {'message': 'Ready to start conversation'})
@socketio.on('audio_stream')
def handle_audio_stream(data):
"""Process incoming audio stream packets from client"""
session_id = request.sid
if session_id not in user_sessions:
return
session = user_sessions[session_id]
try:
# Decode audio data
audio_data = base64.b64decode(data['audio'])
audio_numpy = np.frombuffer(audio_data, dtype=np.float32)
audio_tensor = torch.tensor(audio_numpy)
# Add to buffer
context['audio_buffer'].append(audio_tensor)
# Check for silence to detect end of speech
if context['is_speaking'] and is_silence(audio_tensor):
if context['silence_start'] is None:
context['silence_start'] = time.time()
elif time.time() - context['silence_start'] > 1.0: # 1 second of silence
# Process the complete utterance
process_user_utterance(request.sid)
else:
context['silence_start'] = None
@socketio.on('stop_speaking')
def handle_stop_speaking():
if request.sid in conversation_context:
conversation_context[request.sid]['is_speaking'] = False
process_user_utterance(request.sid)
print(f"User {request.sid} stopped speaking")
def is_silence(audio_tensor, threshold=0.02):
"""Check if an audio chunk is silence based on amplitude threshold"""
return torch.mean(torch.abs(audio_tensor)) < threshold
def process_user_utterance(session_id):
"""Process completed user utterance, generate response and send audio back"""
context = conversation_context[session_id]
if not context['audio_buffer']:
audio_bytes = base64.b64decode(data.get('audio', ''))
if not audio_bytes or len(audio_bytes) < 2: # Need at least one sample
return
# Combine audio chunks
full_audio = torch.cat(list(context['audio_buffer']), dim=0)
context['audio_buffer'].clear()
context['is_speaking'] = False
context['silence_start'] = None
# Add to current audio buffer
session['audio_buffer'] += audio_bytes
# Save audio to temporary WAV file for transcription
# Check for speech using VAD
has_speech = detect_speech(audio_bytes, session_id)
# Handle speech state machine
if has_speech:
# Reset silence tracking when speech is detected
session['last_vad_active'] = time.time()
session['silence_duration'] = 0
session['speech_frames'] += 1
# If not already marked as speaking and we have enough speech frames
if not session['is_user_speaking'] and session['speech_frames'] >= 5:
on_speech_started(session_id)
else:
# No speech detected in this frame
if session['is_user_speaking']:
# Calculate silence duration
now = time.time()
session['silence_duration'] = now - session['last_vad_active']
# If silent for more than 0.8 seconds (with enough prior speech), end the segment
if session['silence_duration'] > 0.8 and session['speech_frames'] > 8:
on_speech_ended(session_id)
else:
# Not speaking and no speech, just a silent frame
session['speech_frames'] = max(0, session['speech_frames'] - 1)
except Exception as e:
print(f"Error processing audio stream: {e}")
def detect_speech(audio_bytes, session_id):
"""Use VAD to check if audio contains speech"""
if session_id not in user_sessions:
return False
session = user_sessions[session_id]
# Store in VAD buffer for history
session['vad_buffer'].append(audio_bytes)
if vad is None:
# Fallback to simple energy detection
audio_data = np.frombuffer(audio_bytes, dtype=np.int16)
energy = np.mean(np.abs(audio_data)) / 32768.0
return energy > 0.015 # Simple threshold
try:
# Ensure we have the right amount of data for VAD
audio_data = np.frombuffer(audio_bytes, dtype=np.int16)
# If we have too much data, use just the right amount
if len(audio_data) >= VAD_FRAME_SIZE:
frame = audio_data[:VAD_FRAME_SIZE].tobytes()
return vad.is_speech(frame, SAMPLE_RATE)
# If too little data, accumulate in the VAD buffer and check periodically
if len(session['vad_buffer']) >= 3:
# Combine recent chunks to get enough data
combined = bytearray()
for chunk in list(session['vad_buffer'])[-3:]:
combined.extend(chunk)
# Extract the right amount of data
if len(combined) >= VAD_FRAME_SIZE:
frame = combined[:VAD_FRAME_SIZE]
return vad.is_speech(bytes(frame), SAMPLE_RATE)
return False
except Exception as e:
print(f"VAD error: {e}")
return False
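def _vad_smoke_test():
    """Sketch, not part of the original commit: the minimal webrtcvad call that
    detect_speech() wraps, assuming the 16 kHz / 30 ms framing configured above.
    A frame of pure silence is expected to come back as non-speech."""
    silent_frame = b"\x00\x00" * VAD_FRAME_SIZE   # 480 int16 samples == 30 ms
    return vad is not None and vad.is_speech(silent_frame, SAMPLE_RATE)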
def on_speech_started(session_id):
"""Handle start of user speech"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
# Reset audio buffer
session['audio_buffer'] = bytearray()
session['is_user_speaking'] = True
session['is_turn_active'] = True
# If AI is speaking, we need to interrupt it
if session['is_ai_speaking']:
session['should_interrupt_ai'] = True
emit('ai_interrupted_by_user', room=session_id)
# Notify client that we detected speech
emit('user_speech_start', room=session_id)
def on_speech_ended(session_id):
"""Handle end of user speech segment"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
# Mark as not speaking anymore
session['is_user_speaking'] = False
session['speech_frames'] = 0
# If no audio or already processing, skip
if len(session['audio_buffer']) < 4000 or session['is_processing']: # Skip very short clips or re-entrant processing
session['audio_buffer'] = bytearray()
return
# Mark as processing to prevent multiple processes
session['is_processing'] = True
# Create a copy of the audio buffer
audio_copy = session['audio_buffer']
session['audio_buffer'] = bytearray()
# Convert audio to the format needed for processing
try:
# Convert to float32 between -1 and 1
audio_np = np.frombuffer(audio_copy, dtype=np.int16).astype(np.float32) / 32768.0
audio_tensor = torch.from_numpy(audio_np)
# Resample to Whisper's expected sample rate if necessary
if CLIENT_SAMPLE_RATE != WHISPER_SAMPLE_RATE:
audio_tensor = torchaudio.functional.resample(
audio_tensor,
orig_freq=CLIENT_SAMPLE_RATE,
new_freq=WHISPER_SAMPLE_RATE
)
# Save as WAV for transcription
temp_audio_path = f"temp_audio_{session_id}.wav"
torchaudio.save(
temp_audio_path,
full_audio.unsqueeze(0),
44100 # Assuming 44.1kHz from client
audio_tensor.unsqueeze(0),
WHISPER_SAMPLE_RATE
)
try:
# Try using Whisper first if available
if whisper_model is not None:
user_text = transcribe_with_whisper(temp_audio_path)
else:
# Fallback to Google's speech recognition
user_text = transcribe_with_google(temp_audio_path)
# Start transcription and response process in a thread
threading.Thread(
target=process_user_utterance,
args=(session_id, temp_audio_path, audio_tensor),
daemon=True
).start()
if not user_text:
print("No speech detected.")
emit('error', {'message': 'No speech detected. Please try again.'}, room=session_id)
# Notify client that processing has started
emit('processing_speech', room=session_id)
except Exception as e:
print(f"Error preparing audio: {e}")
session['is_processing'] = False
emit('error', {'message': f'Error processing audio: {str(e)}'}, room=session_id)
def process_user_utterance(session_id, audio_path, audio_tensor):
"""Process user utterance, transcribe and generate response"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
try:
# Transcribe audio
if whisper_model is not None:
user_text = transcribe_with_whisper(audio_path)
else:
# Fallback to another transcription service
user_text = transcribe_fallback(audio_path)
# Clean up temp file
if os.path.exists(audio_path):
os.remove(audio_path)
# Check if we got meaningful text
if not user_text or len(user_text.strip()) < 2:
socketio.emit('no_speech_detected', room=session_id)  # plain emit() needs a request context; this runs in a background thread
session['is_processing'] = False
return
print(f"Transcribed: {user_text}")
# Add to conversation segments
# Create user segment
user_segment = Segment(
text=user_text,
speaker=0, # User is speaker 0
audio=full_audio
audio=audio_tensor
)
context['segments'].append(user_segment)
session['segments'].append(user_segment)
# Generate bot response
bot_response = generate_llm_response(user_text, context['segments'])
print(f"Bot response: {bot_response}")
# Update conversation history
session['conversation_history'].append({
'role': 'user',
'text': user_text
})
# Send transcribed text to client
# Send transcription to client
socketio.emit('transcription', {'text': user_text}, room=session_id)
# Generate and send audio response if CSM is available
# Generate AI response
ai_response = generate_ai_response(user_text, session_id)
# Send text response to client
socketio.emit('ai_response_text', {'text': ai_response}, room=session_id)
# Update conversation history
session['conversation_history'].append({
'role': 'assistant',
'text': ai_response
})
# Generate voice response if CSM is available
if csm_generator is not None:
# Convert to audio using CSM
bot_audio = generate_audio_response(bot_response, context['segments'])
session['is_ai_speaking'] = True
session['should_interrupt_ai'] = False
# Convert audio to base64 for sending over websocket
audio_bytes = io.BytesIO()
torchaudio.save(audio_bytes, bot_audio.unsqueeze(0).cpu(), csm_generator.sample_rate, format="wav")
audio_bytes.seek(0)
audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8')
# Add bot response to conversation history
bot_segment = Segment(
text=bot_response,
speaker=1, # Bot is speaker 1
audio=bot_audio
)
context['segments'].append(bot_segment)
# Send audio response to client
emit('audio_response', {
'audio': audio_b64,
'text': bot_response
}, room=session_id)
else:
# Send text-only response if audio generation isn't available
emit('text_response', {'text': bot_response}, room=session_id)
# Add text-only bot response to conversation history
bot_segment = Segment(
text=bot_response,
speaker=1, # Bot is speaker 1
audio=torch.zeros(1) # Placeholder empty audio
)
context['segments'].append(bot_segment)
# Begin streaming audio response
threading.Thread(
target=stream_ai_response,
args=(ai_response, session_id),
daemon=True
).start()
except Exception as e:
print(f"Error processing speech: {e}")
emit('error', {'message': f'Error processing speech: {str(e)}'}, room=session_id)
print(f"Error processing utterance: {e}")
socketio.emit('error', {'message': f'Error: {str(e)}'}, room=session_id)
finally:
# Cleanup temp file
if os.path.exists(temp_audio_path):
os.remove(temp_audio_path)
# Clear processing flag
if session_id in user_sessions:
session['is_processing'] = False
def transcribe_with_whisper(audio_path):
"""Transcribe audio using Faster-Whisper"""
@@ -250,14 +490,13 @@ def transcribe_with_whisper(audio_path):
# Collect all text from segments
user_text = ""
for segment in segments:
segment_text = segment.text.strip()
print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment_text}")
user_text += segment_text + " "
user_text += segment.text.strip() + " "
return user_text.strip()
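def _transcribe_sketch(audio_path):
    """Sketch, not part of the original commit: the bare Faster-Whisper call that
    transcribe_with_whisper() is built around, assuming the global whisper_model
    loaded in load_models(). transcribe() returns a lazy segment generator plus info."""
    segments, _info = whisper_model.transcribe(audio_path)
    return " ".join(segment.text.strip() for segment in segments)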
def transcribe_with_google(audio_path):
def transcribe_fallback(audio_path):
"""Fallback transcription using Google's speech recognition"""
try:
import speech_recognition as sr
recognizer = sr.Recognizer()
@@ -269,28 +508,40 @@ def transcribe_with_google(audio_path):
except sr.UnknownValueError:
return ""
except sr.RequestError:
# If Google API fails, try a basic energy-based VAD approach
# This is a very basic fallback and won't give good results
return "[Speech detected but transcription failed]"
return "[Speech recognition service unavailable]"
except ImportError:
return "[Speech recognition not available]"
def generate_ai_response(user_text, session_id):
"""Generate text response using available LLM"""
if session_id not in user_sessions:
return "I'm sorry, your session has expired."
session = user_sessions[session_id]
def generate_llm_response(user_text, conversation_segments):
"""Generate text response using available model"""
if llm_model is not None and llm_tokenizer is not None:
# Format conversation history for the LLM
conversation_history = ""
for segment in conversation_segments[-5:]: # Use last 5 utterances for context
speaker_name = "User" if segment.speaker == 0 else "Assistant"
conversation_history += f"{speaker_name}: {segment.text}\n"
prompt = "You are a helpful, friendly voice assistant. Keep your responses brief and conversational.\n\n"
# Add the current user query
conversation_history += f"User: {user_text}\nAssistant:"
# Add recent conversation history (last 6 turns maximum)
for entry in session['conversation_history'][-6:]:
if entry['role'] == 'user':
prompt += f"User: {entry['text']}\n"
else:
prompt += f"Assistant: {entry['text']}\n"
# Add current query if not already in history
if not session['conversation_history'] or session['conversation_history'][-1]['role'] != 'user':
prompt += f"User: {user_text}\n"
prompt += "Assistant: "
try:
# Generate response
inputs = llm_tokenizer(conversation_history, return_tensors="pt").to(device)
inputs = llm_tokenizer(prompt, return_tensors="pt").to(device)
output = llm_model.generate(
inputs.input_ids,
max_new_tokens=150,
max_new_tokens=100, # Keep responses shorter for voice
temperature=0.7,
top_p=0.9,
do_sample=True
@@ -298,40 +549,48 @@ def generate_llm_response(user_text, conversation_segments):
response = llm_tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
return response.strip()
except Exception as e:
print(f"Error generating response with LLM: {e}")
print(f"Error generating LLM response: {e}")
return fallback_response(user_text)
else:
return fallback_response(user_text)
def fallback_response(user_text):
"""Generate a simple fallback response when LLM is not available"""
# Simple rule-based responses
"""Generate simple fallback responses when LLM is unavailable"""
user_text_lower = user_text.lower()
if "hello" in user_text_lower or "hi" in user_text_lower:
return "Hello! I'm a simple fallback assistant. The main language model couldn't be loaded, so I have limited capabilities."
return "Hello! How can I help you today?"
elif "how are you" in user_text_lower:
return "I'm functioning within my limited capabilities. How can I assist you today?"
return "I'm doing well, thanks for asking! How about you?"
elif "thank" in user_text_lower:
return "You're welcome! Let me know if there's anything else I can help with."
return "You're welcome! Happy to help."
elif "bye" in user_text_lower or "goodbye" in user_text_lower:
return "Goodbye! Have a great day!"
elif any(q in user_text_lower for q in ["what", "who", "where", "when", "why", "how"]):
return "I'm running in fallback mode and can't answer complex questions. Please try again when the main language model is available."
return "That's an interesting question. I wish I could provide a better answer in my current fallback mode."
else:
return "I understand you said something about that. Unfortunately, I'm running in fallback mode with limited capabilities. Please try again later when the main model is available."
return "I see. Tell me more about that."
def stream_ai_response(text, session_id):
"""Generate and stream audio response in real-time chunks"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
def generate_audio_response(text, conversation_segments):
"""Generate audio response using CSM"""
try:
# Use the last few conversation segments as context
context_segments = conversation_segments[-4:] if len(conversation_segments) > 4 else conversation_segments
# Signal start of AI speech
socketio.emit('ai_speech_start', room=session_id)  # plain emit() needs a request context; this runs in a background thread
# Use the last few conversation segments as context (up to 4)
context_segments = session['segments'][-4:] if len(session['segments']) > 4 else session['segments']
# Generate audio for bot response
audio = csm_generator.generate(
@@ -343,11 +602,77 @@ def generate_audio_response(text, conversation_segments):
topk=50
)
return audio
# Create and store bot segment
bot_segment = Segment(
text=text,
speaker=1,
audio=audio
)
if session_id in user_sessions:
session['segments'].append(bot_segment)
# Stream audio in small chunks for more responsive playback
chunk_size = AUDIO_CHUNK_SIZE # Size defined in constants
for i in range(0, len(audio), chunk_size):
# Check if we should stop (user interrupted)
if session_id not in user_sessions or session['should_interrupt_ai']:
print("AI speech interrupted")
break
# Get next chunk
chunk = audio[i:i+chunk_size]
# Convert audio chunk to base64 for streaming
audio_bytes = io.BytesIO()
torchaudio.save(audio_bytes, chunk.unsqueeze(0).cpu(), csm_generator.sample_rate, format="wav")
audio_bytes.seek(0)
audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8')
# Send chunk to client
socketio.emit('ai_speech_chunk', {
'audio': audio_b64,
'is_last': i + chunk_size >= len(audio)
}, room=session_id)
# Small sleep for more natural pacing
time.sleep(0.06) # Slight delay for smoother playback
# Signal end of AI speech
if session_id in user_sessions:
session['is_ai_speaking'] = False
session['is_turn_active'] = False # End conversation turn
socketio.emit('ai_speech_end', room=session_id)
except Exception as e:
print(f"Error generating audio: {e}")
# Return silence as fallback
return torch.zeros(csm_generator.sample_rate * 3) # 3 seconds of silence
print(f"Error streaming AI response: {e}")
if session_id in user_sessions:
session['is_ai_speaking'] = False
session['is_turn_active'] = False
socketio.emit('error', {'message': f'Error generating audio: {str(e)}'}, room=session_id)
socketio.emit('ai_speech_end', room=session_id)
@socketio.on('interrupt_ai')
def handle_interrupt():
"""Handle explicit AI interruption request from client"""
session_id = request.sid
if session_id in user_sessions:
user_sessions[session_id]['should_interrupt_ai'] = True
emit('ai_interrupted', room=session_id)
@socketio.on('get_config')
def handle_get_config():
"""Send configuration to client"""
session_id = request.sid
if session_id in user_sessions:
emit('config', {
'client_sample_rate': CLIENT_SAMPLE_RATE,
'server_sample_rate': getattr(csm_generator, 'sample_rate', 24000) if csm_generator else 24000,
'whisper_available': whisper_model is not None,
'csm_available': csm_generator is not None,
'ice_servers': ICE_SERVERS
})
if __name__ == '__main__':
# Ensure the existing index.html file is in the correct location
@@ -357,9 +682,8 @@ if __name__ == '__main__':
if os.path.exists('index.html') and not os.path.exists('templates/index.html'):
os.rename('index.html', 'templates/index.html')
# Load models asynchronously before starting the server
print("Starting CPU-only model loading...")
# In a production environment, you could load models in a separate thread
# Load models before starting the server
print("Starting model loading...")
load_models()
# Start the server

Backend/voice-chat.js Normal file

@@ -0,0 +1,560 @@
document.addEventListener('DOMContentLoaded', () => {
// DOM Elements
const startButton = document.getElementById('start-button');
const interruptButton = document.getElementById('interrupt-button');
const conversationDiv = document.getElementById('conversation');
const connectionDot = document.getElementById('connection-dot');
const connectionStatus = document.getElementById('connection-status');
const whisperStatus = document.getElementById('whisper-status');
const csmStatus = document.getElementById('csm-status');
const llmStatus = document.getElementById('llm-status');
const webrtcStatus = document.getElementById('webrtc-status');
const micAnimation = document.getElementById('mic-animation');
const loadingDiv = document.getElementById('loading');
const loadingText = document.getElementById('loading-text');
// State variables
let socket;
let isConnected = false;
let isListening = false;
let isAiSpeaking = false;
let audioContext;
let mediaStream;
let audioRecorder;
let audioProcessor;
const audioChunks = [];
// WebRTC variables
let peerConnection;
let dataChannel;
let hasActiveConnection = false;
// Audio playback
let audioQueue = [];
let isPlaying = false;
// Configuration variables
let serverSampleRate = 24000;
let clientSampleRate = 44100;
let iceServers = [];
// Initialize the application
initApp();
// Main initialization function
function initApp() {
updateConnectionStatus('connecting');
setupSocketConnection();
setupEventListeners();
}
// Set up Socket.IO connection with server
function setupSocketConnection() {
socket = io();
socket.on('connect', () => {
console.log('Connected to server');
updateConnectionStatus('connected');
isConnected = true;
});
socket.on('disconnect', () => {
console.log('Disconnected from server');
updateConnectionStatus('disconnected');
isConnected = false;
cleanupAudio();
cleanupWebRTC();
});
socket.on('session_ready', (data) => {
console.log('Session ready:', data);
updateModelStatus(data);
clientSampleRate = data.client_sample_rate;
serverSampleRate = data.server_sample_rate;
iceServers = data.ice_servers;
// Initialize WebRTC if models are available
if (data.whisper_available && data.llm_available) {
initializeWebRTC();
}
});
socket.on('ready_for_speech', (data) => {
console.log('Ready for speech:', data);
startButton.disabled = false;
addInfoMessage('Ready for conversation. Click "Start Listening" to begin.');
});
socket.on('webrtc_signal', (data) => {
handleWebRTCSignal(data);
});
socket.on('transcription', (data) => {
console.log('Transcription:', data);
addUserMessage(data.text);
loadingDiv.style.display = 'none';
});
socket.on('ai_response_text', (data) => {
console.log('AI response text:', data);
addAIMessage(data.text);
loadingDiv.style.display = 'none';
});
socket.on('ai_speech_start', () => {
console.log('AI started speaking');
isAiSpeaking = true;
interruptButton.disabled = false;
});
socket.on('ai_speech_chunk', (data) => {
console.log('Received AI speech chunk');
playAudioChunk(data.audio, data.is_last);
});
socket.on('ai_speech_end', () => {
console.log('AI stopped speaking');
isAiSpeaking = false;
interruptButton.disabled = true;
});
socket.on('user_speech_start', () => {
console.log('User speech detected');
showSpeakingIndicator(true);
});
socket.on('processing_speech', () => {
console.log('Processing speech');
showSpeakingIndicator(false);
showLoadingIndicator('Processing your speech...');
});
socket.on('no_speech_detected', () => {
console.log('No speech detected');
hideLoadingIndicator();
addInfoMessage('No speech detected. Please try again.');
});
socket.on('ai_interrupted', () => {
console.log('AI interrupted');
clearAudioQueue();
isAiSpeaking = false;
interruptButton.disabled = true;
});
socket.on('ai_interrupted_by_user', () => {
console.log('AI interrupted by user');
clearAudioQueue();
isAiSpeaking = false;
interruptButton.disabled = true;
addInfoMessage('AI interrupted by your speech');
});
socket.on('error', (data) => {
console.error('Server error:', data);
hideLoadingIndicator();
addInfoMessage(`Error: ${data.message}`);
});
}
// Set up UI event listeners
function setupEventListeners() {
startButton.addEventListener('click', toggleListening);
interruptButton.addEventListener('click', interruptAI);
}
// Update UI connection status
function updateConnectionStatus(status) {
connectionDot.className = 'status-dot ' + status;
switch (status) {
case 'connected':
connectionStatus.textContent = 'Connected';
break;
case 'connecting':
connectionStatus.textContent = 'Connecting...';
break;
case 'disconnected':
connectionStatus.textContent = 'Disconnected';
startButton.disabled = true;
interruptButton.disabled = true;
break;
}
}
// Update model status indicators
function updateModelStatus(data) {
whisperStatus.textContent = data.whisper_available ? 'Available' : 'Not Available';
whisperStatus.style.color = data.whisper_available ? 'green' : 'red';
csmStatus.textContent = data.csm_available ? 'Available' : 'Not Available';
csmStatus.style.color = data.csm_available ? 'green' : 'red';
llmStatus.textContent = data.llm_available ? 'Available' : 'Not Available';
llmStatus.style.color = data.llm_available ? 'green' : 'red';
}
// Initialize WebRTC connection
function initializeWebRTC() {
if (!isConnected) return;
const configuration = {
iceServers: iceServers
};
peerConnection = new RTCPeerConnection(configuration);
// Create data channel for WebRTC communication
dataChannel = peerConnection.createDataChannel('audioData', {
ordered: true
});
dataChannel.onopen = () => {
console.log('WebRTC data channel open');
hasActiveConnection = true;
webrtcStatus.textContent = 'Connected';
webrtcStatus.style.color = 'green';
socket.emit('webrtc_connected', { status: 'connected' });
};
dataChannel.onclose = () => {
console.log('WebRTC data channel closed');
hasActiveConnection = false;
webrtcStatus.textContent = 'Disconnected';
webrtcStatus.style.color = 'red';
};
// Handle ICE candidates
peerConnection.onicecandidate = (event) => {
if (event.candidate) {
socket.emit('webrtc_signal', {
type: 'ice_candidate',
candidate: event.candidate
});
}
};
// Log ICE connection state changes
peerConnection.oniceconnectionstatechange = () => {
console.log('ICE connection state:', peerConnection.iceConnectionState);
};
// Create offer
peerConnection.createOffer()
.then(offer => peerConnection.setLocalDescription(offer))
.then(() => {
socket.emit('webrtc_signal', {
type: 'offer',
sdp: peerConnection.localDescription
});
})
.catch(error => {
console.error('Error creating WebRTC offer:', error);
webrtcStatus.textContent = 'Failed to Connect';
webrtcStatus.style.color = 'red';
});
}
// Handle WebRTC signals from the server
function handleWebRTCSignal(data) {
if (!peerConnection) return;
if (data.type === 'answer') {
peerConnection.setRemoteDescription(new RTCSessionDescription(data.sdp))
.catch(error => console.error('Error setting remote description:', error));
}
else if (data.type === 'ice_candidate') {
peerConnection.addIceCandidate(new RTCIceCandidate(data.candidate))
.catch(error => console.error('Error adding ICE candidate:', error));
}
}
// Clean up WebRTC connection
function cleanupWebRTC() {
if (dataChannel) {
dataChannel.close();
}
if (peerConnection) {
peerConnection.close();
}
dataChannel = null;
peerConnection = null;
hasActiveConnection = false;
webrtcStatus.textContent = 'Not Connected';
webrtcStatus.style.color = 'red';
}
// Toggle audio listening
function toggleListening() {
if (isListening) {
stopListening();
} else {
startListening();
}
}
// Start listening for audio
async function startListening() {
if (!isConnected) return;
try {
await initAudio();
isListening = true;
startButton.innerHTML = `
<svg class="button-icon" viewBox="0 0 24 24" fill="white">
<path d="M6 6h12v12H6z"></path>
</svg>
Stop Listening
`;
} catch (error) {
console.error('Error starting audio:', error);
addInfoMessage('Error accessing microphone. Please check permissions.');
}
}
// Stop listening for audio
function stopListening() {
cleanupAudio();
isListening = false;
startButton.innerHTML = `
<svg class="button-icon" viewBox="0 0 24 24" fill="white">
<path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm5.91-3c-.49 0-.9.36-.98.85C16.52 14.2 14.47 16 12 16s-4.52-1.8-4.93-4.15c-.08-.49-.49-.85-.98-.85-.61 0-1.09.54-1 1.14.49 3 2.89 5.35 5.91 5.78V20c0 .55.45 1 1 1s1-.45 1-1v-2.08c3.02-.43 5.42-2.78 5.91-5.78.1-.6-.39-1.14-1-1.14z"></path>
</svg>
Start Listening
`;
showSpeakingIndicator(false);
}
// Initialize audio capture
async function initAudio() {
// Request microphone access
mediaStream = await navigator.mediaDevices.getUserMedia({
audio: {
sampleRate: clientSampleRate,
channelCount: 1,
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true
}
});
// Initialize AudioContext
audioContext = new (window.AudioContext || window.webkitAudioContext)({
sampleRate: clientSampleRate
});
// Create audio source from stream
const source = audioContext.createMediaStreamSource(mediaStream);
// Create ScriptProcessor for audio processing
const bufferSize = 4096;
audioProcessor = audioContext.createScriptProcessor(bufferSize, 1, 1);
// Process audio data
audioProcessor.onaudioprocess = (event) => {
if (!isListening || isAiSpeaking) return;
const input = event.inputBuffer.getChannelData(0);
const audioData = convertFloat32ToInt16(input);
sendAudioChunk(audioData);
};
// Connect the nodes
source.connect(audioProcessor);
audioProcessor.connect(audioContext.destination);
}
// Clean up audio resources
function cleanupAudio() {
if (audioProcessor) {
audioProcessor.disconnect();
audioProcessor = null;
}
if (mediaStream) {
mediaStream.getTracks().forEach(track => track.stop());
mediaStream = null;
}
if (audioContext && audioContext.state !== 'closed') {
audioContext.close().catch(error => console.error('Error closing AudioContext:', error));
}
audioChunks.length = 0;
}
// Convert Float32Array to Int16Array for sending to server
function convertFloat32ToInt16(float32Array) {
const int16Array = new Int16Array(float32Array.length);
for (let i = 0; i < float32Array.length; i++) {
// Convert float [-1.0, 1.0] to int16 [-32768, 32767]
int16Array[i] = Math.max(-32768, Math.min(32767, Math.floor(float32Array[i] * 32768)));
}
return int16Array;
}
// Send audio chunk to server
function sendAudioChunk(audioData) {
if (!isConnected || !isListening) return;
// Convert to base64 for transmission
const base64Audio = arrayBufferToBase64(audioData.buffer);
// Send via Socket.IO (could use WebRTC's DataChannel for lower latency in production)
socket.emit('audio_stream', { audio: base64Audio });
}
// Play audio chunk received from server
function playAudioChunk(base64Audio, isLast) {
const audioData = base64ToArrayBuffer(base64Audio);
// Add to queue
audioQueue.push({
data: audioData,
isLast: isLast
});
// Start playing if not already playing
if (!isPlaying) {
playNextAudioChunk();
}
}
// Play the next audio chunk in the queue
function playNextAudioChunk() {
if (audioQueue.length === 0) {
isPlaying = false;
return;
}
isPlaying = true;
const chunk = audioQueue.shift();
try {
// Create audio context if needed
if (!audioContext || audioContext.state === 'closed') {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
}
// Resume audio context if suspended
if (audioContext.state === 'suspended') {
audioContext.resume();
}
// Decode the WAV data
audioContext.decodeAudioData(chunk.data, (buffer) => {
const source = audioContext.createBufferSource();
source.buffer = buffer;
source.connect(audioContext.destination);
// When playback ends, play the next chunk
source.onended = () => {
playNextAudioChunk();
};
source.start(0);
// If it's the last chunk, update UI
if (chunk.isLast) {
setTimeout(() => {
isAiSpeaking = false;
interruptButton.disabled = true;
}, buffer.duration * 1000);
}
}, (error) => {
console.error('Error decoding audio data:', error);
playNextAudioChunk(); // Skip this chunk and try the next
});
} catch (error) {
console.error('Error playing audio chunk:', error);
playNextAudioChunk(); // Try the next chunk
}
}
// Clear the audio queue (used when interrupting)
function clearAudioQueue() {
audioQueue.length = 0;
isPlaying = false;
// Stop any currently playing audio
if (audioContext) {
audioContext.suspend();
}
}
// Send interrupt signal to server
function interruptAI() {
if (!isConnected || !isAiSpeaking) return;
socket.emit('interrupt_ai');
clearAudioQueue();
}
// Convert ArrayBuffer to Base64 string
function arrayBufferToBase64(buffer) {
const binary = new Uint8Array(buffer);
let base64 = '';
const len = binary.byteLength;
for (let i = 0; i < len; i++) {
base64 += String.fromCharCode(binary[i]);
}
return window.btoa(base64);
}
// Convert Base64 string to ArrayBuffer
function base64ToArrayBuffer(base64) {
const binaryString = window.atob(base64);
const len = binaryString.length;
const bytes = new Uint8Array(len);
for (let i = 0; i < len; i++) {
bytes[i] = binaryString.charCodeAt(i);
}
return bytes.buffer;
}
// Add user message to conversation
function addUserMessage(text) {
const messageDiv = document.createElement('div');
messageDiv.className = 'message user-message';
messageDiv.textContent = text;
conversationDiv.appendChild(messageDiv);
conversationDiv.scrollTop = conversationDiv.scrollHeight;
}
// Add AI message to conversation
function addAIMessage(text) {
const messageDiv = document.createElement('div');
messageDiv.className = 'message ai-message';
messageDiv.textContent = text;
conversationDiv.appendChild(messageDiv);
conversationDiv.scrollTop = conversationDiv.scrollHeight;
}
// Add info message to conversation
function addInfoMessage(text) {
const messageDiv = document.createElement('div');
messageDiv.className = 'info-message';
messageDiv.textContent = text;
conversationDiv.appendChild(messageDiv);
conversationDiv.scrollTop = conversationDiv.scrollHeight;
}
// Show/hide speaking indicator
function showSpeakingIndicator(show) {
micAnimation.style.display = show ? 'flex' : 'none';
}
// Show loading indicator
function showLoadingIndicator(text) {
loadingText.textContent = text || 'Processing...';
loadingDiv.style.display = 'block';
}
// Hide loading indicator
function hideLoadingIndicator() {
loadingDiv.style.display = 'none';
}
});

(Two new binary image files were added in this commit, 17 KiB and 87 KiB; their contents are not shown.)

@@ -13,8 +13,8 @@ const geistMono = Geist_Mono({
});
export const metadata: Metadata = {
title: "Create Next App",
description: "Generated by create next app",
title: "Fauxcall",
description: "Fauxcall is a fake call app that helps you get out of awkward situations.",
};
export default function RootLayout({

React/src/app/manifest.ts Normal file

@@ -0,0 +1,25 @@
import type { MetadataRoute } from 'next'
export default function manifest(): MetadataRoute.Manifest {
return {
name: 'Fauxcall',
short_name: 'Fauxcall',
description: 'A fake call app that helps you get out of awkward and dangerous situations.',
start_url: '/',
display: 'standalone',
background_color: '#ffffff',
theme_color: '#000000',
icons: [
{
src: '/icon-192x192.png',
sizes: '192x192',
type: 'image/png',
},
{
src: '/icon-512x512.png',
sizes: '512x512',
type: 'image/png',
},
],
}
}


@@ -4,7 +4,7 @@ import { useRouter } from "next/navigation";
import './styles.css';
export default function Home() {
const [contacts, setContacts] = useState<string[]>([]);
const [contacts, setContacts] = useState<string[]>([""]);
const [codeword, setCodeword] = useState("");
const [session, setSession] = useState<any>(null);
const [loading, setLoading] = useState(true);
@@ -26,6 +26,16 @@ export default function Home() {
});
}, []);
const handleInputChange = (index: number, value: string) => {
const updatedContacts = [...contacts];
updatedContacts[index] = value; // Update the specific input value
setContacts(updatedContacts);
};
const addContactInput = () => {
setContacts([...contacts, ""]); // Add a new empty input
};
function saveToDB() {
alert("Saving contacts...");
const contactInputs = document.querySelectorAll(
@@ -144,27 +154,20 @@ export default function Home() {
className="space-y-5 flex flex-col gap-[32px] row-start-2 items-center sm:items-start"
onSubmit={(e) => e.preventDefault()}
>
{contacts.map((contact, index) => (
<input
key={index}
type="text"
value={contacts}
onChange={(e) => setContacts(e.target.value.split(","))}
placeholder="Write down an emergency contact"
value={contact}
onChange={(e) => handleInputChange(index, e.target.value)}
placeholder={`Contact ${index + 1}`}
className="border border-gray-300 rounded-md p-2"
/>
))}
<button
onClick={() => {
alert("Adding contact...");
let elem = document.getElementsByClassName(
"text-input"
)[0] as HTMLElement;
console.log("Element:", elem);
let d = elem.cloneNode(true) as HTMLElement;
document.getElementById("Contacts")?.appendChild(d);
}}
className="bg-emerald-500 text-fuchsia-300"
onClick={addContactInput}
className="bg-emerald-500 text-white
font-semibold text-lg rounded-md p-2"
type="button"
>
Add