idler-wheel
2025-03-30 09:27:10 -04:00
8 changed files with 1356 additions and 632 deletions


@@ -3,454 +3,266 @@
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0"> <meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Voice Assistant - CSM & Whisper</title> <title>AI Voice Chat</title>
<script src="https://cdn.socket.io/4.6.0/socket.io.min.js"></script>
<style> <style>
body { body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
margin: 0;
padding: 0;
background-color: #f5f5f5;
color: #333;
}
.container {
max-width: 800px; max-width: 800px;
margin: 0 auto; margin: 0 auto;
padding: 20px; padding: 20px;
background-color: #f5f7fa;
color: #333;
} }
header {
h1 {
color: #2c3e50;
text-align: center; text-align: center;
margin-bottom: 30px; margin-bottom: 30px;
} }
h1 {
#conversation { color: #2c3e50;
height: 400px;
border: 1px solid #ddd;
border-radius: 10px;
padding: 20px;
margin-bottom: 20px;
overflow-y: auto;
background-color: white;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
} }
.status-bar {
.message-container { background-color: #2c3e50;
display: flex;
flex-direction: column;
margin-bottom: 15px;
}
.user-message-container {
align-items: flex-end;
}
.bot-message-container {
align-items: flex-start;
}
.message {
max-width: 80%;
padding: 12px;
border-radius: 18px;
position: relative;
word-break: break-word;
}
.user-message {
background-color: #dcf8c6;
color: #000;
border-bottom-right-radius: 4px;
}
.bot-message {
background-color: #f1f0f0;
color: #000;
border-bottom-left-radius: 4px;
}
.message-label {
font-size: 0.8em;
margin-bottom: 4px;
color: #657786;
}
#controls {
display: flex;
gap: 10px;
justify-content: center;
margin-bottom: 15px;
}
button {
padding: 12px 24px;
font-size: 16px;
cursor: pointer;
border-radius: 50px;
border: none;
outline: none;
transition: all 0.3s ease;
}
#recordButton {
background-color: #4CAF50;
color: white; color: white;
width: 200px; padding: 10px;
box-shadow: 0 4px 8px rgba(76, 175, 80, 0.3); border-radius: 5px;
margin-bottom: 20px;
display: flex;
justify-content: space-between;
align-items: center;
} }
.status-indicator {
#recordButton:hover { display: flex;
background-color: #45a049; align-items: center;
transform: translateY(-2px);
} }
.status-dot {
height: 10px;
width: 10px;
border-radius: 50%;
margin-right: 8px;
}
.status-dot.connected { background-color: #2ecc71; }
.status-dot.connecting { background-color: #f39c12; }
.status-dot.disconnected { background-color: #e74c3c; }
#recordButton.recording { .conversation {
background-color: #f44336; background-color: white;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
height: 400px;
padding: 20px;
overflow-y: auto;
margin-bottom: 20px;
}
.message {
margin-bottom: 15px;
padding: 10px 15px;
border-radius: 18px;
max-width: 80%;
word-wrap: break-word;
}
.user-message {
background-color: #e3f2fd;
margin-left: auto;
border-bottom-right-radius: 5px;
}
.ai-message {
background-color: #f0f0f0;
margin-right: auto;
border-bottom-left-radius: 5px;
}
.controls {
display: flex;
justify-content: center;
gap: 15px;
margin-bottom: 20px;
}
button {
background-color: #2c3e50;
color: white;
border: none;
padding: 12px 24px;
border-radius: 25px;
cursor: pointer;
font-size: 16px;
transition: all 0.2s;
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
}
button:hover {
background-color: #1a252f;
}
button:disabled {
background-color: #95a5a6;
cursor: not-allowed;
}
.button-icon {
width: 20px;
height: 20px;
}
.mic-animation {
width: 60px;
height: 60px;
border-radius: 50%;
background-color: rgba(231, 76, 60, 0.2);
display: flex;
align-items: center;
justify-content: center;
animation: pulse 1.5s infinite; animation: pulse 1.5s infinite;
box-shadow: 0 4px 8px rgba(244, 67, 54, 0.3); margin: 0 auto 15px;
} }
@keyframes pulse { @keyframes pulse {
0% { 0% {
transform: scale(1); transform: scale(0.95);
box-shadow: 0 0 0 0 rgba(231, 76, 60, 0.5);
} }
50% { 70% {
transform: scale(1.05); transform: scale(1);
box-shadow: 0 0 0 15px rgba(231, 76, 60, 0);
} }
100% { 100% {
transform: scale(1); transform: scale(0.95);
box-shadow: 0 0 0 0 rgba(231, 76, 60, 0);
} }
} }
.settings {
#status { margin-top: 20px;
padding: 15px;
background-color: white;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.settings h3 {
margin-top: 0;
color: #2c3e50;
border-bottom: 1px solid #eee;
padding-bottom: 10px;
}
.settings-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 15px;
}
.setting-item {
padding: 10px;
background-color: #f9f9f9;
border-radius: 5px;
}
.audio-visualizer {
height: 50px;
width: 100%;
background-color: #f0f0f0;
margin-top: 10px;
border-radius: 5px;
overflow: hidden;
}
.info-message {
text-align: center; text-align: center;
margin-top: 15px; color: #7f8c8d;
margin: 10px 0;
font-style: italic; font-style: italic;
color: #657786;
} }
.loading {
.audio-wave { text-align: center;
display: flex; margin: 20px 0;
justify-content: center;
align-items: center;
height: 40px;
gap: 3px;
} }
.spinner {
.audio-wave span { border: 4px solid rgba(0, 0, 0, 0.1);
display: block; border-radius: 50%;
width: 3px; border-top: 4px solid #2c3e50;
height: 100%; width: 30px;
background-color: #4CAF50;
animation: wave 1.5s infinite ease-in-out;
border-radius: 6px;
}
.audio-wave span:nth-child(2) {
animation-delay: 0.2s;
}
.audio-wave span:nth-child(3) {
animation-delay: 0.4s;
}
.audio-wave span:nth-child(4) {
animation-delay: 0.6s;
}
.audio-wave span:nth-child(5) {
animation-delay: 0.8s;
}
@keyframes wave {
0%, 100% {
height: 8px;
}
50% {
height: 30px; height: 30px;
animation: spin 1s linear infinite;
margin: 0 auto 10px;
} }
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
} }
footer {
.hidden { text-align: center;
display: none; margin-top: 30px;
} padding: 20px;
color: #7f8c8d;
.transcription-info { font-size: 14px;
font-size: 0.8em;
color: #888;
margin-top: 4px;
text-align: right;
} }
</style> </style>
</head> </head>
<body> <body>
<h1>Voice Assistant with CSM & Whisper</h1> <div class="container">
<div id="conversation"></div> <header>
<h1>AI Voice Assistant</h1>
</header>
<div id="controls"> <div class="status-bar">
<button id="recordButton">Hold to Speak</button> <div class="status-indicator">
<div class="status-dot disconnected" id="connection-dot"></div>
<span id="connection-status">Disconnected</span>
</div>
<div id="runtime-info">
<span id="models-status"></span>
</div>
</div> </div>
<div id="audioWave" class="audio-wave hidden"> <div class="conversation" id="conversation">
<span></span> <div class="info-message">Your conversation will appear here.</div>
<span></span>
<span></span>
<span></span>
<span></span>
</div> </div>
<div id="status">Connecting to server...</div> <div id="mic-animation" class="mic-animation" style="display: none;">
<svg width="24" height="24" viewBox="0 0 24 24" fill="white">
<path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm5.91-3c-.49 0-.9.36-.98.85C16.52 14.2 14.47 16 12 16s-4.52-1.8-4.93-4.15c-.08-.49-.49-.85-.98-.85-.61 0-1.09.54-1 1.14.49 3 2.89 5.35 5.91 5.78V20c0 .55.45 1 1 1s1-.45 1-1v-2.08c3.02-.43 5.42-2.78 5.91-5.78.1-.6-.39-1.14-1-1.14z"></path>
</svg>
</div>
<script> <div class="controls">
const socket = io(); <button id="start-button" disabled>
const recordButton = document.getElementById('recordButton'); <svg class="button-icon" viewBox="0 0 24 24" fill="white">
const conversation = document.getElementById('conversation'); <path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm5.91-3c-.49 0-.9.36-.98.85C16.52 14.2 14.47 16 12 16s-4.52-1.8-4.93-4.15c-.08-.49-.49-.85-.98-.85-.61 0-1.09.54-1 1.14.49 3 2.89 5.35 5.91 5.78V20c0 .55.45 1 1 1s1-.45 1-1v-2.08c3.02-.43 5.42-2.78 5.91-5.78.1-.6-.39-1.14-1-1.14z"></path>
const status = document.getElementById('status'); </svg>
const audioWave = document.getElementById('audioWave'); Start Listening
</button>
<button id="interrupt-button" disabled>
<svg class="button-icon" viewBox="0 0 24 24" fill="white">
<path d="M12 2C6.48 2 2 6.48 2 12s4.48 10 10 10 10-4.48 10-10S17.52 2 12 2zm1 15h-2v-2h2v2zm0-4h-2V7h2v6z"></path>
</svg>
Interrupt
</button>
</div>
let mediaRecorder; <div id="loading" class="loading" style="display: none;">
let audioChunks = []; <div class="spinner"></div>
let isRecording = false; <p id="loading-text">Processing your speech...</p>
let audioSendInterval; </div>
let sessionActive = false;
// Initialize audio context <div class="settings">
const audioContext = new (window.AudioContext || window.webkitAudioContext)(); <h3>Status</h3>
<div class="settings-grid">
<div class="setting-item">
<div><strong>Whisper Model:</strong> <span id="whisper-status">Loading...</span></div>
</div>
<div class="setting-item">
<div><strong>CSM Audio Model:</strong> <span id="csm-status">Loading...</span></div>
</div>
<div class="setting-item">
<div><strong>LLM Model:</strong> <span id="llm-status">Loading...</span></div>
</div>
<div class="setting-item">
<div><strong>WebRTC:</strong> <span id="webrtc-status">Not Connected</span></div>
</div>
</div>
</div>
</div>
// Connect to server <footer>
socket.on('connect', () => { <p>AI Voice Assistant | Using Fast Whisper, Llama 3.2, and CSM Audio Models</p>
status.textContent = 'Connected to server'; </footer>
sessionActive = true;
});
socket.on('disconnect', () => { <script src="https://cdnjs.cloudflare.com/ajax/libs/socket.io/4.6.1/socket.io.min.js"></script>
status.textContent = 'Disconnected from server'; <script src="./voice-chat.js"></script>
sessionActive = false;
});
socket.on('ready', (data) => {
status.textContent = data.message;
setupAudioRecording();
});
socket.on('transcription', (data) => {
addMessage('user', data.text);
status.textContent = 'Assistant is thinking...';
});
socket.on('audio_response', (data) => {
// Play audio
status.textContent = 'Playing response...';
const audio = new Audio('data:audio/wav;base64,' + data.audio);
audio.onended = () => {
status.textContent = 'Ready to record';
};
audio.onerror = () => {
status.textContent = 'Error playing audio';
console.error('Error playing audio response');
};
audio.play().catch(err => {
status.textContent = 'Error playing audio: ' + err.message;
console.error('Error playing audio:', err);
});
// Display text
addMessage('bot', data.text);
});
socket.on('error', (data) => {
status.textContent = 'Error: ' + data.message;
console.error('Server error:', data.message);
});
function setupAudioRecording() {
// Check if browser supports required APIs
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
status.textContent = 'Your browser does not support audio recording';
return;
}
// Get user media
navigator.mediaDevices.getUserMedia({ audio: true })
.then(stream => {
// Setup recording with better audio quality
const options = {
mimeType: 'audio/webm',
audioBitsPerSecond: 128000
};
try {
mediaRecorder = new MediaRecorder(stream, options);
} catch (e) {
// Fallback if the specified options aren't supported
mediaRecorder = new MediaRecorder(stream);
}
mediaRecorder.ondataavailable = event => {
if (event.data.size > 0) {
audioChunks.push(event.data);
}
};
mediaRecorder.onstop = () => {
processRecording();
};
// Create audio analyzer for visualization
const source = audioContext.createMediaStreamSource(stream);
const analyzer = audioContext.createAnalyser();
analyzer.fftSize = 2048;
source.connect(analyzer);
// Setup button handlers with better touch handling
recordButton.addEventListener('mousedown', startRecording);
recordButton.addEventListener('touchstart', (e) => {
e.preventDefault(); // Prevent default touch behavior
startRecording();
});
recordButton.addEventListener('mouseup', stopRecording);
recordButton.addEventListener('touchend', (e) => {
e.preventDefault();
stopRecording();
});
recordButton.addEventListener('mouseleave', stopRecording);
status.textContent = 'Ready to record';
})
.catch(err => {
status.textContent = 'Error accessing microphone: ' + err.message;
console.error('Error accessing microphone:', err);
});
}
function startRecording() {
if (!isRecording && sessionActive) {
audioChunks = [];
mediaRecorder.start(100); // Collect data in 100ms chunks
recordButton.classList.add('recording');
recordButton.textContent = 'Release to Stop';
status.textContent = 'Recording...';
audioWave.classList.remove('hidden');
isRecording = true;
socket.emit('start_speaking');
// Start sending audio chunks periodically
audioSendInterval = setInterval(() => {
if (mediaRecorder.state === 'recording') {
mediaRecorder.requestData(); // Force ondataavailable to fire
}
}, 300); // Send every 300ms
}
}
function stopRecording() {
if (isRecording) {
clearInterval(audioSendInterval);
mediaRecorder.stop();
recordButton.classList.remove('recording');
recordButton.textContent = 'Hold to Speak';
status.textContent = 'Processing speech...';
audioWave.classList.add('hidden');
isRecording = false;
}
}
function processRecording() {
if (audioChunks.length === 0) {
status.textContent = 'No audio recorded';
return;
}
const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
// Convert to ArrayBuffer for processing
const fileReader = new FileReader();
fileReader.onloadend = () => {
try {
const arrayBuffer = fileReader.result;
// Convert to Float32Array - this works better with WebAudio API
const audioData = convertToFloat32(arrayBuffer);
// Convert to base64 for sending
const base64String = arrayBufferToBase64(audioData.buffer);
socket.emit('audio_chunk', { audio: base64String });
// Signal end of speech
socket.emit('stop_speaking');
} catch (e) {
console.error('Error processing audio:', e);
status.textContent = 'Error processing audio';
}
};
fileReader.onerror = () => {
status.textContent = 'Error reading audio data';
};
fileReader.readAsArrayBuffer(audioBlob);
}
function convertToFloat32(arrayBuffer) {
// Get raw audio data as Int16 (common format for audio)
const int16Array = new Int16Array(arrayBuffer);
// Convert to Float32 (normalize between -1 and 1)
const float32Array = new Float32Array(int16Array.length);
for (let i = 0; i < int16Array.length; i++) {
float32Array[i] = int16Array[i] / 32768.0;
}
return float32Array;
}
function addMessage(sender, text) {
const containerDiv = document.createElement('div');
containerDiv.className = sender === 'user' ? 'message-container user-message-container' : 'message-container bot-message-container';
const labelDiv = document.createElement('div');
labelDiv.className = 'message-label';
labelDiv.textContent = sender === 'user' ? 'You' : 'Assistant';
containerDiv.appendChild(labelDiv);
const messageDiv = document.createElement('div');
messageDiv.className = sender === 'user' ? 'message user-message' : 'message bot-message';
messageDiv.textContent = text;
containerDiv.appendChild(messageDiv);
if (sender === 'user') {
const infoDiv = document.createElement('div');
infoDiv.className = 'transcription-info';
infoDiv.textContent = 'Transcribed with Whisper';
containerDiv.appendChild(infoDiv);
}
conversation.appendChild(containerDiv);
conversation.scrollTop = conversation.scrollHeight;
}
function arrayBufferToBase64(buffer) {
let binary = '';
const bytes = new Uint8Array(buffer);
const len = bytes.byteLength;
for (let i = 0; i < len; i++) {
binary += String.fromCharCode(bytes[i]);
}
return window.btoa(binary);
}
// Handle page visibility change to avoid issues with background tabs
document.addEventListener('visibilitychange', () => {
if (document.hidden && isRecording) {
stopRecording();
}
});
// Clean disconnection when page is closed
window.addEventListener('beforeunload', () => {
if (socket && socket.connected) {
socket.disconnect();
}
});
</script>
</body> </body>
</html> </html>
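Both the inline script above and the rewritten client ship microphone audio to the server as base64-encoded 16-bit PCM (the old handler even reinterprets the MediaRecorder webm blob as raw Int16 samples, which the rewrite avoids by capturing raw frames instead). Below is a minimal sketch of the matching server-side decode/encode, assuming 16-bit little-endian mono samples; the helper names are illustrative and not part of the commit.

# Sketch only: decode/encode the base64 int16 PCM payload the client sends.
import base64

import numpy as np


def decode_pcm16(b64_audio: str) -> np.ndarray:
    """Decode base64 16-bit little-endian mono PCM into float32 samples in [-1, 1)."""
    raw = base64.b64decode(b64_audio)
    samples = np.frombuffer(raw, dtype=np.int16)
    return samples.astype(np.float32) / 32768.0


def encode_pcm16(samples: np.ndarray) -> str:
    """Inverse direction: clamp float samples and re-encode as base64 int16 PCM."""
    clipped = np.clip(samples, -1.0, 1.0)
    return base64.b64encode((clipped * 32767.0).astype(np.int16).tobytes()).decode("ascii")

Dividing by 32768 on decode and multiplying by 32767 on encode keeps every value inside the int16 range without overflow.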


@@ -8,15 +8,14 @@ import numpy as np
from flask import Flask, render_template, request from flask import Flask, render_template, request
from flask_socketio import SocketIO, emit from flask_socketio import SocketIO, emit
from transformers import AutoModelForCausalLM, AutoTokenizer from transformers import AutoModelForCausalLM, AutoTokenizer
from collections import deque import threading
import queue
import requests import requests
import huggingface_hub import huggingface_hub
from generator import load_csm_1b, Segment from generator import load_csm_1b, Segment
from collections import deque
# Force CPU mode regardless of what's available import json
# This bypasses the CUDA/cuDNN library requirements import webrtcvad # For voice activity detection
os.environ["CUDA_VISIBLE_DEVICES"] = "" # Hide all CUDA devices
torch.backends.cudnn.enabled = False # Disable cuDNN
# Configure environment with longer timeouts # Configure environment with longer timeouts
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600" # 10 minutes timeout for downloads os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600" # 10 minutes timeout for downloads
@@ -27,28 +26,92 @@ os.makedirs("models", exist_ok=True)
app = Flask(__name__) app = Flask(__name__)
app.config['SECRET_KEY'] = 'your-secret-key' app.config['SECRET_KEY'] = 'your-secret-key'
socketio = SocketIO(app, cors_allowed_origins="*") socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')
# Force CPU regardless of what hardware is available # Explicitly check for CUDA and print detailed info
device = "cuda" if torch.cuda.is_available() else "cpu" print("\n=== CUDA Information ===")
whisper_compute_type = "int8" if torch.cuda.is_available():
print(f"Forcing CPU mode for all models") print(f"CUDA is available")
print(f"CUDA version: {torch.version.cuda}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
print("CUDA is not available")
# Check for cuDNN
try:
import ctypes
ctypes.CDLL("libcudnn_ops_infer.so.8")
print("cuDNN is available")
except:
print("cuDNN is not available (libcudnn_ops_infer.so.8 not found)")
# Determine compute device
try:
if torch.cuda.is_available():
device = "cuda"
whisper_compute_type = "float16"
print("🟢 CUDA is available and initialized successfully")
elif torch.backends.mps.is_available():
device = "mps"
whisper_compute_type = "float32"
print("🟢 MPS is available (Apple Silicon)")
else:
device = "cpu"
whisper_compute_type = "int8"
print("🟡 Using CPU (CUDA/MPS not available)")
except Exception as e:
print(f"🔴 Error initializing CUDA: {e}")
print("🔴 Falling back to CPU")
device = "cpu"
whisper_compute_type = "int8"
print(f"Using device: {device}")
# Initialize models with proper error handling # Initialize models with proper error handling
whisper_model = None whisper_model = None
csm_generator = None csm_generator = None
llm_model = None llm_model = None
llm_tokenizer = None llm_tokenizer = None
vad = None
# Constants
SAMPLE_RATE = 16000 # For VAD
VAD_FRAME_SIZE = 480 # 30ms at 16kHz for VAD
VAD_MODE = 3 # Aggressive mode for better results
AUDIO_CHUNK_SIZE = 2400 # 100ms chunks when streaming AI voice
# Audio sample rates
CLIENT_SAMPLE_RATE = 44100 # Browser WebAudio default
WHISPER_SAMPLE_RATE = 16000 # Whisper expects 16kHz
# Session data structures
user_sessions = {} # session_id -> complete session data
# WebRTC ICE servers (STUN/TURN servers for NAT traversal)
ICE_SERVERS = [
{"urls": "stun:stun.l.google.com:19302"},
{"urls": "stun:stun1.l.google.com:19302"}
]
def load_models(): def load_models():
global whisper_model, csm_generator, llm_model, llm_tokenizer """Load all necessary models"""
global whisper_model, csm_generator, llm_model, llm_tokenizer, vad
# Initialize Voice Activity Detector
try:
vad = webrtcvad.Vad(VAD_MODE)
print("Voice Activity Detector initialized")
except Exception as e:
print(f"Error initializing VAD: {e}")
vad = None
# Initialize Faster-Whisper for transcription # Initialize Faster-Whisper for transcription
try: try:
print("Loading Whisper model on CPU...") print("Loading Whisper model...")
# Import here to avoid immediate import errors if package is missing
from faster_whisper import WhisperModel from faster_whisper import WhisperModel
whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8", download_root="./models/whisper") whisper_model = WhisperModel("base", device=device, compute_type=whisper_compute_type, download_root="./models/whisper")
print("Whisper model loaded successfully") print("Whisper model loaded successfully")
except Exception as e: except Exception as e:
print(f"Error loading Whisper model: {e}") print(f"Error loading Whisper model: {e}")
@@ -56,8 +119,8 @@ def load_models():
# Initialize CSM model for audio generation # Initialize CSM model for audio generation
try: try:
print("Loading CSM model on CPU...") print("Loading CSM model...")
csm_generator = load_csm_1b(device="cpu") csm_generator = load_csm_1b(device=device)
print("CSM model loaded successfully") print("CSM model loaded successfully")
except Exception as e: except Exception as e:
print(f"Error loading CSM model: {e}") print(f"Error loading CSM model: {e}")
@@ -65,13 +128,14 @@ def load_models():
# Initialize Llama 3.2 model for response generation # Initialize Llama 3.2 model for response generation
try: try:
print("Loading Llama 3.2 model on CPU...") print("Loading Llama 3.2 model...")
llm_model_id = "meta-llama/Llama-3.2-1B" # Choose appropriate size based on resources llm_model_id = "meta-llama/Llama-3.2-1B"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id, cache_dir="./models/llama") llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id, cache_dir="./models/llama")
dtype = torch.bfloat16 if device != "cpu" else torch.float32
llm_model = AutoModelForCausalLM.from_pretrained( llm_model = AutoModelForCausalLM.from_pretrained(
llm_model_id, llm_model_id,
torch_dtype=torch.float32, # Use float32 on CPU torch_dtype=dtype,
device_map="cpu", device_map=device,
cache_dir="./models/llama", cache_dir="./models/llama",
low_cpu_mem_usage=True low_cpu_mem_usage=True
) )
@@ -80,168 +144,344 @@ def load_models():
print(f"Error loading Llama 3.2 model: {e}") print(f"Error loading Llama 3.2 model: {e}")
print("Will use a fallback response generation method") print("Will use a fallback response generation method")
# Store conversation context
conversation_context = {} # session_id -> context
@app.route('/') @app.route('/')
def index(): def index():
"""Serve the main interface"""
return render_template('index.html') return render_template('index.html')
@app.route('/voice-chat.js')
def voice_chat_js():
"""Serve the JavaScript for voice chat"""
return app.send_static_file('voice-chat.js')
@socketio.on('connect') @socketio.on('connect')
def handle_connect(): def handle_connect():
print(f"Client connected: {request.sid}") """Handle new client connection"""
conversation_context[request.sid] = { session_id = request.sid
print(f"Client connected: {session_id}")
# Initialize session data
user_sessions[session_id] = {
# Conversation context
'segments': [], 'segments': [],
'speakers': [0, 1], # 0 = user, 1 = bot 'conversation_history': [],
'audio_buffer': deque(maxlen=10), # Store recent audio chunks 'is_turn_active': False,
'is_speaking': False,
'silence_start': None # Audio buffers and state
'vad_buffer': deque(maxlen=30), # ~1s of audio at 30fps
'audio_buffer': bytearray(),
'is_user_speaking': False,
'last_vad_active': time.time(),
'silence_duration': 0,
'speech_frames': 0,
# AI state
'is_ai_speaking': False,
'should_interrupt_ai': False,
'ai_stream_queue': queue.Queue(),
# WebRTC status
'webrtc_connected': False,
'webrtc_peer_id': None,
# Processing flags
'is_processing': False,
'pending_user_audio': None
} }
emit('ready', {'message': 'Connection established'})
# Send config to client
emit('session_ready', {
'whisper_available': whisper_model is not None,
'csm_available': csm_generator is not None,
'llm_available': llm_model is not None,
'client_sample_rate': CLIENT_SAMPLE_RATE,
'server_sample_rate': getattr(csm_generator, 'sample_rate', 24000) if csm_generator else 24000,
'ice_servers': ICE_SERVERS
})
@socketio.on('disconnect') @socketio.on('disconnect')
def handle_disconnect(): def handle_disconnect():
print(f"Client disconnected: {request.sid}") """Handle client disconnection"""
if request.sid in conversation_context: session_id = request.sid
del conversation_context[request.sid] print(f"Client disconnected: {session_id}")
@socketio.on('start_speaking') # Clean up resources
def handle_start_speaking(): if session_id in user_sessions:
if request.sid in conversation_context: # Signal any running threads to stop
conversation_context[request.sid]['is_speaking'] = True user_sessions[session_id]['should_interrupt_ai'] = True
conversation_context[request.sid]['audio_buffer'].clear()
print(f"User {request.sid} started speaking")
@socketio.on('audio_chunk') # Clean up resources
def handle_audio_chunk(data): del user_sessions[session_id]
if request.sid not in conversation_context:
@socketio.on('webrtc_signal')
def handle_webrtc_signal(data):
"""Handle WebRTC signaling for P2P connection establishment"""
session_id = request.sid
if session_id not in user_sessions:
return return
context = conversation_context[request.sid] # Simply relay the signal to the client
# In a multi-user app, we would route this to the correct peer
emit('webrtc_signal', data)
@socketio.on('webrtc_connected')
def handle_webrtc_connected(data):
"""Client notifies that WebRTC connection is established"""
session_id = request.sid
if session_id not in user_sessions:
return
user_sessions[session_id]['webrtc_connected'] = True
print(f"WebRTC connected for session {session_id}")
emit('ready_for_speech', {'message': 'Ready to start conversation'})
@socketio.on('audio_stream')
def handle_audio_stream(data):
"""Process incoming audio stream packets from client"""
session_id = request.sid
if session_id not in user_sessions:
return
session = user_sessions[session_id]
try:
# Decode audio data # Decode audio data
audio_data = base64.b64decode(data['audio']) audio_bytes = base64.b64decode(data.get('audio', ''))
audio_numpy = np.frombuffer(audio_data, dtype=np.float32) if not audio_bytes or len(audio_bytes) < 2: # Need at least one sample
audio_tensor = torch.tensor(audio_numpy)
# Add to buffer
context['audio_buffer'].append(audio_tensor)
# Check for silence to detect end of speech
if context['is_speaking'] and is_silence(audio_tensor):
if context['silence_start'] is None:
context['silence_start'] = time.time()
elif time.time() - context['silence_start'] > 1.0: # 1 second of silence
# Process the complete utterance
process_user_utterance(request.sid)
else:
context['silence_start'] = None
@socketio.on('stop_speaking')
def handle_stop_speaking():
if request.sid in conversation_context:
conversation_context[request.sid]['is_speaking'] = False
process_user_utterance(request.sid)
print(f"User {request.sid} stopped speaking")
def is_silence(audio_tensor, threshold=0.02):
"""Check if an audio chunk is silence based on amplitude threshold"""
return torch.mean(torch.abs(audio_tensor)) < threshold
def process_user_utterance(session_id):
"""Process completed user utterance, generate response and send audio back"""
context = conversation_context[session_id]
if not context['audio_buffer']:
return return
# Combine audio chunks # Add to current audio buffer
full_audio = torch.cat(list(context['audio_buffer']), dim=0) session['audio_buffer'] += audio_bytes
context['audio_buffer'].clear()
context['is_speaking'] = False
context['silence_start'] = None
# Save audio to temporary WAV file for transcription # Check for speech using VAD
has_speech = detect_speech(audio_bytes, session_id)
# Handle speech state machine
if has_speech:
# Reset silence tracking when speech is detected
session['last_vad_active'] = time.time()
session['silence_duration'] = 0
session['speech_frames'] += 1
# If not already marked as speaking and we have enough speech frames
if not session['is_user_speaking'] and session['speech_frames'] >= 5:
on_speech_started(session_id)
else:
# No speech detected in this frame
if session['is_user_speaking']:
# Calculate silence duration
now = time.time()
session['silence_duration'] = now - session['last_vad_active']
# If silent for more than 0.5 seconds, end speech segment
if session['silence_duration'] > 0.8 and session['speech_frames'] > 8:
on_speech_ended(session_id)
else:
# Not speaking and no speech, just a silent frame
session['speech_frames'] = max(0, session['speech_frames'] - 1)
except Exception as e:
print(f"Error processing audio stream: {e}")
def detect_speech(audio_bytes, session_id):
"""Use VAD to check if audio contains speech"""
if session_id not in user_sessions:
return False
session = user_sessions[session_id]
# Store in VAD buffer for history
session['vad_buffer'].append(audio_bytes)
if vad is None:
# Fallback to simple energy detection
audio_data = np.frombuffer(audio_bytes, dtype=np.int16)
energy = np.mean(np.abs(audio_data)) / 32768.0
return energy > 0.015 # Simple threshold
try:
# Ensure we have the right amount of data for VAD
audio_data = np.frombuffer(audio_bytes, dtype=np.int16)
# If we have too much data, use just the right amount
if len(audio_data) >= VAD_FRAME_SIZE:
frame = audio_data[:VAD_FRAME_SIZE].tobytes()
return vad.is_speech(frame, SAMPLE_RATE)
# If too little data, accumulate in the VAD buffer and check periodically
if len(session['vad_buffer']) >= 3:
# Combine recent chunks to get enough data
combined = bytearray()
for chunk in list(session['vad_buffer'])[-3:]:
combined.extend(chunk)
# Extract the right amount of data
if len(combined) >= VAD_FRAME_SIZE:
frame = combined[:VAD_FRAME_SIZE]
return vad.is_speech(bytes(frame), SAMPLE_RATE)
return False
except Exception as e:
print(f"VAD error: {e}")
return False
def on_speech_started(session_id):
"""Handle start of user speech"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
# Reset audio buffer
session['audio_buffer'] = bytearray()
session['is_user_speaking'] = True
session['is_turn_active'] = True
# If AI is speaking, we need to interrupt it
if session['is_ai_speaking']:
session['should_interrupt_ai'] = True
emit('ai_interrupted_by_user', room=session_id)
# Notify client that we detected speech
emit('user_speech_start', room=session_id)
def on_speech_ended(session_id):
"""Handle end of user speech segment"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
# Mark as not speaking anymore
session['is_user_speaking'] = False
session['speech_frames'] = 0
# If no audio or already processing, skip
if len(session['audio_buffer']) < 4000 or session['is_processing']: # At least 250ms of audio
session['audio_buffer'] = bytearray()
return
# Mark as processing to prevent multiple processes
session['is_processing'] = True
# Create a copy of the audio buffer
audio_copy = session['audio_buffer']
session['audio_buffer'] = bytearray()
# Convert audio to the format needed for processing
try:
# Convert to float32 between -1 and 1
audio_np = np.frombuffer(audio_copy, dtype=np.int16).astype(np.float32) / 32768.0
audio_tensor = torch.from_numpy(audio_np)
# Resample to Whisper's expected sample rate if necessary
if CLIENT_SAMPLE_RATE != WHISPER_SAMPLE_RATE:
audio_tensor = torchaudio.functional.resample(
audio_tensor,
orig_freq=CLIENT_SAMPLE_RATE,
new_freq=WHISPER_SAMPLE_RATE
)
# Save as WAV for transcription
temp_audio_path = f"temp_audio_{session_id}.wav" temp_audio_path = f"temp_audio_{session_id}.wav"
torchaudio.save( torchaudio.save(
temp_audio_path, temp_audio_path,
full_audio.unsqueeze(0), audio_tensor.unsqueeze(0),
44100 # Assuming 44.1kHz from client WHISPER_SAMPLE_RATE
) )
try: # Start transcription and response process in a thread
# Try using Whisper first if available threading.Thread(
if whisper_model is not None: target=process_user_utterance,
user_text = transcribe_with_whisper(temp_audio_path) args=(session_id, temp_audio_path, audio_tensor),
else: daemon=True
# Fallback to Google's speech recognition ).start()
user_text = transcribe_with_google(temp_audio_path)
if not user_text: # Notify client that processing has started
print("No speech detected.") emit('processing_speech', room=session_id)
emit('error', {'message': 'No speech detected. Please try again.'}, room=session_id)
except Exception as e:
print(f"Error preparing audio: {e}")
session['is_processing'] = False
emit('error', {'message': f'Error processing audio: {str(e)}'}, room=session_id)
def process_user_utterance(session_id, audio_path, audio_tensor):
"""Process user utterance, transcribe and generate response"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
try:
# Transcribe audio
if whisper_model is not None:
user_text = transcribe_with_whisper(audio_path)
else:
# Fallback to another transcription service
user_text = transcribe_fallback(audio_path)
# Clean up temp file
if os.path.exists(audio_path):
os.remove(audio_path)
# Check if we got meaningful text
if not user_text or len(user_text.strip()) < 2:
emit('no_speech_detected', room=session_id)
session['is_processing'] = False
return return
print(f"Transcribed: {user_text}") print(f"Transcribed: {user_text}")
# Add to conversation segments # Create user segment
user_segment = Segment( user_segment = Segment(
text=user_text, text=user_text,
speaker=0, # User is speaker 0 speaker=0, # User is speaker 0
audio=full_audio audio=audio_tensor
) )
context['segments'].append(user_segment) session['segments'].append(user_segment)
# Generate bot response # Update conversation history
bot_response = generate_llm_response(user_text, context['segments']) session['conversation_history'].append({
print(f"Bot response: {bot_response}") 'role': 'user',
'text': user_text
})
# Send transcribed text to client # Send transcription to client
emit('transcription', {'text': user_text}, room=session_id) emit('transcription', {'text': user_text}, room=session_id)
# Generate and send audio response if CSM is available # Generate AI response
ai_response = generate_ai_response(user_text, session_id)
# Send text response to client
emit('ai_response_text', {'text': ai_response}, room=session_id)
# Update conversation history
session['conversation_history'].append({
'role': 'assistant',
'text': ai_response
})
# Generate voice response if CSM is available
if csm_generator is not None: if csm_generator is not None:
# Convert to audio using CSM session['is_ai_speaking'] = True
bot_audio = generate_audio_response(bot_response, context['segments']) session['should_interrupt_ai'] = False
# Convert audio to base64 for sending over websocket # Begin streaming audio response
audio_bytes = io.BytesIO() threading.Thread(
torchaudio.save(audio_bytes, bot_audio.unsqueeze(0).cpu(), csm_generator.sample_rate, format="wav") target=stream_ai_response,
audio_bytes.seek(0) args=(ai_response, session_id),
audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8') daemon=True
).start()
# Add bot response to conversation history
bot_segment = Segment(
text=bot_response,
speaker=1, # Bot is speaker 1
audio=bot_audio
)
context['segments'].append(bot_segment)
# Send audio response to client
emit('audio_response', {
'audio': audio_b64,
'text': bot_response
}, room=session_id)
else:
# Send text-only response if audio generation isn't available
emit('text_response', {'text': bot_response}, room=session_id)
# Add text-only bot response to conversation history
bot_segment = Segment(
text=bot_response,
speaker=1, # Bot is speaker 1
audio=torch.zeros(1) # Placeholder empty audio
)
context['segments'].append(bot_segment)
except Exception as e: except Exception as e:
print(f"Error processing speech: {e}") print(f"Error processing utterance: {e}")
emit('error', {'message': f'Error processing speech: {str(e)}'}, room=session_id) emit('error', {'message': f'Error: {str(e)}'}, room=session_id)
finally: finally:
# Cleanup temp file # Clear processing flag
if os.path.exists(temp_audio_path): if session_id in user_sessions:
os.remove(temp_audio_path) session['is_processing'] = False
def transcribe_with_whisper(audio_path): def transcribe_with_whisper(audio_path):
"""Transcribe audio using Faster-Whisper""" """Transcribe audio using Faster-Whisper"""
@@ -250,14 +490,13 @@ def transcribe_with_whisper(audio_path):
# Collect all text from segments # Collect all text from segments
user_text = "" user_text = ""
for segment in segments: for segment in segments:
segment_text = segment.text.strip() user_text += segment.text.strip() + " "
print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment_text}")
user_text += segment_text + " "
return user_text.strip() return user_text.strip()
def transcribe_with_google(audio_path): def transcribe_fallback(audio_path):
"""Fallback transcription using Google's speech recognition""" """Fallback transcription using Google's speech recognition"""
try:
import speech_recognition as sr import speech_recognition as sr
recognizer = sr.Recognizer() recognizer = sr.Recognizer()
@@ -269,28 +508,40 @@ def transcribe_with_google(audio_path):
except sr.UnknownValueError: except sr.UnknownValueError:
return "" return ""
except sr.RequestError: except sr.RequestError:
# If Google API fails, try a basic energy-based VAD approach return "[Speech recognition service unavailable]"
# This is a very basic fallback and won't give good results except ImportError:
return "[Speech detected but transcription failed]" return "[Speech recognition not available]"
def generate_ai_response(user_text, session_id):
"""Generate text response using available LLM"""
if session_id not in user_sessions:
return "I'm sorry, your session has expired."
session = user_sessions[session_id]
def generate_llm_response(user_text, conversation_segments):
"""Generate text response using available model"""
if llm_model is not None and llm_tokenizer is not None: if llm_model is not None and llm_tokenizer is not None:
# Format conversation history for the LLM # Format conversation history for the LLM
conversation_history = "" prompt = "You are a helpful, friendly voice assistant. Keep your responses brief and conversational.\n\n"
for segment in conversation_segments[-5:]: # Use last 5 utterances for context
speaker_name = "User" if segment.speaker == 0 else "Assistant"
conversation_history += f"{speaker_name}: {segment.text}\n"
# Add the current user query # Add recent conversation history (last 6 turns maximum)
conversation_history += f"User: {user_text}\nAssistant:" for entry in session['conversation_history'][-6:]:
if entry['role'] == 'user':
prompt += f"User: {entry['text']}\n"
else:
prompt += f"Assistant: {entry['text']}\n"
# Add current query if not already in history
if not session['conversation_history'] or session['conversation_history'][-1]['role'] != 'user':
prompt += f"User: {user_text}\n"
prompt += "Assistant: "
try: try:
# Generate response # Generate response
inputs = llm_tokenizer(conversation_history, return_tensors="pt").to(device) inputs = llm_tokenizer(prompt, return_tensors="pt").to(device)
output = llm_model.generate( output = llm_model.generate(
inputs.input_ids, inputs.input_ids,
max_new_tokens=150, max_new_tokens=100, # Keep responses shorter for voice
temperature=0.7, temperature=0.7,
top_p=0.9, top_p=0.9,
do_sample=True do_sample=True
@@ -298,40 +549,48 @@ def generate_llm_response(user_text, conversation_segments):
response = llm_tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) response = llm_tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
return response.strip() return response.strip()
except Exception as e: except Exception as e:
print(f"Error generating response with LLM: {e}") print(f"Error generating LLM response: {e}")
return fallback_response(user_text) return fallback_response(user_text)
else: else:
return fallback_response(user_text) return fallback_response(user_text)
def fallback_response(user_text): def fallback_response(user_text):
"""Generate a simple fallback response when LLM is not available""" """Generate simple fallback responses when LLM is unavailable"""
# Simple rule-based responses
user_text_lower = user_text.lower() user_text_lower = user_text.lower()
if "hello" in user_text_lower or "hi" in user_text_lower: if "hello" in user_text_lower or "hi" in user_text_lower:
return "Hello! I'm a simple fallback assistant. The main language model couldn't be loaded, so I have limited capabilities." return "Hello! How can I help you today?"
elif "how are you" in user_text_lower: elif "how are you" in user_text_lower:
return "I'm functioning within my limited capabilities. How can I assist you today?" return "I'm doing well, thanks for asking! How about you?"
elif "thank" in user_text_lower: elif "thank" in user_text_lower:
return "You're welcome! Let me know if there's anything else I can help with." return "You're welcome! Happy to help."
elif "bye" in user_text_lower or "goodbye" in user_text_lower: elif "bye" in user_text_lower or "goodbye" in user_text_lower:
return "Goodbye! Have a great day!" return "Goodbye! Have a great day!"
elif any(q in user_text_lower for q in ["what", "who", "where", "when", "why", "how"]): elif any(q in user_text_lower for q in ["what", "who", "where", "when", "why", "how"]):
return "I'm running in fallback mode and can't answer complex questions. Please try again when the main language model is available." return "That's an interesting question. I wish I could provide a better answer in my current fallback mode."
else: else:
return "I understand you said something about that. Unfortunately, I'm running in fallback mode with limited capabilities. Please try again later when the main model is available." return "I see. Tell me more about that."
def stream_ai_response(text, session_id):
"""Generate and stream audio response in real-time chunks"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
def generate_audio_response(text, conversation_segments):
"""Generate audio response using CSM"""
try: try:
# Use the last few conversation segments as context # Signal start of AI speech
context_segments = conversation_segments[-4:] if len(conversation_segments) > 4 else conversation_segments emit('ai_speech_start', room=session_id)
# Use the last few conversation segments as context (up to 4)
context_segments = session['segments'][-4:] if len(session['segments']) > 4 else session['segments']
# Generate audio for bot response # Generate audio for bot response
audio = csm_generator.generate( audio = csm_generator.generate(
@@ -343,11 +602,77 @@ def generate_audio_response(text, conversation_segments):
topk=50 topk=50
) )
return audio # Create and store bot segment
bot_segment = Segment(
text=text,
speaker=1,
audio=audio
)
if session_id in user_sessions:
session['segments'].append(bot_segment)
# Stream audio in small chunks for more responsive playback
chunk_size = AUDIO_CHUNK_SIZE # Size defined in constants
for i in range(0, len(audio), chunk_size):
# Check if we should stop (user interrupted)
if session_id not in user_sessions or session['should_interrupt_ai']:
print("AI speech interrupted")
break
# Get next chunk
chunk = audio[i:i+chunk_size]
# Convert audio chunk to base64 for streaming
audio_bytes = io.BytesIO()
torchaudio.save(audio_bytes, chunk.unsqueeze(0).cpu(), csm_generator.sample_rate, format="wav")
audio_bytes.seek(0)
audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8')
# Send chunk to client
socketio.emit('ai_speech_chunk', {
'audio': audio_b64,
'is_last': i + chunk_size >= len(audio)
}, room=session_id)
# Small sleep for more natural pacing
time.sleep(0.06) # Slight delay for smoother playback
# Signal end of AI speech
if session_id in user_sessions:
session['is_ai_speaking'] = False
session['is_turn_active'] = False # End conversation turn
socketio.emit('ai_speech_end', room=session_id)
except Exception as e: except Exception as e:
print(f"Error generating audio: {e}") print(f"Error streaming AI response: {e}")
# Return silence as fallback if session_id in user_sessions:
return torch.zeros(csm_generator.sample_rate * 3) # 3 seconds of silence session['is_ai_speaking'] = False
session['is_turn_active'] = False
socketio.emit('error', {'message': f'Error generating audio: {str(e)}'}, room=session_id)
socketio.emit('ai_speech_end', room=session_id)
@socketio.on('interrupt_ai')
def handle_interrupt():
"""Handle explicit AI interruption request from client"""
session_id = request.sid
if session_id in user_sessions:
user_sessions[session_id]['should_interrupt_ai'] = True
emit('ai_interrupted', room=session_id)
@socketio.on('get_config')
def handle_get_config():
"""Send configuration to client"""
session_id = request.sid
if session_id in user_sessions:
emit('config', {
'client_sample_rate': CLIENT_SAMPLE_RATE,
'server_sample_rate': getattr(csm_generator, 'sample_rate', 24000) if csm_generator else 24000,
'whisper_available': whisper_model is not None,
'csm_available': csm_generator is not None,
'ice_servers': ICE_SERVERS
})
if __name__ == '__main__': if __name__ == '__main__':
# Ensure the existing index.html file is in the correct location # Ensure the existing index.html file is in the correct location
@@ -357,9 +682,8 @@ if __name__ == '__main__':
if os.path.exists('index.html') and not os.path.exists('templates/index.html'): if os.path.exists('index.html') and not os.path.exists('templates/index.html'):
os.rename('index.html', 'templates/index.html') os.rename('index.html', 'templates/index.html')
# Load models asynchronously before starting the server # Load models before starting the server
print("Starting CPU-only model loading...") print("Starting model loading...")
# In a production environment, you could load models in a separate thread
load_models() load_models()
# Start the server # Start the server
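The audio_stream handler above drives a small state machine on top of webrtcvad, which only accepts 10, 20, or 30 ms frames of 16-bit mono PCM at 8, 16, 32, or 48 kHz; VAD_FRAME_SIZE = 480 samples is exactly 30 ms at 16 kHz. A sketch of that framing step follows, with an illustrative helper name (note the browser stream in this commit is captured at 44.1 kHz, so frames would need resampling to 16 kHz for the durations to line up).

# Sketch: frame 16 kHz, 16-bit mono PCM into the 30 ms frames webrtcvad expects.
# Constants mirror the commit's VAD settings; this is not the committed code.
import webrtcvad

SAMPLE_RATE = 16000
FRAME_SAMPLES = 480              # 30 ms at 16 kHz (webrtcvad allows 10/20/30 ms)
FRAME_BYTES = FRAME_SAMPLES * 2  # int16 -> 2 bytes per sample


def speech_frames(pcm: bytes, aggressiveness: int = 3):
    """Yield (frame, is_speech) for each complete 30 ms frame in pcm."""
    vad = webrtcvad.Vad(aggressiveness)
    for offset in range(0, len(pcm) - FRAME_BYTES + 1, FRAME_BYTES):
        frame = pcm[offset:offset + FRAME_BYTES]
        yield frame, vad.is_speech(frame, SAMPLE_RATE)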

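Once a silence gap ends the turn, the buffered int16 audio is normalized, resampled from the browser rate to Whisper's 16 kHz, and transcribed. A sketch of that path using faster-whisper directly on an in-memory array, under the commit's sample-rate assumptions; the model size and device here are placeholders, and the array input merely avoids the temp-WAV round trip the commit uses.

# Sketch: normalize, resample, and transcribe a buffered utterance with faster-whisper.
import numpy as np
import torch
import torchaudio
from faster_whisper import WhisperModel

CLIENT_SAMPLE_RATE = 44100   # browser capture rate in this commit
WHISPER_SAMPLE_RATE = 16000  # rate Whisper expects

whisper_model = WhisperModel("base", device="cpu", compute_type="int8")


def transcribe_utterance(pcm_bytes: bytes) -> str:
    samples = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32) / 32768.0
    waveform = torchaudio.functional.resample(
        torch.from_numpy(samples),
        orig_freq=CLIENT_SAMPLE_RATE,
        new_freq=WHISPER_SAMPLE_RATE,
    )
    segments, _info = whisper_model.transcribe(waveform.numpy())
    return " ".join(segment.text.strip() for segment in segments)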
Backend/voice-chat.js (new file, 560 lines)

@@ -0,0 +1,560 @@
document.addEventListener('DOMContentLoaded', () => {
// DOM Elements
const startButton = document.getElementById('start-button');
const interruptButton = document.getElementById('interrupt-button');
const conversationDiv = document.getElementById('conversation');
const connectionDot = document.getElementById('connection-dot');
const connectionStatus = document.getElementById('connection-status');
const whisperStatus = document.getElementById('whisper-status');
const csmStatus = document.getElementById('csm-status');
const llmStatus = document.getElementById('llm-status');
const webrtcStatus = document.getElementById('webrtc-status');
const micAnimation = document.getElementById('mic-animation');
const loadingDiv = document.getElementById('loading');
const loadingText = document.getElementById('loading-text');
// State variables
let socket;
let isConnected = false;
let isListening = false;
let isAiSpeaking = false;
let audioContext;
let mediaStream;
let audioRecorder;
let audioProcessor;
const audioChunks = [];
// WebRTC variables
let peerConnection;
let dataChannel;
let hasActiveConnection = false;
// Audio playback
let audioQueue = [];
let isPlaying = false;
// Configuration variables
let serverSampleRate = 24000;
let clientSampleRate = 44100;
let iceServers = [];
// Initialize the application
initApp();
// Main initialization function
function initApp() {
updateConnectionStatus('connecting');
setupSocketConnection();
setupEventListeners();
}
// Set up Socket.IO connection with server
function setupSocketConnection() {
socket = io();
socket.on('connect', () => {
console.log('Connected to server');
updateConnectionStatus('connected');
isConnected = true;
});
socket.on('disconnect', () => {
console.log('Disconnected from server');
updateConnectionStatus('disconnected');
isConnected = false;
cleanupAudio();
cleanupWebRTC();
});
socket.on('session_ready', (data) => {
console.log('Session ready:', data);
updateModelStatus(data);
clientSampleRate = data.client_sample_rate;
serverSampleRate = data.server_sample_rate;
iceServers = data.ice_servers;
// Initialize WebRTC if models are available
if (data.whisper_available && data.llm_available) {
initializeWebRTC();
}
});
socket.on('ready_for_speech', (data) => {
console.log('Ready for speech:', data);
startButton.disabled = false;
addInfoMessage('Ready for conversation. Click "Start Listening" to begin.');
});
socket.on('webrtc_signal', (data) => {
handleWebRTCSignal(data);
});
socket.on('transcription', (data) => {
console.log('Transcription:', data);
addUserMessage(data.text);
loadingDiv.style.display = 'none';
});
socket.on('ai_response_text', (data) => {
console.log('AI response text:', data);
addAIMessage(data.text);
loadingDiv.style.display = 'none';
});
socket.on('ai_speech_start', () => {
console.log('AI started speaking');
isAiSpeaking = true;
interruptButton.disabled = false;
});
socket.on('ai_speech_chunk', (data) => {
console.log('Received AI speech chunk');
playAudioChunk(data.audio, data.is_last);
});
socket.on('ai_speech_end', () => {
console.log('AI stopped speaking');
isAiSpeaking = false;
interruptButton.disabled = true;
});
socket.on('user_speech_start', () => {
console.log('User speech detected');
showSpeakingIndicator(true);
});
socket.on('processing_speech', () => {
console.log('Processing speech');
showSpeakingIndicator(false);
showLoadingIndicator('Processing your speech...');
});
socket.on('no_speech_detected', () => {
console.log('No speech detected');
hideLoadingIndicator();
addInfoMessage('No speech detected. Please try again.');
});
socket.on('ai_interrupted', () => {
console.log('AI interrupted');
clearAudioQueue();
isAiSpeaking = false;
interruptButton.disabled = true;
});
socket.on('ai_interrupted_by_user', () => {
console.log('AI interrupted by user');
clearAudioQueue();
isAiSpeaking = false;
interruptButton.disabled = true;
addInfoMessage('AI interrupted by your speech');
});
socket.on('error', (data) => {
console.error('Server error:', data);
hideLoadingIndicator();
addInfoMessage(`Error: ${data.message}`);
});
}
// Set up UI event listeners
function setupEventListeners() {
startButton.addEventListener('click', toggleListening);
interruptButton.addEventListener('click', interruptAI);
}
// Update UI connection status
function updateConnectionStatus(status) {
connectionDot.className = 'status-dot ' + status;
switch (status) {
case 'connected':
connectionStatus.textContent = 'Connected';
break;
case 'connecting':
connectionStatus.textContent = 'Connecting...';
break;
case 'disconnected':
connectionStatus.textContent = 'Disconnected';
startButton.disabled = true;
interruptButton.disabled = true;
break;
}
}
// Update model status indicators
function updateModelStatus(data) {
whisperStatus.textContent = data.whisper_available ? 'Available' : 'Not Available';
whisperStatus.style.color = data.whisper_available ? 'green' : 'red';
csmStatus.textContent = data.csm_available ? 'Available' : 'Not Available';
csmStatus.style.color = data.csm_available ? 'green' : 'red';
llmStatus.textContent = data.llm_available ? 'Available' : 'Not Available';
llmStatus.style.color = data.llm_available ? 'green' : 'red';
}
// Initialize WebRTC connection
function initializeWebRTC() {
if (!isConnected) return;
const configuration = {
iceServers: iceServers
};
peerConnection = new RTCPeerConnection(configuration);
// Create data channel for WebRTC communication
dataChannel = peerConnection.createDataChannel('audioData', {
ordered: true
});
dataChannel.onopen = () => {
console.log('WebRTC data channel open');
hasActiveConnection = true;
webrtcStatus.textContent = 'Connected';
webrtcStatus.style.color = 'green';
socket.emit('webrtc_connected', { status: 'connected' });
};
dataChannel.onclose = () => {
console.log('WebRTC data channel closed');
hasActiveConnection = false;
webrtcStatus.textContent = 'Disconnected';
webrtcStatus.style.color = 'red';
};
// Handle ICE candidates
peerConnection.onicecandidate = (event) => {
if (event.candidate) {
socket.emit('webrtc_signal', {
type: 'ice_candidate',
candidate: event.candidate
});
}
};
// Log ICE connection state changes
peerConnection.oniceconnectionstatechange = () => {
console.log('ICE connection state:', peerConnection.iceConnectionState);
};
// Create offer
peerConnection.createOffer()
.then(offer => peerConnection.setLocalDescription(offer))
.then(() => {
socket.emit('webrtc_signal', {
type: 'offer',
sdp: peerConnection.localDescription
});
})
.catch(error => {
console.error('Error creating WebRTC offer:', error);
webrtcStatus.textContent = 'Failed to Connect';
webrtcStatus.style.color = 'red';
});
}
// Handle WebRTC signals from the server
function handleWebRTCSignal(data) {
if (!peerConnection) return;
if (data.type === 'answer') {
peerConnection.setRemoteDescription(new RTCSessionDescription(data.sdp))
.catch(error => console.error('Error setting remote description:', error));
}
else if (data.type === 'ice_candidate') {
peerConnection.addIceCandidate(new RTCIceCandidate(data.candidate))
.catch(error => console.error('Error adding ICE candidate:', error));
}
}
// Clean up WebRTC connection
function cleanupWebRTC() {
if (dataChannel) {
dataChannel.close();
}
if (peerConnection) {
peerConnection.close();
}
dataChannel = null;
peerConnection = null;
hasActiveConnection = false;
webrtcStatus.textContent = 'Not Connected';
webrtcStatus.style.color = 'red';
}
// Toggle audio listening
function toggleListening() {
if (isListening) {
stopListening();
} else {
startListening();
}
}
// Start listening for audio
async function startListening() {
if (!isConnected) return;
try {
await initAudio();
isListening = true;
startButton.textContent = 'Stop Listening';
startButton.innerHTML = `
<svg class="button-icon" viewBox="0 0 24 24" fill="white">
<path d="M6 6h12v12H6z"></path>
</svg>
Stop Listening
`;
} catch (error) {
console.error('Error starting audio:', error);
addInfoMessage('Error accessing microphone. Please check permissions.');
}
}
// Stop listening for audio
function stopListening() {
cleanupAudio();
isListening = false;
startButton.innerHTML = `
<svg class="button-icon" viewBox="0 0 24 24" fill="white">
<path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm5.91-3c-.49 0-.9.36-.98.85C16.52 14.2 14.47 16 12 16s-4.52-1.8-4.93-4.15c-.08-.49-.49-.85-.98-.85-.61 0-1.09.54-1 1.14.49 3 2.89 5.35 5.91 5.78V20c0 .55.45 1 1 1s1-.45 1-1v-2.08c3.02-.43 5.42-2.78 5.91-5.78.1-.6-.39-1.14-1-1.14z"></path>
</svg>
Start Listening
`;
showSpeakingIndicator(false);
}
// Initialize audio capture
async function initAudio() {
// Request microphone access
mediaStream = await navigator.mediaDevices.getUserMedia({
audio: {
sampleRate: clientSampleRate,
channelCount: 1,
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true
}
});
// Initialize AudioContext
audioContext = new (window.AudioContext || window.webkitAudioContext)({
sampleRate: clientSampleRate
});
// Create audio source from stream
const source = audioContext.createMediaStreamSource(mediaStream);
// Create ScriptProcessor for audio processing
const bufferSize = 4096;
audioProcessor = audioContext.createScriptProcessor(bufferSize, 1, 1);
// Process audio data
audioProcessor.onaudioprocess = (event) => {
if (!isListening || isAiSpeaking) return;
const input = event.inputBuffer.getChannelData(0);
const audioData = convertFloat32ToInt16(input);
sendAudioChunk(audioData);
};
// Connect the nodes
source.connect(audioProcessor);
audioProcessor.connect(audioContext.destination);
}
// Clean up audio resources
function cleanupAudio() {
if (audioProcessor) {
audioProcessor.disconnect();
audioProcessor = null;
}
if (mediaStream) {
mediaStream.getTracks().forEach(track => track.stop());
mediaStream = null;
}
if (audioContext && audioContext.state !== 'closed') {
audioContext.close().catch(error => console.error('Error closing AudioContext:', error));
}
audioChunks.length = 0;
}
// Convert Float32Array to Int16Array for sending to server
function convertFloat32ToInt16(float32Array) {
const int16Array = new Int16Array(float32Array.length);
for (let i = 0; i < float32Array.length; i++) {
// Convert float [-1.0, 1.0] to int16 [-32768, 32767]
int16Array[i] = Math.max(-32768, Math.min(32767, Math.floor(float32Array[i] * 32768)));
}
return int16Array;
}
// Send audio chunk to server
function sendAudioChunk(audioData) {
if (!isConnected || !isListening) return;
// Convert to base64 for transmission
const base64Audio = arrayBufferToBase64(audioData.buffer);
// Send via Socket.IO (could use WebRTC's DataChannel for lower latency in production)
socket.emit('audio_stream', { audio: base64Audio });
}
// Play audio chunk received from server
function playAudioChunk(base64Audio, isLast) {
const audioData = base64ToArrayBuffer(base64Audio);
// Add to queue
audioQueue.push({
data: audioData,
isLast: isLast
});
// Start playing if not already playing
if (!isPlaying) {
playNextAudioChunk();
}
}
// Play the next audio chunk in the queue
function playNextAudioChunk() {
if (audioQueue.length === 0) {
isPlaying = false;
return;
}
isPlaying = true;
const chunk = audioQueue.shift();
try {
// Create audio context if needed
if (!audioContext || audioContext.state === 'closed') {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
}
// Resume audio context if suspended
if (audioContext.state === 'suspended') {
audioContext.resume();
}
// Decode the WAV data
audioContext.decodeAudioData(chunk.data, (buffer) => {
const source = audioContext.createBufferSource();
source.buffer = buffer;
source.connect(audioContext.destination);
// When playback ends, play the next chunk
source.onended = () => {
playNextAudioChunk();
};
source.start(0);
// If it's the last chunk, update UI
if (chunk.isLast) {
setTimeout(() => {
isAiSpeaking = false;
interruptButton.disabled = true;
}, buffer.duration * 1000);
}
}, (error) => {
console.error('Error decoding audio data:', error);
playNextAudioChunk(); // Skip this chunk and try the next
});
} catch (error) {
console.error('Error playing audio chunk:', error);
playNextAudioChunk(); // Try the next chunk
}
}
// Clear the audio queue (used when interrupting)
function clearAudioQueue() {
audioQueue.length = 0;
isPlaying = false;
// Stop any currently playing audio
if (audioContext) {
audioContext.suspend();
}
}
// Send interrupt signal to server
function interruptAI() {
if (!isConnected || !isAiSpeaking) return;
socket.emit('interrupt_ai');
clearAudioQueue();
}
// Convert ArrayBuffer to Base64 string
function arrayBufferToBase64(buffer) {
const binary = new Uint8Array(buffer);
let base64 = '';
const len = binary.byteLength;
for (let i = 0; i < len; i++) {
base64 += String.fromCharCode(binary[i]);
}
return window.btoa(base64);
}
// Convert Base64 string to ArrayBuffer
function base64ToArrayBuffer(base64) {
const binaryString = window.atob(base64);
const len = binaryString.length;
const bytes = new Uint8Array(len);
for (let i = 0; i < len; i++) {
bytes[i] = binaryString.charCodeAt(i);
}
return bytes.buffer;
}
// Add user message to conversation
function addUserMessage(text) {
const messageDiv = document.createElement('div');
messageDiv.className = 'message user-message';
messageDiv.textContent = text;
conversationDiv.appendChild(messageDiv);
conversationDiv.scrollTop = conversationDiv.scrollHeight;
}
// Add AI message to conversation
function addAIMessage(text) {
const messageDiv = document.createElement('div');
messageDiv.className = 'message ai-message';
messageDiv.textContent = text;
conversationDiv.appendChild(messageDiv);
conversationDiv.scrollTop = conversationDiv.scrollHeight;
}
// Add info message to conversation
function addInfoMessage(text) {
const messageDiv = document.createElement('div');
messageDiv.className = 'info-message';
messageDiv.textContent = text;
conversationDiv.appendChild(messageDiv);
conversationDiv.scrollTop = conversationDiv.scrollHeight;
}
// Show/hide speaking indicator
function showSpeakingIndicator(show) {
micAnimation.style.display = show ? 'flex' : 'none';
}
// Show loading indicator
function showLoadingIndicator(text) {
loadingText.textContent = text || 'Processing...';
loadingDiv.style.display = 'block';
}
// Hide loading indicator
function hideLoadingIndicator() {
loadingDiv.style.display = 'none';
}
});
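On the response side, stream_ai_response slices the CSM waveform into roughly 100 ms chunks, wraps each one in its own WAV container, and emits it base64-encoded so the playback queue above (playAudioChunk / playNextAudioChunk) can start before the whole utterance is synthesized. A sketch of that chunking step; the function name is illustrative and the 24 kHz rate is the commit's default.

# Sketch: slice a synthesized waveform into ~100 ms chunks and package each as a
# base64-encoded WAV payload for streaming playback.
import base64
import io

import torch
import torchaudio

CHUNK_SAMPLES = 2400  # 100 ms at the 24 kHz CSM output rate assumed in the commit


def wav_chunks(audio: torch.Tensor, sample_rate: int = 24000):
    """Yield (base64_wav, is_last) pairs for a 1-D waveform tensor."""
    total = audio.numel()
    for start in range(0, total, CHUNK_SAMPLES):
        chunk = audio[start:start + CHUNK_SAMPLES]
        buf = io.BytesIO()
        torchaudio.save(buf, chunk.unsqueeze(0).cpu(), sample_rate, format="wav")
        buf.seek(0)
        yield base64.b64encode(buf.read()).decode("ascii"), start + CHUNK_SAMPLES >= total

Because every chunk is a self-contained WAV file, the browser can hand it straight to decodeAudioData; the trade-off is a small container header and a separate decode per chunk.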

Binary file not shown (added; 17 KiB).

Binary file not shown (added; 87 KiB).


@@ -13,8 +13,8 @@ const geistMono = Geist_Mono({
 });

 export const metadata: Metadata = {
-  title: "Create Next App",
-  description: "Generated by create next app",
+  title: "Fauxcall",
+  description: "Fauxcall is a fake call app that helps you get out of awkward situations.",
 };

 export default function RootLayout({

React/src/app/manifest.ts (new file, 25 lines)

@@ -0,0 +1,25 @@
import type { MetadataRoute } from 'next'
export default function manifest(): MetadataRoute.Manifest {
return {
name: 'Fauxcall',
short_name: 'Fauxcall',
description: 'A fake call app that helps you get out of awkward and dangerous situations.',
start_url: '/',
display: 'standalone',
background_color: '#ffffff',
theme_color: '#000000',
icons: [
{
src: '/icon-192x192.png',
sizes: '192x192',
type: 'image/png',
},
{
src: '/icon-512x512.png',
sizes: '512x512',
type: 'image/png',
},
],
}
}


@@ -4,7 +4,7 @@ import { useRouter } from "next/navigation";
 import './styles.css';

 export default function Home() {
-  const [contacts, setContacts] = useState<string[]>([]);
+  const [contacts, setContacts] = useState<string[]>([""]);
   const [codeword, setCodeword] = useState("");
   const [session, setSession] = useState<any>(null);
   const [loading, setLoading] = useState(true);
@@ -26,6 +26,16 @@ export default function Home() {
   });
 }, []);

+  const handleInputChange = (index: number, value: string) => {
+    const updatedContacts = [...contacts];
+    updatedContacts[index] = value; // Update the specific input value
+    setContacts(updatedContacts);
+  };
+
+  const addContactInput = () => {
+    setContacts([...contacts, ""]); // Add a new empty input
+  };
+
 function saveToDB() {
   alert("Saving contacts...");
   const contactInputs = document.querySelectorAll(
@@ -144,27 +154,20 @@ export default function Home() {
className="space-y-5 flex flex-col gap-[32px] row-start-2 items-center sm:items-start" className="space-y-5 flex flex-col gap-[32px] row-start-2 items-center sm:items-start"
onSubmit={(e) => e.preventDefault()} onSubmit={(e) => e.preventDefault()}
> >
{contacts.map((contact, index) => (
<input <input
key={index}
type="text" type="text"
value={contacts} value={contact}
onChange={(e) => setContacts(e.target.value.split(","))} onChange={(e) => handleInputChange(index, e.target.value)}
placeholder="Write down an emergency contact" placeholder={`Contact ${index + 1}`}
className="border border-gray-300 rounded-md p-2" className="border border-gray-300 rounded-md p-2"
/> />
))}
<button <button
onClick={() => { onClick={addContactInput}
alert("Adding contact..."); className="bg-emerald-500 text-white
let elem = document.getElementsByClassName( font-semibold font-lg rounded-md p-2"
"text-input"
)[0] as HTMLElement;
console.log("Element:", elem);
let d = elem.cloneNode(true) as HTMLElement;
document.getElementById("Contacts")?.appendChild(d);
}}
className="bg-emerald-500 text-fuchsia-300"
type="button" type="button"
> >
Add Add