From a55b3f52a45636fab9377fb24c4dcdbfc8adc30d Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sun, 30 Mar 2025 08:04:16 -0400 Subject: [PATCH 01/10] Demo Fixes 13 --- Backend/server.py | 77 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 18 deletions(-) diff --git a/Backend/server.py b/Backend/server.py index 93fac92..1754a1b 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -13,11 +13,6 @@ import requests import huggingface_hub from generator import load_csm_1b, Segment -# Force CPU mode regardless of what's available -# This bypasses the CUDA/cuDNN library requirements -os.environ["CUDA_VISIBLE_DEVICES"] = "" # Hide all CUDA devices -torch.backends.cudnn.enabled = False # Disable cuDNN - # Configure environment with longer timeouts os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600" # 10 minutes timeout for downloads requests.adapters.DEFAULT_TIMEOUT = 60 # Increase default requests timeout @@ -29,10 +24,55 @@ app = Flask(__name__) app.config['SECRET_KEY'] = 'your-secret-key' socketio = SocketIO(app, cors_allowed_origins="*") -# Force CPU regardless of what hardware is available -device = "cuda" if torch.cuda.is_available() else "cpu" -whisper_compute_type = "int8" -print(f"Forcing CPU mode for all models") +# Explicitly check for CUDA and print more detailed info +print("\n=== CUDA Information ===") +if torch.cuda.is_available(): + print(f"CUDA is available") + print(f"CUDA version: {torch.version.cuda}") + print(f"Number of GPUs: {torch.cuda.device_count()}") + for i in range(torch.cuda.device_count()): + print(f"GPU {i}: {torch.cuda.get_device_name(i)}") +else: + print("CUDA is not available") + +# Check for cuDNN +try: + import ctypes + ctypes.CDLL("libcudnn_ops_infer.so.8") + print("cuDNN is available") +except: + print("cuDNN is not available (libcudnn_ops_infer.so.8 not found)") + +# Check for other compute platforms +if torch.backends.mps.is_available(): + print("MPS (Apple Silicon) is available") +else: + print("MPS is not available") +print("========================\n") + +# Check for CUDA availability and handle potential CUDA/cuDNN issues +try: + if torch.cuda.is_available(): + # Try to initialize CUDA to check if libraries are properly loaded + _ = torch.zeros(1).cuda() + device = "cuda" + whisper_compute_type = "float16" + print("🟢 CUDA is available and initialized successfully") + elif torch.backends.mps.is_available(): + device = "mps" + whisper_compute_type = "float32" + print("🟢 MPS is available (Apple Silicon)") + else: + device = "cpu" + whisper_compute_type = "int8" + print("🟡 Using CPU (CUDA/MPS not available)") +except Exception as e: + print(f"🔴 Error initializing CUDA: {e}") + print("🔴 Falling back to CPU") + device = "cpu" + whisper_compute_type = "int8" + +print(f"Using device: {device}") # Initialize models with proper error handling whisper_model = None @@ -45,10 +85,10 @@ def load_models(): # Initialize Faster-Whisper for transcription try: - print("Loading Whisper model on CPU...") + print("Loading Whisper model...") # Import here to avoid immediate import errors if package is missing from faster_whisper import WhisperModel - whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8", download_root="./models/whisper") + whisper_model = WhisperModel("base", device=device, compute_type=whisper_compute_type, download_root="./models/whisper") print("Whisper model loaded successfully") except Exception as e: print(f"Error loading Whisper model: {e}") @@ -56,8 +96,8 @@ def load_models(): # Initialize CSM model for 
audio generation try: - print("Loading CSM model on CPU...") - csm_generator = load_csm_1b(device="cpu") + print("Loading CSM model...") + csm_generator = load_csm_1b(device=device) print("CSM model loaded successfully") except Exception as e: print(f"Error loading CSM model: {e}") @@ -65,13 +105,15 @@ def load_models(): # Initialize Llama 3.2 model for response generation try: - print("Loading Llama 3.2 model on CPU...") + print("Loading Llama 3.2 model...") llm_model_id = "meta-llama/Llama-3.2-1B" # Choose appropriate size based on resources llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id, cache_dir="./models/llama") + # Use the right data type based on device + dtype = torch.bfloat16 if device != "cpu" else torch.float32 llm_model = AutoModelForCausalLM.from_pretrained( llm_model_id, - torch_dtype=torch.float32, # Use float32 on CPU - device_map="cpu", + torch_dtype=dtype, + device_map=device, cache_dir="./models/llama", low_cpu_mem_usage=True ) @@ -358,8 +400,7 @@ if __name__ == '__main__': os.rename('index.html', 'templates/index.html') # Load models asynchronously before starting the server - print("Starting CPU-only model loading...") - # In a production environment, you could load models in a separate thread + print("Starting model loading...") load_models() # Start the server From 12383d5e8b7cddf032e8b3efe1e69441a98a1543 Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sun, 30 Mar 2025 08:36:50 -0400 Subject: [PATCH 02/10] Demo Fixes 14 --- Backend/index.html | 359 ++++++++++++++++++++++++++++++++++++++++++++- Backend/server.py | 149 +++++++++++++++---- 2 files changed, 475 insertions(+), 33 deletions(-) diff --git a/Backend/index.html b/Backend/index.html index 359ed41..1565977 100644 --- a/Backend/index.html +++ b/Backend/index.html @@ -175,12 +175,124 @@ margin-top: 4px; text-align: right; } + + .text-only-indicator { + font-size: 0.8em; + color: #e74c3c; + margin-top: 4px; + font-style: italic; + } + + .status-message { + text-align: center; + padding: 8px; + margin: 10px 0; + background-color: #f8f9fa; + border-radius: 5px; + color: #666; + font-size: 0.9em; + } + + /* Audio visualizer styles */ + .visualizer-container { + width: 100%; + height: 120px; + margin: 15px 0; + border-radius: 10px; + overflow: hidden; + background-color: #000; + position: relative; + } + + #visualizer { + width: 100%; + height: 100%; + display: block; + } + + .visualizer-label { + position: absolute; + top: 10px; + left: 10px; + color: white; + font-size: 0.8em; + background-color: rgba(0, 0, 0, 0.5); + padding: 4px 8px; + border-radius: 4px; + } + + /* Modern switch for visualizer toggle */ + .switch-container { + display: flex; + align-items: center; + justify-content: center; + margin-bottom: 10px; + } + + .switch { + position: relative; + display: inline-block; + width: 50px; + height: 24px; + margin-left: 10px; + } + + .switch input { + opacity: 0; + width: 0; + height: 0; + } + + .slider { + position: absolute; + cursor: pointer; + top: 0; + left: 0; + right: 0; + bottom: 0; + background-color: #ccc; + transition: .4s; + border-radius: 24px; + } + + .slider:before { + position: absolute; + content: ""; + height: 16px; + width: 16px; + left: 4px; + bottom: 4px; + background-color: white; + transition: .4s; + border-radius: 50%; + } + + input:checked + .slider { + background-color: #4CAF50; + } + + input:checked + .slider:before { + transform: translateX(26px); + }
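For reference, the accelerator probe that patch 01 adds to server.py reduces to the pattern below. This is a minimal standalone sketch of that selection logic (device string, LLM weight dtype, and a faster-whisper compute type), not the committed code; the helper name pick_device_and_dtypes is invented for illustration.

import torch

def pick_device_and_dtypes():
    """Probe CUDA, then MPS, then fall back to CPU.

    Returns (device, llm_dtype, whisper_compute_type), mirroring the values
    the patch assigns: float16/int8 compute types for faster-whisper and
    bfloat16 weights for the LLM when an accelerator is present.
    """
    try:
        if torch.cuda.is_available():
            # Touch the GPU once so a broken CUDA/cuDNN install fails here,
            # not later inside model loading.
            _ = torch.zeros(1).cuda()
            return "cuda", torch.bfloat16, "float16"
        if torch.backends.mps.is_available():
            return "mps", torch.bfloat16, "float32"
    except Exception as exc:  # driver mismatch, missing libraries, etc.
        print(f"Accelerator probe failed, falling back to CPU: {exc}")
    return "cpu", torch.float32, "int8"

if __name__ == "__main__":
    device, llm_dtype, whisper_compute_type = pick_device_and_dtypes()
    print(device, llm_dtype, whisper_compute_type)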

Voice Assistant with CSM & Whisper

+
+ Audio Visualizer + +
+ +
+ +
Listening...
+
+
@@ -201,27 +313,80 @@ const conversation = document.getElementById('conversation'); const status = document.getElementById('status'); const audioWave = document.getElementById('audioWave'); + const visualizerToggle = document.getElementById('visualizerToggle'); + const visualizerContainer = document.getElementById('visualizerContainer'); + const visualizerLabel = document.getElementById('visualizerLabel'); + const canvas = document.getElementById('visualizer'); + const canvasCtx = canvas.getContext('2d'); let mediaRecorder; let audioChunks = []; let isRecording = false; let audioSendInterval; let sessionActive = false; + let reconnectAttempts = 0; + let audioStream = null; + let audioAnalyser = null; + let visualizerActive = true; + let visualizerAnimationId = null; + let audioBufferSource = null; // Initialize audio context const audioContext = new (window.AudioContext || window.webkitAudioContext)(); + // Set up canvas size + function setupCanvas() { + canvas.width = visualizerContainer.offsetWidth; + canvas.height = visualizerContainer.offsetHeight; + } + + // Handle visualizer toggle + visualizerToggle.addEventListener('change', function() { + visualizerActive = this.checked; + visualizerContainer.style.display = visualizerActive ? 'block' : 'none'; + + if (!visualizerActive && visualizerAnimationId) { + cancelAnimationFrame(visualizerAnimationId); + visualizerAnimationId = null; + } else if (visualizerActive && audioAnalyser) { + drawVisualizer(); + } + }); + // Connect to server socket.on('connect', () => { status.textContent = 'Connected to server'; sessionActive = true; + reconnectAttempts = 0; + + if (conversation.children.length > 0) { + addStatusMessage("Reconnected to server"); + } }); socket.on('disconnect', () => { status.textContent = 'Disconnected from server'; sessionActive = false; + + addStatusMessage("Disconnected from server. Attempting to reconnect..."); + + // Attempt to reconnect + tryReconnect(); }); + function tryReconnect() { + if (reconnectAttempts < 5) { + reconnectAttempts++; + setTimeout(() => { + if (!sessionActive) { + socket.connect(); + } + }, 1000 * reconnectAttempts); + } else { + addStatusMessage("Failed to reconnect. 
Please refresh the page."); + } + } + socket.on('ready', (data) => { status.textContent = data.message; setupAudioRecording(); @@ -230,34 +395,87 @@ socket.on('transcription', (data) => { addMessage('user', data.text); status.textContent = 'Assistant is thinking...'; + visualizerLabel.textContent = 'Processing...'; }); socket.on('audio_response', (data) => { // Play audio status.textContent = 'Playing response...'; + visualizerLabel.textContent = 'Assistant speaking...'; + + // Create audio element const audio = new Audio('data:audio/wav;base64,' + data.audio); + // Visualize assistant audio if visualizer is active + if (visualizerActive) { + visualizeResponseAudio(audio); + } + audio.onended = () => { status.textContent = 'Ready to record'; + visualizerLabel.textContent = 'Listening...'; + if (audioBufferSource) { + audioBufferSource.disconnect(); + audioBufferSource = null; + } }; audio.onerror = () => { status.textContent = 'Error playing audio'; + visualizerLabel.textContent = 'Listening...'; console.error('Error playing audio response'); }; audio.play().catch(err => { status.textContent = 'Error playing audio: ' + err.message; + visualizerLabel.textContent = 'Listening...'; console.error('Error playing audio:', err); }); // Display text - addMessage('bot', data.text); + addMessage('bot', data.text, false); + }); + + // Visualize response audio + async function visualizeResponseAudio(audioElement) { + try { + // Create media element source + const audioSource = audioContext.createMediaElementSource(audioElement); + + // Create analyser + const analyser = audioContext.createAnalyser(); + analyser.fftSize = 2048; + + // Connect + audioSource.connect(analyser); + analyser.connect(audioContext.destination); + + // Store reference + audioAnalyser = analyser; + + // Start visualization + drawVisualizer(); + } catch (e) { + console.error('Error setting up audio visualization:', e); + } + } + + // Handle text-only responses when audio generation isn't available + socket.on('text_response', (data) => { + status.textContent = 'Received text response'; + visualizerLabel.textContent = 'Text only (no audio)'; + addMessage('bot', data.text, true); + setTimeout(() => { + status.textContent = 'Ready to record'; + visualizerLabel.textContent = 'Listening...'; + }, 1000); }); socket.on('error', (data) => { status.textContent = 'Error: ' + data.message; + visualizerLabel.textContent = 'Error occurred'; console.error('Server error:', data.message); + addStatusMessage("Error: " + data.message); }); function setupAudioRecording() { @@ -267,9 +485,29 @@ return; } + // Set up canvas + setupCanvas(); + // Get user media navigator.mediaDevices.getUserMedia({ audio: true }) .then(stream => { + // Store stream for visualizer + audioStream = stream; + + // Create audio analyser for visualization + const source = audioContext.createMediaStreamSource(stream); + const analyser = audioContext.createAnalyser(); + analyser.fftSize = 2048; + source.connect(analyser); + + // Store analyser for visualization + audioAnalyser = analyser; + + // Start visualizer if enabled + if (visualizerActive) { + drawVisualizer(); + } + // Setup recording with better audio quality const options = { mimeType: 'audio/webm', @@ -293,12 +531,6 @@ processRecording(); }; - // Create audio analyzer for visualization - const source = audioContext.createMediaStreamSource(stream); - const analyzer = audioContext.createAnalyser(); - analyzer.fftSize = 2048; - source.connect(analyzer); - // Setup button handlers with better touch handling 
recordButton.addEventListener('mousedown', startRecording); recordButton.addEventListener('touchstart', (e) => { @@ -322,6 +554,57 @@ }); } + // Draw visualizer animation + function drawVisualizer() { + if (!visualizerActive || !audioAnalyser) { + return; + } + + visualizerAnimationId = requestAnimationFrame(drawVisualizer); + + const bufferLength = audioAnalyser.frequencyBinCount; + const dataArray = new Uint8Array(bufferLength); + + // Get frequency data + audioAnalyser.getByteFrequencyData(dataArray); + + // Clear canvas + canvasCtx.fillStyle = '#000'; + canvasCtx.fillRect(0, 0, canvas.width, canvas.height); + + // Draw visualization based on audio data + const barWidth = (canvas.width / bufferLength) * 2.5; + let x = 0; + + // Choose color based on state + let gradient; + if (isRecording) { + // Red gradient for recording + gradient = canvasCtx.createLinearGradient(0, 0, 0, canvas.height); + gradient.addColorStop(0, 'rgba(255, 0, 0, 0.8)'); + gradient.addColorStop(1, 'rgba(255, 80, 80, 0.2)'); + } else if (visualizerLabel.textContent === 'Assistant speaking...') { + // Blue gradient for assistant + gradient = canvasCtx.createLinearGradient(0, 0, 0, canvas.height); + gradient.addColorStop(0, 'rgba(0, 120, 255, 0.8)'); + gradient.addColorStop(1, 'rgba(80, 160, 255, 0.2)'); + } else { + // Green gradient for listening + gradient = canvasCtx.createLinearGradient(0, 0, 0, canvas.height); + gradient.addColorStop(0, 'rgba(0, 200, 80, 0.8)'); + gradient.addColorStop(1, 'rgba(80, 255, 120, 0.2)'); + } + + for (let i = 0; i < bufferLength; i++) { + const barHeight = (dataArray[i] / 255) * canvas.height; + + canvasCtx.fillStyle = gradient; + canvasCtx.fillRect(x, canvas.height - barHeight, barWidth, barHeight); + + x += barWidth + 1; + } + } + function startRecording() { if (!isRecording && sessionActive) { audioChunks = []; @@ -329,6 +612,7 @@ recordButton.classList.add('recording'); recordButton.textContent = 'Release to Stop'; status.textContent = 'Recording...'; + visualizerLabel.textContent = 'Recording...'; audioWave.classList.remove('hidden'); isRecording = true; @@ -350,6 +634,7 @@ recordButton.classList.remove('recording'); recordButton.textContent = 'Hold to Speak'; status.textContent = 'Processing speech...'; + visualizerLabel.textContent = 'Processing...'; audioWave.classList.add('hidden'); isRecording = false; } @@ -358,6 +643,7 @@ function processRecording() { if (audioChunks.length === 0) { status.textContent = 'No audio recorded'; + visualizerLabel.textContent = 'Listening...'; return; } @@ -380,11 +666,13 @@ } catch (e) { console.error('Error processing audio:', e); status.textContent = 'Error processing audio'; + visualizerLabel.textContent = 'Error'; } }; fileReader.onerror = () => { status.textContent = 'Error reading audio data'; + visualizerLabel.textContent = 'Error'; }; fileReader.readAsArrayBuffer(audioBlob); @@ -403,7 +691,7 @@ return float32Array; } - function addMessage(sender, text) { + function addMessage(sender, text, textOnly = false) { const containerDiv = document.createElement('div'); containerDiv.className = sender === 'user' ? 
'message-container user-message-container' : 'message-container bot-message-container'; @@ -422,12 +710,39 @@ infoDiv.className = 'transcription-info'; infoDiv.textContent = 'Transcribed with Whisper'; containerDiv.appendChild(infoDiv); + } else if (textOnly) { + // Add indicator for text-only response + const textOnlyDiv = document.createElement('div'); + textOnlyDiv.className = 'text-only-indicator'; + textOnlyDiv.textContent = 'Text-only response (audio unavailable)'; + containerDiv.appendChild(textOnlyDiv); } conversation.appendChild(containerDiv); conversation.scrollTop = conversation.scrollHeight; } + function addStatusMessage(message) { + const statusDiv = document.createElement('div'); + statusDiv.className = 'status-message'; + statusDiv.textContent = message; + conversation.appendChild(statusDiv); + conversation.scrollTop = conversation.scrollHeight; + + // Auto-remove status messages after 10 seconds + setTimeout(() => { + if (conversation.contains(statusDiv)) { + statusDiv.style.opacity = '0'; + statusDiv.style.transition = 'opacity 0.5s'; + setTimeout(() => { + if (conversation.contains(statusDiv)) { + conversation.removeChild(statusDiv); + } + }, 500); + } + }, 10000); + } + function arrayBufferToBase64(buffer) { let binary = ''; const bytes = new Uint8Array(buffer); @@ -450,6 +765,34 @@ if (socket && socket.connected) { socket.disconnect(); } + + if (visualizerAnimationId) { + cancelAnimationFrame(visualizerAnimationId); + } + }); + + // Add a reload button for debugging + const reloadButton = document.createElement('button'); + reloadButton.textContent = '🔄 Reload'; + reloadButton.style.position = 'fixed'; + reloadButton.style.bottom = '10px'; + reloadButton.style.right = '10px'; + reloadButton.style.padding = '5px 10px'; + reloadButton.style.fontSize = '12px'; + reloadButton.style.backgroundColor = '#f5f5f5'; + reloadButton.style.border = '1px solid #ddd'; + reloadButton.style.borderRadius = '4px'; + reloadButton.style.cursor = 'pointer'; + + reloadButton.addEventListener('click', () => { + window.location.reload(); + }); + + document.body.appendChild(reloadButton); + + // Handle window resize to update canvas size + window.addEventListener('resize', () => { + setupCanvas(); }); diff --git a/Backend/server.py b/Backend/server.py index 1754a1b..85bf97b 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -12,6 +12,10 @@ from collections import deque import requests import huggingface_hub from generator import load_csm_1b, Segment +import threading +import queue +from flask import stream_with_context, Response +import time # Configure environment with longer timeouts os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600" # 10 minutes timeout for downloads @@ -124,6 +128,8 @@ def load_models(): # Store conversation context conversation_context = {} # session_id -> context +CHUNK_SIZE = 24000 # Number of audio samples per chunk (1 second at 24kHz) +audio_stream_queues = {} # session_id -> queue for audio chunks @app.route('/') def index(): @@ -144,8 +150,14 @@ def handle_connect(): @socketio.on('disconnect') def handle_disconnect(): print(f"Client disconnected: {request.sid}") - if request.sid in conversation_context: - del conversation_context[request.sid] + session_id = request.sid + + # Clean up resources + if session_id in conversation_context: + del conversation_context[session_id] + + if session_id in audio_stream_queues: + del audio_stream_queues[session_id] @socketio.on('start_speaking') def handle_start_speaking(): @@ -191,7 +203,7 @@ def is_silence(audio_tensor, 
threshold=0.02): return torch.mean(torch.abs(audio_tensor)) < threshold def process_user_utterance(session_id): - """Process completed user utterance, generate response and send audio back""" + """Process completed user utterance, generate response and stream audio back""" context = conversation_context[session_id] if not context['audio_buffer']: @@ -234,37 +246,32 @@ def process_user_utterance(session_id): ) context['segments'].append(user_segment) - # Generate bot response + # Generate bot response text bot_response = generate_llm_response(user_text, context['segments']) print(f"Bot response: {bot_response}") # Send transcribed text to client emit('transcription', {'text': user_text}, room=session_id) - # Generate and send audio response if CSM is available + # Generate and stream audio response if CSM is available if csm_generator is not None: - # Convert to audio using CSM - bot_audio = generate_audio_response(bot_response, context['segments']) + # Set up streaming queue for this session + if session_id not in audio_stream_queues: + audio_stream_queues[session_id] = queue.Queue() + else: + # Clear any existing items in the queue + while not audio_stream_queues[session_id].empty(): + audio_stream_queues[session_id].get() - # Convert audio to base64 for sending over websocket - audio_bytes = io.BytesIO() - torchaudio.save(audio_bytes, bot_audio.unsqueeze(0).cpu(), csm_generator.sample_rate, format="wav") - audio_bytes.seek(0) - audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8') + # Start audio generation in a separate thread to not block the server + threading.Thread( + target=generate_and_stream_audio, + args=(bot_response, context['segments'], session_id), + daemon=True + ).start() - # Add bot response to conversation history - bot_segment = Segment( - text=bot_response, - speaker=1, # Bot is speaker 1 - audio=bot_audio - ) - context['segments'].append(bot_segment) - - # Send audio response to client - emit('audio_response', { - 'audio': audio_b64, - 'text': bot_response - }, room=session_id) + # Initial response with text + emit('start_streaming_response', {'text': bot_response}, room=session_id) else: # Send text-only response if audio generation isn't available emit('text_response', {'text': bot_response}, room=session_id) @@ -391,6 +398,98 @@ def generate_audio_response(text, conversation_segments): # Return silence as fallback return torch.zeros(csm_generator.sample_rate * 3) # 3 seconds of silence +def generate_and_stream_audio(text, conversation_segments, session_id): + """Generate audio response using CSM and stream it in chunks""" + try: + # Use the last few conversation segments as context + context_segments = conversation_segments[-4:] if len(conversation_segments) > 4 else conversation_segments + + # Generate full audio for bot response + audio = csm_generator.generate( + text=text, + speaker=1, # Bot is speaker 1 + context=context_segments, + max_audio_length_ms=10000, # 10 seconds max + temperature=0.9, + topk=50 + ) + + # Store the full audio for conversation history + bot_segment = Segment( + text=text, + speaker=1, # Bot is speaker 1 + audio=audio + ) + if session_id in conversation_context: + conversation_context[session_id]['segments'].append(bot_segment) + + # Split audio into chunks for streaming + chunk_size = CHUNK_SIZE + for i in range(0, len(audio), chunk_size): + chunk = audio[i:i+chunk_size] + + # Convert audio chunk to base64 for streaming + audio_bytes = io.BytesIO() + torchaudio.save(audio_bytes, chunk.unsqueeze(0).cpu(), 
csm_generator.sample_rate, format="wav") + audio_bytes.seek(0) + audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8') + + # Send the chunk to the client + if session_id in audio_stream_queues: + audio_stream_queues[session_id].put({ + 'audio': audio_b64, + 'is_last': i + chunk_size >= len(audio) + }) + else: + # Session was disconnected before we finished generating + break + + # Signal the end of streaming if queue still exists + if session_id in audio_stream_queues: + # Add an empty chunk as a sentinel to signal end of streaming + audio_stream_queues[session_id].put(None) + + except Exception as e: + print(f"Error generating or streaming audio: {e}") + # Send error message to client + if session_id in conversation_context: + socketio.emit('error', { + 'message': f'Error generating audio: {str(e)}' + }, room=session_id) + + # Send a final message to unblock the client + if session_id in audio_stream_queues: + audio_stream_queues[session_id].put(None) + +@socketio.on('request_audio_chunk') +def handle_request_audio_chunk(): + """Send the next audio chunk in the queue to the client""" + session_id = request.sid + + if session_id not in audio_stream_queues: + emit('error', {'message': 'No audio stream available'}) + return + + # Get the next chunk or wait for it to be available + try: + if not audio_stream_queues[session_id].empty(): + chunk = audio_stream_queues[session_id].get(block=False) + + # If chunk is None, we're done streaming + if chunk is None: + emit('end_streaming') + # Clean up the queue + if session_id in audio_stream_queues: + del audio_stream_queues[session_id] + else: + emit('audio_chunk', chunk) + else: + # If the queue is empty but we're still generating, tell client to wait + emit('wait_for_chunk') + except Exception as e: + print(f"Error sending audio chunk: {e}") + emit('error', {'message': f'Error streaming audio: {str(e)}'}) + if __name__ == '__main__': # Ensure the existing index.html file is in the correct location if not os.path.exists('templates'): From 7fd520fd8685e1ee292f8c8ef837ca8c1963b8d2 Mon Sep 17 00:00:00 2001 From: BGV <26331505+bgv2@users.noreply.github.com> Date: Sun, 30 Mar 2025 08:40:02 -0400 Subject: [PATCH 03/10] change name of site --- React/src/app/layout.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/React/src/app/layout.tsx b/React/src/app/layout.tsx index 43900a8..8dccd6b 100644 --- a/React/src/app/layout.tsx +++ b/React/src/app/layout.tsx @@ -13,8 +13,8 @@ const geistMono = Geist_Mono({ }); export const metadata: Metadata = { - title: "Create Next App", - description: "Generated by create next app", + title: "Fauxcall", + description: "Fauxcall is a fake call app that helps you get out of awkward situations.", }; export default function RootLayout({ From d2bc4731f7b2c76d666bf82e00be9782fb19e229 Mon Sep 17 00:00:00 2001 From: BGV <26331505+bgv2@users.noreply.github.com> Date: Sun, 30 Mar 2025 08:40:31 -0400 Subject: [PATCH 04/10] fix multiple emergency contact imput --- React/src/app/page.tsx | 63 ++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 39 deletions(-) diff --git a/React/src/app/page.tsx b/React/src/app/page.tsx index 21e0862..f34f37b 100644 --- a/React/src/app/page.tsx +++ b/React/src/app/page.tsx @@ -3,7 +3,7 @@ import { useState, useEffect } from "react"; import { useRouter } from "next/navigation"; export default function Home() { - const [contacts, setContacts] = useState([]); + const [contacts, setContacts] = useState([""]); const [codeword, setCodeword] = 
useState(""); const [session, setSession] = useState(null); const [loading, setLoading] = useState(true); @@ -23,6 +23,16 @@ export default function Home() { }); }, []); + const handleInputChange = (index: number, value: string) => { + const updatedContacts = [...contacts]; + updatedContacts[index] = value; // Update the specific input value + setContacts(updatedContacts); + }; + + const addContactInput = () => { + setContacts([...contacts, ""]); // Add a new empty input + }; + function saveToDB() { alert("Saving contacts..."); const contactInputs = document.querySelectorAll( @@ -181,45 +191,20 @@ export default function Home() { className="space-y-5 flex flex-col gap-[32px] row-start-2 items-center sm:items-start" onSubmit={(e) => e.preventDefault()} > - setContacts(e.target.value.split(","))} - placeholder="Write down an emergency contact" - className="border border-gray-300 rounded-md p-2" - /> - setContacts(e.target.value.split(","))} - placeholder="Write down an emergency contact" - className="border border-gray-300 rounded-md p-2" - /> - setContacts(e.target.value.split(","))} - placeholder="Write down an emergency contact" - className="border border-gray-300 rounded-md p-2" - /> - setContacts(e.target.value.split(","))} - placeholder="Write down an emergency contact" - className="text-input border border-gray-300 rounded-md p-2" - /> + {contacts.map((contact, index) => ( + handleInputChange(index, e.target.value)} + placeholder={`Contact ${index + 1}`} + className="border border-gray-300 rounded-md p-2" + /> + ))} - - -
Connecting to server...
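The speech/silence decisions in these server patches all come down to a mean-absolute-amplitude threshold: the is_silence helper from the earlier patch uses 0.02, and the inline noise gate in the streaming handler that follows drops chunks quieter than 0.01. A self-contained sketch of both checks, with those threshold values taken from the patches:

import numpy as np
import torch

def is_silence(audio: torch.Tensor, threshold: float = 0.02) -> bool:
    """True when the mean absolute amplitude of a float waveform is below threshold."""
    return torch.mean(torch.abs(audio)).item() < threshold

def passes_noise_gate(chunk: np.ndarray, gate: float = 0.01) -> bool:
    """Reject very quiet chunks before they reach the audio buffer."""
    return float(np.mean(np.abs(chunk))) >= gate

if __name__ == "__main__":
    quiet = torch.full((2400,), 0.001)
    loud = 0.5 * torch.sin(torch.linspace(0, 100, 2400))
    print(is_silence(quiet), is_silence(loud))                                # True False
    print(passes_noise_gate(quiet.numpy()), passes_noise_gate(loud.numpy()))  # False True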
\ No newline at end of file diff --git a/Backend/server.py b/Backend/server.py index 2b8e126..05abd99 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -14,8 +14,8 @@ import huggingface_hub from generator import load_csm_1b, Segment import threading import queue -from flask import stream_with_context, Response -import time +import asyncio +import json # Configure environment with longer timeouts os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600" # 10 minutes timeout for downloads @@ -26,7 +26,7 @@ os.makedirs("models", exist_ok=True) app = Flask(__name__) app.config['SECRET_KEY'] = 'your-secret-key' -socketio = SocketIO(app, cors_allowed_origins="*") +socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet') # Explicitly check for CUDA and print more detailed info print("\n=== CUDA Information ===") @@ -128,8 +128,7 @@ def load_models(): # Store conversation context conversation_context = {} # session_id -> context -CHUNK_SIZE = 24000 # Number of audio samples per chunk (1 second at 24kHz) -audio_stream_queues = {} # session_id -> queue for audio chunks +active_audio_streams = {} # session_id -> stream status @app.route('/') def index(): @@ -143,9 +142,14 @@ def handle_connect(): 'speakers': [0, 1], # 0 = user, 1 = bot 'audio_buffer': deque(maxlen=10), # Store recent audio chunks 'is_speaking': False, - 'silence_start': None + 'last_activity': time.time(), + 'active_session': True, + 'transcription_buffer': [] # For real-time transcription } - emit('ready', {'message': 'Connection established'}) + emit('ready', { + 'message': 'Connection established', + 'sample_rate': getattr(csm_generator, 'sample_rate', 24000) if csm_generator else 24000 + }) @socketio.on('disconnect') def handle_disconnect(): @@ -154,56 +158,130 @@ def handle_disconnect(): # Clean up resources if session_id in conversation_context: + conversation_context[session_id]['active_session'] = False del conversation_context[session_id] - if session_id in audio_stream_queues: - del audio_stream_queues[session_id] + if session_id in active_audio_streams: + active_audio_streams[session_id]['active'] = False + del active_audio_streams[session_id] -@socketio.on('start_speaking') -def handle_start_speaking(): - if request.sid in conversation_context: - conversation_context[request.sid]['is_speaking'] = True - conversation_context[request.sid]['audio_buffer'].clear() - print(f"User {request.sid} started speaking") - -@socketio.on('audio_chunk') -def handle_audio_chunk(data): - if request.sid not in conversation_context: +@socketio.on('audio_stream') +def handle_audio_stream(data): + """Handle incoming audio stream from client""" + session_id = request.sid + + if session_id not in conversation_context: return - context = conversation_context[request.sid] + context = conversation_context[session_id] + context['last_activity'] = time.time() - # Decode audio data - audio_data = base64.b64decode(data['audio']) - audio_numpy = np.frombuffer(audio_data, dtype=np.float32) - audio_tensor = torch.tensor(audio_numpy) + # Process different stream events + if data.get('event') == 'start': + # Client is starting to send audio + context['is_speaking'] = True + context['audio_buffer'].clear() + context['transcription_buffer'] = [] + print(f"User {session_id} started streaming audio") + + # If AI was speaking, interrupt it + if session_id in active_audio_streams and active_audio_streams[session_id]['active']: + active_audio_streams[session_id]['active'] = False + emit('ai_stream_interrupt', {}, room=session_id) - # Add to buffer - 
context['audio_buffer'].append(audio_tensor) + elif data.get('event') == 'data': + # Audio data received + if not context['is_speaking']: + return + + # Decode audio chunk + try: + audio_data = base64.b64decode(data.get('audio', '')) + if not audio_data: + return + + audio_numpy = np.frombuffer(audio_data, dtype=np.float32) + + # Apply a simple noise gate + if np.mean(np.abs(audio_numpy)) < 0.01: # Very quiet + return + + audio_tensor = torch.tensor(audio_numpy) + + # Add to audio buffer + context['audio_buffer'].append(audio_tensor) + + # Real-time transcription (periodic) + if len(context['audio_buffer']) % 3 == 0: # Process every 3 chunks + threading.Thread( + target=process_realtime_transcription, + args=(session_id,), + daemon=True + ).start() + except Exception as e: + print(f"Error processing audio chunk: {e}") - # Check for silence to detect end of speech - if context['is_speaking'] and is_silence(audio_tensor): - if context['silence_start'] is None: - context['silence_start'] = time.time() - elif time.time() - context['silence_start'] > 1.0: # 1 second of silence + elif data.get('event') == 'end': + # Client has finished sending audio + context['is_speaking'] = False + + if len(context['audio_buffer']) > 0: # Process the complete utterance - process_user_utterance(request.sid) - else: - context['silence_start'] = None + threading.Thread( + target=process_complete_utterance, + args=(session_id,), + daemon=True + ).start() + + print(f"User {session_id} stopped streaming audio") -@socketio.on('stop_speaking') -def handle_stop_speaking(): - if request.sid in conversation_context: - conversation_context[request.sid]['is_speaking'] = False - process_user_utterance(request.sid) - print(f"User {request.sid} stopped speaking") +def process_realtime_transcription(session_id): + """Process incoming audio for real-time transcription""" + if session_id not in conversation_context or not conversation_context[session_id]['active_session']: + return + + context = conversation_context[session_id] + + if not context['audio_buffer'] or not context['is_speaking']: + return + + try: + # Combine current buffer for transcription + buffer_copy = list(context['audio_buffer']) + if not buffer_copy: + return + + full_audio = torch.cat(buffer_copy, dim=0) + + # Save audio to temporary WAV file for transcription + temp_audio_path = f"temp_rt_{session_id}.wav" + torchaudio.save( + temp_audio_path, + full_audio.unsqueeze(0), + 44100 # Assuming 44.1kHz from client + ) + + # Transcribe with Whisper if available + if whisper_model is not None: + segments, _ = whisper_model.transcribe(temp_audio_path, beam_size=5) + text = " ".join([segment.text for segment in segments]) + + if text.strip(): + context['transcription_buffer'].append(text) + # Send partial transcription to client + emit('partial_transcription', {'text': text}, room=session_id) + except Exception as e: + print(f"Error in realtime transcription: {e}") + finally: + # Clean up + if os.path.exists(temp_audio_path): + os.remove(temp_audio_path) -def is_silence(audio_tensor, threshold=0.02): - """Check if an audio chunk is silence based on amplitude threshold""" - return torch.mean(torch.abs(audio_tensor)) < threshold - -def process_user_utterance(session_id): +def process_complete_utterance(session_id): """Process completed user utterance, generate response and stream audio back""" + if session_id not in conversation_context or not conversation_context[session_id]['active_session']: + return + context = conversation_context[session_id] if not 
context['audio_buffer']: @@ -212,8 +290,6 @@ def process_user_utterance(session_id): # Combine audio chunks full_audio = torch.cat(list(context['audio_buffer']), dim=0) context['audio_buffer'].clear() - context['is_speaking'] = False - context['silence_start'] = None # Save audio to temporary WAV file for transcription temp_audio_path = f"temp_audio_{session_id}.wav" @@ -255,23 +331,23 @@ def process_user_utterance(session_id): # Generate and stream audio response if CSM is available if csm_generator is not None: - # Set up streaming queue for this session - if session_id not in audio_stream_queues: - audio_stream_queues[session_id] = queue.Queue() - else: - # Clear any existing items in the queue - while not audio_stream_queues[session_id].empty(): - audio_stream_queues[session_id].get() + # Create stream state object + active_audio_streams[session_id] = { + 'active': True, + 'text': bot_response + } - # Start audio generation in a separate thread to not block the server + # Send initial response to prepare client + emit('ai_stream_start', { + 'text': bot_response + }, room=session_id) + + # Start audio generation in a separate thread threading.Thread( - target=generate_and_stream_audio, + target=generate_and_stream_audio_realtime, args=(bot_response, context['segments'], session_id), daemon=True ).start() - - # Initial response with text - emit('start_streaming_response', {'text': bot_response}, room=session_id) else: # Send text-only response if audio generation isn't available emit('text_response', {'text': bot_response}, room=session_id) @@ -378,8 +454,11 @@ def fallback_response(user_text): else: return "I understand you said something about that. Unfortunately, I'm running in fallback mode with limited capabilities. Please try again later when the main model is available." 
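Both streaming variants in this series share the same chunk-and-encode step: slice the generated waveform, serialize each slice as a small WAV, and base64-encode it for the Socket.IO payload (the queue-based generate_and_stream_audio above uses 1-second chunks, the real-time generate_and_stream_audio_realtime introduced just below uses 200 ms ones). A minimal sketch of that step, assuming a 1-D float tensor at the 24 kHz CSM sample rate; wav_chunks_b64 is an illustrative name, not a function from the patches.

import base64
import io

import torch
import torchaudio

def wav_chunks_b64(audio: torch.Tensor, sample_rate: int = 24000, chunk_size: int = 4800):
    """Yield (base64_wav, is_last) pairs for a 1-D waveform.

    chunk_size=4800 samples is 200 ms at 24 kHz, matching the real-time patch.
    """
    for start in range(0, len(audio), chunk_size):
        chunk = audio[start:start + chunk_size]
        buf = io.BytesIO()
        torchaudio.save(buf, chunk.unsqueeze(0).cpu(), sample_rate, format="wav")
        buf.seek(0)
        yield base64.b64encode(buf.read()).decode("utf-8"), start + chunk_size >= len(audio)

if __name__ == "__main__":
    waveform = torch.zeros(24000)  # one second of silence
    for i, (payload, is_last) in enumerate(wav_chunks_b64(waveform)):
        print(i, len(payload), is_last)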
-def generate_audio_response(text, conversation_segments): - """Generate audio response using CSM""" +def generate_and_stream_audio_realtime(text, conversation_segments, session_id): + """Generate audio response using CSM and stream it in real-time to client""" + if session_id not in active_audio_streams or not active_audio_streams[session_id]['active']: + return + try: # Use the last few conversation segments as context context_segments = conversation_segments[-4:] if len(conversation_segments) > 4 else conversation_segments @@ -394,40 +473,23 @@ def generate_audio_response(text, conversation_segments): topk=50 ) - return audio - except Exception as e: - print(f"Error generating audio: {e}") - # Return silence as fallback - return torch.zeros(csm_generator.sample_rate * 3) # 3 seconds of silence - -def generate_and_stream_audio(text, conversation_segments, session_id): - """Generate audio response using CSM and stream it in chunks""" - try: - # Use the last few conversation segments as context - context_segments = conversation_segments[-4:] if len(conversation_segments) > 4 else conversation_segments - - # Generate full audio for bot response - audio = csm_generator.generate( - text=text, - speaker=1, # Bot is speaker 1 - context=context_segments, - max_audio_length_ms=10000, # 10 seconds max - temperature=0.9, - topk=50 - ) - # Store the full audio for conversation history bot_segment = Segment( text=text, speaker=1, # Bot is speaker 1 audio=audio ) - if session_id in conversation_context: + if session_id in conversation_context and conversation_context[session_id]['active_session']: conversation_context[session_id]['segments'].append(bot_segment) - # Split audio into chunks for streaming - chunk_size = CHUNK_SIZE + # Stream audio in small chunks for more responsive playback + chunk_size = 4800 # 200ms at 24kHz + for i in range(0, len(audio), chunk_size): + if session_id not in active_audio_streams or not active_audio_streams[session_id]['active']: + print("Audio streaming interrupted or session ended") + break + chunk = audio[i:i+chunk_size] # Convert audio chunk to base64 for streaming @@ -436,61 +498,33 @@ def generate_and_stream_audio(text, conversation_segments, session_id): audio_bytes.seek(0) audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8') - # Send the chunk to the client - if session_id in audio_stream_queues: - audio_stream_queues[session_id].put({ - 'audio': audio_b64, - 'is_last': i + chunk_size >= len(audio) - }) - else: - # Session was disconnected before we finished generating - break - - # Signal the end of streaming if queue still exists - if session_id in audio_stream_queues: - # Add an empty chunk as a sentinel to signal end of streaming - audio_stream_queues[session_id].put(None) + # Send chunk to client + socketio.emit('ai_stream_data', { + 'audio': audio_b64, + 'is_last': i + chunk_size >= len(audio) + }, room=session_id) + + # Simulate real-time speech by adding a small delay + # Remove this in production for faster response + time.sleep(0.15) # Slight delay for more natural timing + + # Signal end of stream + if session_id in active_audio_streams and active_audio_streams[session_id]['active']: + socketio.emit('ai_stream_end', {}, room=session_id) + active_audio_streams[session_id]['active'] = False except Exception as e: print(f"Error generating or streaming audio: {e}") # Send error message to client - if session_id in conversation_context: + if session_id in conversation_context and conversation_context[session_id]['active_session']: 
socketio.emit('error', { 'message': f'Error generating audio: {str(e)}' }, room=session_id) - # Send a final message to unblock the client - if session_id in audio_stream_queues: - audio_stream_queues[session_id].put(None) - -@socketio.on('request_audio_chunk') -def handle_request_audio_chunk(): - """Send the next audio chunk in the queue to the client""" - session_id = request.sid - - if session_id not in audio_stream_queues: - emit('error', {'message': 'No audio stream available'}) - return - - # Get the next chunk or wait for it to be available - try: - if not audio_stream_queues[session_id].empty(): - chunk = audio_stream_queues[session_id].get(block=False) - - # If chunk is None, we're done streaming - if chunk is None: - emit('end_streaming') - # Clean up the queue - if session_id in audio_stream_queues: - del audio_stream_queues[session_id] - else: - emit('audio_chunk', chunk) - else: - # If the queue is empty but we're still generating, tell client to wait - emit('wait_for_chunk') - except Exception as e: - print(f"Error sending audio chunk: {e}") - emit('error', {'message': f'Error streaming audio: {str(e)}'}) + # Signal stream end to unblock client + socketio.emit('ai_stream_end', {}, room=session_id) + if session_id in active_audio_streams: + active_audio_streams[session_id]['active'] = False if __name__ == '__main__': # Ensure the existing index.html file is in the correct location @@ -500,10 +534,10 @@ if __name__ == '__main__': if os.path.exists('index.html') and not os.path.exists('templates/index.html'): os.rename('index.html', 'templates/index.html') - # Load models asynchronously before starting the server + # Load models before starting the server print("Starting model loading...") load_models() - # Start the server + # Start the server with eventlet for better WebSocket performance print("Starting Flask SocketIO server...") socketio.run(app, host='0.0.0.0', port=5000, debug=False) \ No newline at end of file From e69d9c5da1b8b101459db335f45908c86ba4b2b1 Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sun, 30 Mar 2025 09:17:31 -0400 Subject: [PATCH 09/10] Demo Update 22 --- Backend/index.html | 266 +++++------------ Backend/server.py | 693 +++++++++++++++++++++++++++------------------ 2 files changed, 496 insertions(+), 463 deletions(-) diff --git a/Backend/index.html b/Backend/index.html index 59b4903..64b4b9c 100644 --- a/Backend/index.html +++ b/Backend/index.html @@ -3,7 +3,7 @@ - Live Voice Assistant with CSM + Real-Time Voice Assistant -

Live Voice Assistant with CSM

+

Real-Time Voice Assistant

-
- Audio Visualizer - -
- -
+
Listening...
-
- +
Connecting to server...
+ AI Voice Chat -

Real-Time Voice Assistant

-
- -
- -
Listening...
-
- -
- -
- -
Connecting to server...
+
+
+

AI Voice Assistant

+
- +
+
+
+ Disconnected +
+
+ +
+
+ +
+
Your conversation will appear here.
+
+ + + +
+ + +
+ + + +
+

Status

+
+
+
Whisper Model: Loading...
+
+
+
CSM Audio Model: Loading...
+
+
+
LLM Model: Loading...
+
+
+
WebRTC: Not Connected
+
+
+
+
+ +
+

AI Voice Assistant | Using Faster-Whisper, Llama 3.2, and CSM Audio Models

+
+ + + \ No newline at end of file diff --git a/Backend/server.py b/Backend/server.py index bf365fa..af76560 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -149,6 +149,11 @@ def index(): """Serve the main interface""" return render_template('index.html') +@app.route('/voice-chat.js') +def voice_chat_js(): + """Serve the JavaScript for voice chat""" + return app.send_static_file('voice-chat.js') + @socketio.on('connect') def handle_connect(): """Handle new client connection""" diff --git a/Backend/voice-chat.js b/Backend/voice-chat.js new file mode 100644 index 0000000..12bac9a --- /dev/null +++ b/Backend/voice-chat.js @@ -0,0 +1,560 @@ +document.addEventListener('DOMContentLoaded', () => { + // DOM Elements + const startButton = document.getElementById('start-button'); + const interruptButton = document.getElementById('interrupt-button'); + const conversationDiv = document.getElementById('conversation'); + const connectionDot = document.getElementById('connection-dot'); + const connectionStatus = document.getElementById('connection-status'); + const whisperStatus = document.getElementById('whisper-status'); + const csmStatus = document.getElementById('csm-status'); + const llmStatus = document.getElementById('llm-status'); + const webrtcStatus = document.getElementById('webrtc-status'); + const micAnimation = document.getElementById('mic-animation'); + const loadingDiv = document.getElementById('loading'); + const loadingText = document.getElementById('loading-text'); + + // State variables + let socket; + let isConnected = false; + let isListening = false; + let isAiSpeaking = false; + let audioContext; + let mediaStream; + let audioRecorder; + let audioProcessor; + const audioChunks = []; + + // WebRTC variables + let peerConnection; + let dataChannel; + let hasActiveConnection = false; + + // Audio playback + let audioQueue = []; + let isPlaying = false; + + // Configuration variables + let serverSampleRate = 24000; + let clientSampleRate = 44100; + let iceServers = []; + + // Initialize the application + initApp(); + + // Main initialization function + function initApp() { + updateConnectionStatus('connecting'); + setupSocketConnection(); + setupEventListeners(); + } + + // Set up Socket.IO connection with server + function setupSocketConnection() { + socket = io(); + + socket.on('connect', () => { + console.log('Connected to server'); + updateConnectionStatus('connected'); + isConnected = true; + }); + + socket.on('disconnect', () => { + console.log('Disconnected from server'); + updateConnectionStatus('disconnected'); + isConnected = false; + cleanupAudio(); + cleanupWebRTC(); + }); + + socket.on('session_ready', (data) => { + console.log('Session ready:', data); + updateModelStatus(data); + clientSampleRate = data.client_sample_rate; + serverSampleRate = data.server_sample_rate; + iceServers = data.ice_servers; + + // Initialize WebRTC if models are available + if (data.whisper_available && data.llm_available) { + initializeWebRTC(); + } + }); + + socket.on('ready_for_speech', (data) => { + console.log('Ready for speech:', data); + startButton.disabled = false; + addInfoMessage('Ready for conversation. 
Click "Start Listening" to begin.'); + }); + + socket.on('webrtc_signal', (data) => { + handleWebRTCSignal(data); + }); + + socket.on('transcription', (data) => { + console.log('Transcription:', data); + addUserMessage(data.text); + loadingDiv.style.display = 'none'; + }); + + socket.on('ai_response_text', (data) => { + console.log('AI response text:', data); + addAIMessage(data.text); + loadingDiv.style.display = 'none'; + }); + + socket.on('ai_speech_start', () => { + console.log('AI started speaking'); + isAiSpeaking = true; + interruptButton.disabled = false; + }); + + socket.on('ai_speech_chunk', (data) => { + console.log('Received AI speech chunk'); + playAudioChunk(data.audio, data.is_last); + }); + + socket.on('ai_speech_end', () => { + console.log('AI stopped speaking'); + isAiSpeaking = false; + interruptButton.disabled = true; + }); + + socket.on('user_speech_start', () => { + console.log('User speech detected'); + showSpeakingIndicator(true); + }); + + socket.on('processing_speech', () => { + console.log('Processing speech'); + showSpeakingIndicator(false); + showLoadingIndicator('Processing your speech...'); + }); + + socket.on('no_speech_detected', () => { + console.log('No speech detected'); + hideLoadingIndicator(); + addInfoMessage('No speech detected. Please try again.'); + }); + + socket.on('ai_interrupted', () => { + console.log('AI interrupted'); + clearAudioQueue(); + isAiSpeaking = false; + interruptButton.disabled = true; + }); + + socket.on('ai_interrupted_by_user', () => { + console.log('AI interrupted by user'); + clearAudioQueue(); + isAiSpeaking = false; + interruptButton.disabled = true; + addInfoMessage('AI interrupted by your speech'); + }); + + socket.on('error', (data) => { + console.error('Server error:', data); + hideLoadingIndicator(); + addInfoMessage(`Error: ${data.message}`); + }); + } + + // Set up UI event listeners + function setupEventListeners() { + startButton.addEventListener('click', toggleListening); + interruptButton.addEventListener('click', interruptAI); + } + + // Update UI connection status + function updateConnectionStatus(status) { + connectionDot.className = 'status-dot ' + status; + + switch (status) { + case 'connected': + connectionStatus.textContent = 'Connected'; + break; + case 'connecting': + connectionStatus.textContent = 'Connecting...'; + break; + case 'disconnected': + connectionStatus.textContent = 'Disconnected'; + startButton.disabled = true; + interruptButton.disabled = true; + break; + } + } + + // Update model status indicators + function updateModelStatus(data) { + whisperStatus.textContent = data.whisper_available ? 'Available' : 'Not Available'; + whisperStatus.style.color = data.whisper_available ? 'green' : 'red'; + + csmStatus.textContent = data.csm_available ? 'Available' : 'Not Available'; + csmStatus.style.color = data.csm_available ? 'green' : 'red'; + + llmStatus.textContent = data.llm_available ? 'Available' : 'Not Available'; + llmStatus.style.color = data.llm_available ? 
'green' : 'red'; + } + + // Initialize WebRTC connection + function initializeWebRTC() { + if (!isConnected) return; + + const configuration = { + iceServers: iceServers + }; + + peerConnection = new RTCPeerConnection(configuration); + + // Create data channel for WebRTC communication + dataChannel = peerConnection.createDataChannel('audioData', { + ordered: true + }); + + dataChannel.onopen = () => { + console.log('WebRTC data channel open'); + hasActiveConnection = true; + webrtcStatus.textContent = 'Connected'; + webrtcStatus.style.color = 'green'; + socket.emit('webrtc_connected', { status: 'connected' }); + }; + + dataChannel.onclose = () => { + console.log('WebRTC data channel closed'); + hasActiveConnection = false; + webrtcStatus.textContent = 'Disconnected'; + webrtcStatus.style.color = 'red'; + }; + + // Handle ICE candidates + peerConnection.onicecandidate = (event) => { + if (event.candidate) { + socket.emit('webrtc_signal', { + type: 'ice_candidate', + candidate: event.candidate + }); + } + }; + + // Log ICE connection state changes + peerConnection.oniceconnectionstatechange = () => { + console.log('ICE connection state:', peerConnection.iceConnectionState); + }; + + // Create offer + peerConnection.createOffer() + .then(offer => peerConnection.setLocalDescription(offer)) + .then(() => { + socket.emit('webrtc_signal', { + type: 'offer', + sdp: peerConnection.localDescription + }); + }) + .catch(error => { + console.error('Error creating WebRTC offer:', error); + webrtcStatus.textContent = 'Failed to Connect'; + webrtcStatus.style.color = 'red'; + }); + } + + // Handle WebRTC signals from the server + function handleWebRTCSignal(data) { + if (!peerConnection) return; + + if (data.type === 'answer') { + peerConnection.setRemoteDescription(new RTCSessionDescription(data.sdp)) + .catch(error => console.error('Error setting remote description:', error)); + } + else if (data.type === 'ice_candidate') { + peerConnection.addIceCandidate(new RTCIceCandidate(data.candidate)) + .catch(error => console.error('Error adding ICE candidate:', error)); + } + } + + // Clean up WebRTC connection + function cleanupWebRTC() { + if (dataChannel) { + dataChannel.close(); + } + + if (peerConnection) { + peerConnection.close(); + } + + dataChannel = null; + peerConnection = null; + hasActiveConnection = false; + webrtcStatus.textContent = 'Not Connected'; + webrtcStatus.style.color = 'red'; + } + + // Toggle audio listening + function toggleListening() { + if (isListening) { + stopListening(); + } else { + startListening(); + } + } + + // Start listening for audio + async function startListening() { + if (!isConnected) return; + + try { + await initAudio(); + isListening = true; + startButton.textContent = 'Stop Listening'; + startButton.innerHTML = ` + + + + Stop Listening + `; + } catch (error) { + console.error('Error starting audio:', error); + addInfoMessage('Error accessing microphone. 
Please check permissions.'); + } + } + + // Stop listening for audio + function stopListening() { + cleanupAudio(); + isListening = false; + startButton.innerHTML = ` + + + + Start Listening + `; + showSpeakingIndicator(false); + } + + // Initialize audio capture + async function initAudio() { + // Request microphone access + mediaStream = await navigator.mediaDevices.getUserMedia({ + audio: { + sampleRate: clientSampleRate, + channelCount: 1, + echoCancellation: true, + noiseSuppression: true, + autoGainControl: true + } + }); + + // Initialize AudioContext + audioContext = new (window.AudioContext || window.webkitAudioContext)({ + sampleRate: clientSampleRate + }); + + // Create audio source from stream + const source = audioContext.createMediaStreamSource(mediaStream); + + // Create ScriptProcessor for audio processing + const bufferSize = 4096; + audioProcessor = audioContext.createScriptProcessor(bufferSize, 1, 1); + + // Process audio data + audioProcessor.onaudioprocess = (event) => { + if (!isListening || isAiSpeaking) return; + + const input = event.inputBuffer.getChannelData(0); + const audioData = convertFloat32ToInt16(input); + sendAudioChunk(audioData); + }; + + // Connect the nodes + source.connect(audioProcessor); + audioProcessor.connect(audioContext.destination); + } + + // Clean up audio resources + function cleanupAudio() { + if (audioProcessor) { + audioProcessor.disconnect(); + audioProcessor = null; + } + + if (mediaStream) { + mediaStream.getTracks().forEach(track => track.stop()); + mediaStream = null; + } + + if (audioContext && audioContext.state !== 'closed') { + audioContext.close().catch(error => console.error('Error closing AudioContext:', error)); + } + + audioChunks.length = 0; + } + + // Convert Float32Array to Int16Array for sending to server + function convertFloat32ToInt16(float32Array) { + const int16Array = new Int16Array(float32Array.length); + for (let i = 0; i < float32Array.length; i++) { + // Convert float [-1.0, 1.0] to int16 [-32768, 32767] + int16Array[i] = Math.max(-32768, Math.min(32767, Math.floor(float32Array[i] * 32768))); + } + return int16Array; + } + + // Send audio chunk to server + function sendAudioChunk(audioData) { + if (!isConnected || !isListening) return; + + // Convert to base64 for transmission + const base64Audio = arrayBufferToBase64(audioData.buffer); + + // Send via Socket.IO (could use WebRTC's DataChannel for lower latency in production) + socket.emit('audio_stream', { audio: base64Audio }); + } + + // Play audio chunk received from server + function playAudioChunk(base64Audio, isLast) { + const audioData = base64ToArrayBuffer(base64Audio); + + // Add to queue + audioQueue.push({ + data: audioData, + isLast: isLast + }); + + // Start playing if not already playing + if (!isPlaying) { + playNextAudioChunk(); + } + } + + // Play the next audio chunk in the queue + function playNextAudioChunk() { + if (audioQueue.length === 0) { + isPlaying = false; + return; + } + + isPlaying = true; + const chunk = audioQueue.shift(); + + try { + // Create audio context if needed + if (!audioContext || audioContext.state === 'closed') { + audioContext = new (window.AudioContext || window.webkitAudioContext)(); + } + + // Resume audio context if suspended + if (audioContext.state === 'suspended') { + audioContext.resume(); + } + + // Decode the WAV data + audioContext.decodeAudioData(chunk.data, (buffer) => { + const source = audioContext.createBufferSource(); + source.buffer = buffer; + source.connect(audioContext.destination); + + // 
When playback ends, play the next chunk + source.onended = () => { + playNextAudioChunk(); + }; + + source.start(0); + + // If it's the last chunk, update UI + if (chunk.isLast) { + setTimeout(() => { + isAiSpeaking = false; + interruptButton.disabled = true; + }, buffer.duration * 1000); + } + }, (error) => { + console.error('Error decoding audio data:', error); + playNextAudioChunk(); // Skip this chunk and try the next + }); + } catch (error) { + console.error('Error playing audio chunk:', error); + playNextAudioChunk(); // Try the next chunk + } + } + + // Clear the audio queue (used when interrupting) + function clearAudioQueue() { + audioQueue.length = 0; + isPlaying = false; + + // Stop any currently playing audio + if (audioContext) { + audioContext.suspend(); + } + } + + // Send interrupt signal to server + function interruptAI() { + if (!isConnected || !isAiSpeaking) return; + + socket.emit('interrupt_ai'); + clearAudioQueue(); + } + + // Convert ArrayBuffer to Base64 string + function arrayBufferToBase64(buffer) { + const binary = new Uint8Array(buffer); + let base64 = ''; + const len = binary.byteLength; + for (let i = 0; i < len; i++) { + base64 += String.fromCharCode(binary[i]); + } + return window.btoa(base64); + } + + // Convert Base64 string to ArrayBuffer + function base64ToArrayBuffer(base64) { + const binaryString = window.atob(base64); + const len = binaryString.length; + const bytes = new Uint8Array(len); + for (let i = 0; i < len; i++) { + bytes[i] = binaryString.charCodeAt(i); + } + return bytes.buffer; + } + + // Add user message to conversation + function addUserMessage(text) { + const messageDiv = document.createElement('div'); + messageDiv.className = 'message user-message'; + messageDiv.textContent = text; + conversationDiv.appendChild(messageDiv); + conversationDiv.scrollTop = conversationDiv.scrollHeight; + } + + // Add AI message to conversation + function addAIMessage(text) { + const messageDiv = document.createElement('div'); + messageDiv.className = 'message ai-message'; + messageDiv.textContent = text; + conversationDiv.appendChild(messageDiv); + conversationDiv.scrollTop = conversationDiv.scrollHeight; + } + + // Add info message to conversation + function addInfoMessage(text) { + const messageDiv = document.createElement('div'); + messageDiv.className = 'info-message'; + messageDiv.textContent = text; + conversationDiv.appendChild(messageDiv); + conversationDiv.scrollTop = conversationDiv.scrollHeight; + } + + // Show/hide speaking indicator + function showSpeakingIndicator(show) { + micAnimation.style.display = show ? 'flex' : 'none'; + } + + // Show loading indicator + function showLoadingIndicator(text) { + loadingText.textContent = text || 'Processing...'; + loadingDiv.style.display = 'block'; + } + + // Hide loading indicator + function hideLoadingIndicator() { + loadingDiv.style.display = 'none'; + } +}); \ No newline at end of file
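On the browser side, voice-chat.js converts the microphone's Float32 samples to Int16 (convertFloat32ToInt16) before base64-encoding and emitting them. The matching server-side decode is not shown in this excerpt, so the sketch below is an assumption rather than the committed code: it reverses that conversion with NumPy and resamples from the 44.1 kHz client rate to 24 kHz, the CSM sample rate used elsewhere in the series.

import base64

import numpy as np
import torch
import torchaudio

def decode_int16_chunk(b64_audio: str, client_rate: int = 44100, model_rate: int = 24000) -> torch.Tensor:
    """Decode a base64 Int16 PCM chunk into a float32 waveform in [-1, 1].

    Mirrors convertFloat32ToInt16 in voice-chat.js; the 24 kHz target rate is
    an assumption based on the CSM generator's sample rate.
    """
    pcm = np.frombuffer(base64.b64decode(b64_audio), dtype=np.int16)
    waveform = torch.from_numpy(pcm.astype(np.float32) / 32768.0)
    return torchaudio.functional.resample(waveform, orig_freq=client_rate, new_freq=model_rate)

if __name__ == "__main__":
    samples = (20000 * np.sin(np.linspace(0, 40 * np.pi, 44100))).astype(np.int16)
    payload = base64.b64encode(samples.tobytes()).decode("utf-8")
    print(decode_int16_chunk(payload).shape)  # one second in -> about 24000 samples out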