From 12383d5e8b7cddf032e8b3efe1e69441a98a1543 Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sun, 30 Mar 2025 08:36:50 -0400 Subject: [PATCH] Demo Fixes 14 --- Backend/index.html | 359 ++++++++++++++++++++++++++++++++++++++++++++- Backend/server.py | 149 +++++++++++++++---- 2 files changed, 475 insertions(+), 33 deletions(-) diff --git a/Backend/index.html b/Backend/index.html index 359ed41..1565977 100644 --- a/Backend/index.html +++ b/Backend/index.html @@ -175,12 +175,124 @@ margin-top: 4px; text-align: right; } + + .text-only-indicator { + font-size: 0.8em; + color: #e74c3c; + margin-top: 4px; + font-style: italic; + } + + .status-message { + text-align: center; + padding: 8px; + margin: 10px 0; + background-color: #f8f9fa; + border-radius: 5px; + color: #666; + font-size: 0.9em; + } + + /* Audio visualizer styles */ + .visualizer-container { + width: 100%; + height: 120px; + margin: 15px 0; + border-radius: 10px; + overflow: hidden; + background-color: #000; + position: relative; + } + + #visualizer { + width: 100%; + height: 100%; + display: block; + } + + .visualizer-label { + position: absolute; + top: 10px; + left: 10px; + color: white; + font-size: 0.8em; + background-color: rgba(0, 0, 0, 0.5); + padding: 4px 8px; + border-radius: 4px; + } + + /* Modern switch for visualizer toggle */ + .switch-container { + display: flex; + align-items: center; + justify-content: center; + margin-bottom: 10px; + } + + .switch { + position: relative; + display: inline-block; + width: 50px; + height: 24px; + margin-left: 10px; + } + + .switch input { + opacity: 0; + width: 0; + height: 0; + } + + .slider { + position: absolute; + cursor: pointer; + top: 0; + left: 0; + right: 0; + bottom: 0; + background-color: #ccc; + transition: .4s; + border-radius: 24px; + } + + .slider:before { + position: absolute; + content: ""; + height: 16px; + width: 16px; + left: 4px; + bottom: 4px; + background-color: white; + transition: .4s; + border-radius: 50%; + } + + input:checked + .slider { + background-color: #4CAF50; + } + + input:checked + .slider:before { + transform: translateX(26px); + }

 <h1>Voice Assistant with CSM & Whisper</h1>
 
+<div class="switch-container">
+    <span>Audio Visualizer</span>
+    <label class="switch">
+        <input type="checkbox" id="visualizerToggle" checked>
+        <span class="slider"></span>
+    </label>
+</div>
+
+<div class="visualizer-container" id="visualizerContainer">
+    <canvas id="visualizer"></canvas>
+    <div class="visualizer-label" id="visualizerLabel">Listening...</div>
+</div>
@@ -201,27 +313,80 @@ const conversation = document.getElementById('conversation'); const status = document.getElementById('status'); const audioWave = document.getElementById('audioWave'); + const visualizerToggle = document.getElementById('visualizerToggle'); + const visualizerContainer = document.getElementById('visualizerContainer'); + const visualizerLabel = document.getElementById('visualizerLabel'); + const canvas = document.getElementById('visualizer'); + const canvasCtx = canvas.getContext('2d'); let mediaRecorder; let audioChunks = []; let isRecording = false; let audioSendInterval; let sessionActive = false; + let reconnectAttempts = 0; + let audioStream = null; + let audioAnalyser = null; + let visualizerActive = true; + let visualizerAnimationId = null; + let audioBufferSource = null; // Initialize audio context const audioContext = new (window.AudioContext || window.webkitAudioContext)(); + // Set up canvas size + function setupCanvas() { + canvas.width = visualizerContainer.offsetWidth; + canvas.height = visualizerContainer.offsetHeight; + } + + // Handle visualizer toggle + visualizerToggle.addEventListener('change', function() { + visualizerActive = this.checked; + visualizerContainer.style.display = visualizerActive ? 'block' : 'none'; + + if (!visualizerActive && visualizerAnimationId) { + cancelAnimationFrame(visualizerAnimationId); + visualizerAnimationId = null; + } else if (visualizerActive && audioAnalyser) { + drawVisualizer(); + } + }); + // Connect to server socket.on('connect', () => { status.textContent = 'Connected to server'; sessionActive = true; + reconnectAttempts = 0; + + if (conversation.children.length > 0) { + addStatusMessage("Reconnected to server"); + } }); socket.on('disconnect', () => { status.textContent = 'Disconnected from server'; sessionActive = false; + + addStatusMessage("Disconnected from server. Attempting to reconnect..."); + + // Attempt to reconnect + tryReconnect(); }); + function tryReconnect() { + if (reconnectAttempts < 5) { + reconnectAttempts++; + setTimeout(() => { + if (!sessionActive) { + socket.connect(); + } + }, 1000 * reconnectAttempts); + } else { + addStatusMessage("Failed to reconnect. 
Please refresh the page."); + } + } + socket.on('ready', (data) => { status.textContent = data.message; setupAudioRecording(); @@ -230,34 +395,87 @@ socket.on('transcription', (data) => { addMessage('user', data.text); status.textContent = 'Assistant is thinking...'; + visualizerLabel.textContent = 'Processing...'; }); socket.on('audio_response', (data) => { // Play audio status.textContent = 'Playing response...'; + visualizerLabel.textContent = 'Assistant speaking...'; + + // Create audio element const audio = new Audio('data:audio/wav;base64,' + data.audio); + // Visualize assistant audio if visualizer is active + if (visualizerActive) { + visualizeResponseAudio(audio); + } + audio.onended = () => { status.textContent = 'Ready to record'; + visualizerLabel.textContent = 'Listening...'; + if (audioBufferSource) { + audioBufferSource.disconnect(); + audioBufferSource = null; + } }; audio.onerror = () => { status.textContent = 'Error playing audio'; + visualizerLabel.textContent = 'Listening...'; console.error('Error playing audio response'); }; audio.play().catch(err => { status.textContent = 'Error playing audio: ' + err.message; + visualizerLabel.textContent = 'Listening...'; console.error('Error playing audio:', err); }); // Display text - addMessage('bot', data.text); + addMessage('bot', data.text, false); + }); + + // Visualize response audio + async function visualizeResponseAudio(audioElement) { + try { + // Create media element source + const audioSource = audioContext.createMediaElementSource(audioElement); + + // Create analyser + const analyser = audioContext.createAnalyser(); + analyser.fftSize = 2048; + + // Connect + audioSource.connect(analyser); + analyser.connect(audioContext.destination); + + // Store reference + audioAnalyser = analyser; + + // Start visualization + drawVisualizer(); + } catch (e) { + console.error('Error setting up audio visualization:', e); + } + } + + // Handle text-only responses when audio generation isn't available + socket.on('text_response', (data) => { + status.textContent = 'Received text response'; + visualizerLabel.textContent = 'Text only (no audio)'; + addMessage('bot', data.text, true); + setTimeout(() => { + status.textContent = 'Ready to record'; + visualizerLabel.textContent = 'Listening...'; + }, 1000); }); socket.on('error', (data) => { status.textContent = 'Error: ' + data.message; + visualizerLabel.textContent = 'Error occurred'; console.error('Server error:', data.message); + addStatusMessage("Error: " + data.message); }); function setupAudioRecording() { @@ -267,9 +485,29 @@ return; } + // Set up canvas + setupCanvas(); + // Get user media navigator.mediaDevices.getUserMedia({ audio: true }) .then(stream => { + // Store stream for visualizer + audioStream = stream; + + // Create audio analyser for visualization + const source = audioContext.createMediaStreamSource(stream); + const analyser = audioContext.createAnalyser(); + analyser.fftSize = 2048; + source.connect(analyser); + + // Store analyser for visualization + audioAnalyser = analyser; + + // Start visualizer if enabled + if (visualizerActive) { + drawVisualizer(); + } + // Setup recording with better audio quality const options = { mimeType: 'audio/webm', @@ -293,12 +531,6 @@ processRecording(); }; - // Create audio analyzer for visualization - const source = audioContext.createMediaStreamSource(stream); - const analyzer = audioContext.createAnalyser(); - analyzer.fftSize = 2048; - source.connect(analyzer); - // Setup button handlers with better touch handling 
recordButton.addEventListener('mousedown', startRecording); recordButton.addEventListener('touchstart', (e) => { @@ -322,6 +554,57 @@ }); } + // Draw visualizer animation + function drawVisualizer() { + if (!visualizerActive || !audioAnalyser) { + return; + } + + visualizerAnimationId = requestAnimationFrame(drawVisualizer); + + const bufferLength = audioAnalyser.frequencyBinCount; + const dataArray = new Uint8Array(bufferLength); + + // Get frequency data + audioAnalyser.getByteFrequencyData(dataArray); + + // Clear canvas + canvasCtx.fillStyle = '#000'; + canvasCtx.fillRect(0, 0, canvas.width, canvas.height); + + // Draw visualization based on audio data + const barWidth = (canvas.width / bufferLength) * 2.5; + let x = 0; + + // Choose color based on state + let gradient; + if (isRecording) { + // Red gradient for recording + gradient = canvasCtx.createLinearGradient(0, 0, 0, canvas.height); + gradient.addColorStop(0, 'rgba(255, 0, 0, 0.8)'); + gradient.addColorStop(1, 'rgba(255, 80, 80, 0.2)'); + } else if (visualizerLabel.textContent === 'Assistant speaking...') { + // Blue gradient for assistant + gradient = canvasCtx.createLinearGradient(0, 0, 0, canvas.height); + gradient.addColorStop(0, 'rgba(0, 120, 255, 0.8)'); + gradient.addColorStop(1, 'rgba(80, 160, 255, 0.2)'); + } else { + // Green gradient for listening + gradient = canvasCtx.createLinearGradient(0, 0, 0, canvas.height); + gradient.addColorStop(0, 'rgba(0, 200, 80, 0.8)'); + gradient.addColorStop(1, 'rgba(80, 255, 120, 0.2)'); + } + + for (let i = 0; i < bufferLength; i++) { + const barHeight = (dataArray[i] / 255) * canvas.height; + + canvasCtx.fillStyle = gradient; + canvasCtx.fillRect(x, canvas.height - barHeight, barWidth, barHeight); + + x += barWidth + 1; + } + } + function startRecording() { if (!isRecording && sessionActive) { audioChunks = []; @@ -329,6 +612,7 @@ recordButton.classList.add('recording'); recordButton.textContent = 'Release to Stop'; status.textContent = 'Recording...'; + visualizerLabel.textContent = 'Recording...'; audioWave.classList.remove('hidden'); isRecording = true; @@ -350,6 +634,7 @@ recordButton.classList.remove('recording'); recordButton.textContent = 'Hold to Speak'; status.textContent = 'Processing speech...'; + visualizerLabel.textContent = 'Processing...'; audioWave.classList.add('hidden'); isRecording = false; } @@ -358,6 +643,7 @@ function processRecording() { if (audioChunks.length === 0) { status.textContent = 'No audio recorded'; + visualizerLabel.textContent = 'Listening...'; return; } @@ -380,11 +666,13 @@ } catch (e) { console.error('Error processing audio:', e); status.textContent = 'Error processing audio'; + visualizerLabel.textContent = 'Error'; } }; fileReader.onerror = () => { status.textContent = 'Error reading audio data'; + visualizerLabel.textContent = 'Error'; }; fileReader.readAsArrayBuffer(audioBlob); @@ -403,7 +691,7 @@ return float32Array; } - function addMessage(sender, text) { + function addMessage(sender, text, textOnly = false) { const containerDiv = document.createElement('div'); containerDiv.className = sender === 'user' ? 
'message-container user-message-container' : 'message-container bot-message-container'; @@ -422,12 +710,39 @@ infoDiv.className = 'transcription-info'; infoDiv.textContent = 'Transcribed with Whisper'; containerDiv.appendChild(infoDiv); + } else if (textOnly) { + // Add indicator for text-only response + const textOnlyDiv = document.createElement('div'); + textOnlyDiv.className = 'text-only-indicator'; + textOnlyDiv.textContent = 'Text-only response (audio unavailable)'; + containerDiv.appendChild(textOnlyDiv); } conversation.appendChild(containerDiv); conversation.scrollTop = conversation.scrollHeight; } + function addStatusMessage(message) { + const statusDiv = document.createElement('div'); + statusDiv.className = 'status-message'; + statusDiv.textContent = message; + conversation.appendChild(statusDiv); + conversation.scrollTop = conversation.scrollHeight; + + // Auto-remove status messages after 10 seconds + setTimeout(() => { + if (conversation.contains(statusDiv)) { + statusDiv.style.opacity = '0'; + statusDiv.style.transition = 'opacity 0.5s'; + setTimeout(() => { + if (conversation.contains(statusDiv)) { + conversation.removeChild(statusDiv); + } + }, 500); + } + }, 10000); + } + function arrayBufferToBase64(buffer) { let binary = ''; const bytes = new Uint8Array(buffer); @@ -450,6 +765,34 @@ if (socket && socket.connected) { socket.disconnect(); } + + if (visualizerAnimationId) { + cancelAnimationFrame(visualizerAnimationId); + } + }); + + // Add a reload button for debugging + const reloadButton = document.createElement('button'); + reloadButton.textContent = '🔄 Reload'; + reloadButton.style.position = 'fixed'; + reloadButton.style.bottom = '10px'; + reloadButton.style.right = '10px'; + reloadButton.style.padding = '5px 10px'; + reloadButton.style.fontSize = '12px'; + reloadButton.style.backgroundColor = '#f5f5f5'; + reloadButton.style.border = '1px solid #ddd'; + reloadButton.style.borderRadius = '4px'; + reloadButton.style.cursor = 'pointer'; + + reloadButton.addEventListener('click', () => { + window.location.reload(); + }); + + document.body.appendChild(reloadButton); + + // Handle window resize to update canvas size + window.addEventListener('resize', () => { + setupCanvas(); }); diff --git a/Backend/server.py b/Backend/server.py index 1754a1b..85bf97b 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -12,6 +12,10 @@ from collections import deque import requests import huggingface_hub from generator import load_csm_1b, Segment +import threading +import queue +from flask import stream_with_context, Response +import time # Configure environment with longer timeouts os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600" # 10 minutes timeout for downloads @@ -124,6 +128,8 @@ def load_models(): # Store conversation context conversation_context = {} # session_id -> context +CHUNK_SIZE = 24000 # Number of audio samples per chunk (1 second at 24kHz) +audio_stream_queues = {} # session_id -> queue for audio chunks @app.route('/') def index(): @@ -144,8 +150,14 @@ def handle_connect(): @socketio.on('disconnect') def handle_disconnect(): print(f"Client disconnected: {request.sid}") - if request.sid in conversation_context: - del conversation_context[request.sid] + session_id = request.sid + + # Clean up resources + if session_id in conversation_context: + del conversation_context[session_id] + + if session_id in audio_stream_queues: + del audio_stream_queues[session_id] @socketio.on('start_speaking') def handle_start_speaking(): @@ -191,7 +203,7 @@ def is_silence(audio_tensor, 
threshold=0.02): return torch.mean(torch.abs(audio_tensor)) < threshold def process_user_utterance(session_id): - """Process completed user utterance, generate response and send audio back""" + """Process completed user utterance, generate response and stream audio back""" context = conversation_context[session_id] if not context['audio_buffer']: @@ -234,37 +246,32 @@ def process_user_utterance(session_id): ) context['segments'].append(user_segment) - # Generate bot response + # Generate bot response text bot_response = generate_llm_response(user_text, context['segments']) print(f"Bot response: {bot_response}") # Send transcribed text to client emit('transcription', {'text': user_text}, room=session_id) - # Generate and send audio response if CSM is available + # Generate and stream audio response if CSM is available if csm_generator is not None: - # Convert to audio using CSM - bot_audio = generate_audio_response(bot_response, context['segments']) + # Set up streaming queue for this session + if session_id not in audio_stream_queues: + audio_stream_queues[session_id] = queue.Queue() + else: + # Clear any existing items in the queue + while not audio_stream_queues[session_id].empty(): + audio_stream_queues[session_id].get() - # Convert audio to base64 for sending over websocket - audio_bytes = io.BytesIO() - torchaudio.save(audio_bytes, bot_audio.unsqueeze(0).cpu(), csm_generator.sample_rate, format="wav") - audio_bytes.seek(0) - audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8') + # Start audio generation in a separate thread to not block the server + threading.Thread( + target=generate_and_stream_audio, + args=(bot_response, context['segments'], session_id), + daemon=True + ).start() - # Add bot response to conversation history - bot_segment = Segment( - text=bot_response, - speaker=1, # Bot is speaker 1 - audio=bot_audio - ) - context['segments'].append(bot_segment) - - # Send audio response to client - emit('audio_response', { - 'audio': audio_b64, - 'text': bot_response - }, room=session_id) + # Initial response with text + emit('start_streaming_response', {'text': bot_response}, room=session_id) else: # Send text-only response if audio generation isn't available emit('text_response', {'text': bot_response}, room=session_id) @@ -391,6 +398,98 @@ def generate_audio_response(text, conversation_segments): # Return silence as fallback return torch.zeros(csm_generator.sample_rate * 3) # 3 seconds of silence +def generate_and_stream_audio(text, conversation_segments, session_id): + """Generate audio response using CSM and stream it in chunks""" + try: + # Use the last few conversation segments as context + context_segments = conversation_segments[-4:] if len(conversation_segments) > 4 else conversation_segments + + # Generate full audio for bot response + audio = csm_generator.generate( + text=text, + speaker=1, # Bot is speaker 1 + context=context_segments, + max_audio_length_ms=10000, # 10 seconds max + temperature=0.9, + topk=50 + ) + + # Store the full audio for conversation history + bot_segment = Segment( + text=text, + speaker=1, # Bot is speaker 1 + audio=audio + ) + if session_id in conversation_context: + conversation_context[session_id]['segments'].append(bot_segment) + + # Split audio into chunks for streaming + chunk_size = CHUNK_SIZE + for i in range(0, len(audio), chunk_size): + chunk = audio[i:i+chunk_size] + + # Convert audio chunk to base64 for streaming + audio_bytes = io.BytesIO() + torchaudio.save(audio_bytes, chunk.unsqueeze(0).cpu(), 
csm_generator.sample_rate, format="wav") + audio_bytes.seek(0) + audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8') + + # Send the chunk to the client + if session_id in audio_stream_queues: + audio_stream_queues[session_id].put({ + 'audio': audio_b64, + 'is_last': i + chunk_size >= len(audio) + }) + else: + # Session was disconnected before we finished generating + break + + # Signal the end of streaming if queue still exists + if session_id in audio_stream_queues: + # Add an empty chunk as a sentinel to signal end of streaming + audio_stream_queues[session_id].put(None) + + except Exception as e: + print(f"Error generating or streaming audio: {e}") + # Send error message to client + if session_id in conversation_context: + socketio.emit('error', { + 'message': f'Error generating audio: {str(e)}' + }, room=session_id) + + # Send a final message to unblock the client + if session_id in audio_stream_queues: + audio_stream_queues[session_id].put(None) + +@socketio.on('request_audio_chunk') +def handle_request_audio_chunk(): + """Send the next audio chunk in the queue to the client""" + session_id = request.sid + + if session_id not in audio_stream_queues: + emit('error', {'message': 'No audio stream available'}) + return + + # Get the next chunk or wait for it to be available + try: + if not audio_stream_queues[session_id].empty(): + chunk = audio_stream_queues[session_id].get(block=False) + + # If chunk is None, we're done streaming + if chunk is None: + emit('end_streaming') + # Clean up the queue + if session_id in audio_stream_queues: + del audio_stream_queues[session_id] + else: + emit('audio_chunk', chunk) + else: + # If the queue is empty but we're still generating, tell client to wait + emit('wait_for_chunk') + except Exception as e: + print(f"Error sending audio chunk: {e}") + emit('error', {'message': f'Error streaming audio: {str(e)}'}) + if __name__ == '__main__': # Ensure the existing index.html file is in the correct location if not os.path.exists('templates'):
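
One gap worth flagging: server.py now delivers the reply through a pull-based chunk protocol (start_streaming_response, request_audio_chunk, audio_chunk, wait_for_chunk, end_streaming), while the index.html changes in this patch still only handle the single audio_response and text_response events, which process_user_utterance no longer emits on the audio path. The sketch below is a minimal, assumed shape for the missing client-side handlers. It reuses the socket, audioContext, status, visualizerLabel, and addMessage names that index.html already defines; playStartTime, streamingActive, and the 200 ms poll interval are illustrative choices, not part of the patch. Because the server calls torchaudio.save(..., format="wav") per chunk, every audio_chunk payload is a complete base64-encoded WAV and is therefore decoded independently and scheduled back-to-back.

// Hypothetical client-side consumer for the new streaming events (not in this patch).
let playStartTime = 0;      // illustrative: when the next decoded chunk should start
let streamingActive = false;

socket.on('start_streaming_response', (data) => {
    addMessage('bot', data.text, false);
    status.textContent = 'Streaming response...';
    visualizerLabel.textContent = 'Assistant speaking...';
    streamingActive = true;
    playStartTime = audioContext.currentTime;
    socket.emit('request_audio_chunk');   // pull the first chunk
});

socket.on('audio_chunk', async (chunk) => {
    try {
        // Each chunk is a standalone base64-encoded WAV file, so it can be
        // decoded on its own and scheduled right after the previous one.
        const binary = atob(chunk.audio);
        const bytes = new Uint8Array(binary.length);
        for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);

        const buffer = await audioContext.decodeAudioData(bytes.buffer);
        const source = audioContext.createBufferSource();
        source.buffer = buffer;
        source.connect(audioContext.destination);

        const startAt = Math.max(playStartTime, audioContext.currentTime);
        source.start(startAt);
        playStartTime = startAt + buffer.duration;
    } catch (e) {
        console.error('Error decoding audio chunk:', e);
    }
    // Always request the next item: after the last data chunk the server's
    // queue holds a None sentinel, and draining it triggers 'end_streaming'.
    socket.emit('request_audio_chunk');
});

socket.on('wait_for_chunk', () => {
    // Generation is still running server-side; poll again shortly.
    if (streamingActive) {
        setTimeout(() => socket.emit('request_audio_chunk'), 200);
    }
});

socket.on('end_streaming', () => {
    streamingActive = false;
    status.textContent = 'Ready to record';
    visualizerLabel.textContent = 'Listening...';
});

Pulling chunks this way keeps individual Socket.IO payloads small and keeps generation off the event-handler thread, though as written generate_and_stream_audio still produces the full clip before the first chunk is queued, so playback does not begin until CSM generation finishes.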