From 27c5151e62dd4d4353f5c3d8ef2fcc943ba309eb Mon Sep 17 00:00:00 2001
From: GamerBoss101
Date: Sun, 30 Mar 2025 01:00:03 -0400
Subject: [PATCH] Demo Update 9

---
 Backend/server.py     |  38 ++++---
 Backend/voice-chat.js | 231 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 250 insertions(+), 19 deletions(-)

diff --git a/Backend/server.py b/Backend/server.py
index cfcc6ea..a9cf610 100644
--- a/Backend/server.py
+++ b/Backend/server.py
@@ -308,8 +308,8 @@ def process_speech(audio_tensor: torch.Tensor, client_id: str) -> str:
         temp_path = os.path.join(base_dir, f"temp_{time.time()}.wav")
         torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate)

-        # Perform speech recognition
-        result = speech_recognizer(temp_path)
+        # Perform speech recognition - using input_features instead of inputs
+        result = speech_recognizer(temp_path, input_features=None)  # input_features=None forces use of the correct parameter name
         transcription = result["text"]

         # Clean up temp file
@@ -650,7 +650,7 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=Fals
         # Combine audio chunks
         full_audio = torch.cat(client['streaming_buffer'], dim=0)

-        # Process audio to generate a response (no speech recognition)
+        # Process audio to generate a response (using speech recognition)
         generated_text = process_speech(full_audio, client_id)

         # Add suffix for incomplete utterances
@@ -706,16 +706,28 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=Fals
             )
             client['context_segments'].append(ai_segment)

-            # Convert audio to base64 and send back to client
-            audio_base64 = encode_audio_data(audio_tensor)
-            emit('audio_response', {
-                'type': 'audio_response',
-                'text': response_text,
-                'audio': audio_base64
-            }, room=client_id)
-
-            logger.info(f"[{client_id[:8]}] Audio response sent")
-
+            # CHANGE HERE: Use the streaming function instead of sending all at once
+            # Check if the audio is short enough to send at once or if it should be streamed
+            if audio_tensor.size(0) < generator.sample_rate * 2:  # Less than 2 seconds
+                # For short responses, just send in one go for better responsiveness
+                audio_base64 = encode_audio_data(audio_tensor)
+                emit('audio_response', {
+                    'type': 'audio_response',
+                    'text': response_text,
+                    'audio': audio_base64
+                }, room=client_id)
+                logger.info(f"[{client_id[:8]}] Short audio response sent in one piece")
+            else:
+                # For longer responses, use streaming
+                logger.info(f"[{client_id[:8]}] Using streaming for audio response")
+                # Start a new thread for streaming to avoid blocking the main thread
+                import threading
+                stream_thread = threading.Thread(
+                    target=stream_audio_to_client,
+                    args=(client_id, audio_tensor, response_text, ai_speaker_id)
+                )
+                stream_thread.start()
+
         except Exception as e:
             logger.error(f"Error generating audio response: {e}")
             emit('error', {
diff --git a/Backend/voice-chat.js b/Backend/voice-chat.js
index b224b27..89ec71a 100644
--- a/Backend/voice-chat.js
+++ b/Backend/voice-chat.js
@@ -50,6 +50,20 @@ let canvasContext = null;
 let visualizerBufferLength = 0;
 let visualizerDataArray = null;

+// New state variables to track incremental audio streaming
+const streamingAudio = {
+    messageElement: null,
+    audioElement: null,
+    chunks: [],
+    totalChunks: 0,
+    receivedChunks: 0,
+    text: '',
+    mediaSource: null,
+    sourceBuffer: null,
+    audioContext: null,
+    complete: false
+};
+
 // Initialize the application
 function initializeApp() {
     // Initialize the UI elements
@@ -116,6 +130,12 @@ function setupSocketConnection() {
     state.socket.on('transcription', handleTranscription);
     state.socket.on('context_updated', handleContextUpdate);
     state.socket.on('streaming_status', handleStreamingStatus);
+
+    // New event handlers for incremental audio streaming
+    state.socket.on('audio_response_start', handleAudioResponseStart);
+    state.socket.on('audio_response_chunk', handleAudioResponseChunk);
+    state.socket.on('audio_response_complete', handleAudioResponseComplete);
+    state.socket.on('processing_status', handleProcessingStatus);
 }

 // Setup event listeners
@@ -294,12 +314,8 @@ function stopStreaming(notifyServer = true) {
 function handleAudioProcess(event) {
     const inputData = event.inputBuffer.getChannelData(0);

-    // Log audio buffer statistics
-    console.log(`Audio buffer: length=${inputData.length}, sample rate=${state.audioContext.sampleRate}Hz`);
-
     // Calculate audio energy (volume level)
     const energy = calculateAudioEnergy(inputData);
-    console.log(`Energy: ${energy.toFixed(6)}, threshold: ${state.silenceThreshold}`);

     // Update energy window for averaging
     updateEnergyWindow(energy);
@@ -309,7 +325,11 @@ function handleAudioProcess(event) {

     // Determine if audio is silent
     const isSilent = avgEnergy < state.silenceThreshold;
-    console.log(`Silent: ${isSilent ? 'Yes' : 'No'}, avg energy: ${avgEnergy.toFixed(6)}`);
+
+    // Debug logging only if significant changes in audio patterns
+    if (Math.random() < 0.05) { // Log only 5% of frames to avoid console spam
+        console.log(`Audio: len=${inputData.length}, energy=${energy.toFixed(4)}, avg=${avgEnergy.toFixed(4)}, silent=${isSilent}`);
+    }

     // Handle speech state based on silence
     handleSpeechState(isSilent);
@@ -319,7 +339,6 @@
     // Create a resampled version at 24kHz for the server
     // Most WebRTC audio is 48kHz, but we want 24kHz for the model
     const resampledData = downsampleBuffer(inputData, state.audioContext.sampleRate, 24000);
-    console.log(`Resampled audio: ${state.audioContext.sampleRate}Hz → 24000Hz, new length: ${resampledData.length}`);

     // Send the audio chunk to the server
     sendAudioChunk(resampledData, state.currentSpeaker);
@@ -847,6 +866,206 @@ function downsampleBuffer(buffer, originalSampleRate, targetSampleRate) {
     return result;
 }

+// Handle processing status updates
+function handleProcessingStatus(data) {
+    console.log('Processing status update:', data);
+
+    // Show processing status in UI
+    if (data.status === 'generating_audio') {
+        elements.streamButton.innerHTML = ' Processing...';
+        elements.streamButton.classList.add('processing');
+        elements.streamButton.classList.remove('recording');
+
+        // Show message to user
+        addSystemMessage(data.message || 'Processing your request...');
+    }
+}
+
+// Handle the start of an audio streaming response
+function handleAudioResponseStart(data) {
+    console.log('Audio response starting:', data);
+
+    // Reset streaming audio state
+    streamingAudio.chunks = [];
+    streamingAudio.totalChunks = data.total_chunks;
+    streamingAudio.receivedChunks = 0;
+    streamingAudio.text = data.text;
+    streamingAudio.complete = false;
+
+    // Create message container now, so we can update it as chunks arrive
+    const messageElement = document.createElement('div');
+    messageElement.className = 'message ai processing';
+
+    // Add text content if available
+    if (data.text) {
+        const textElement = document.createElement('p');
+        textElement.textContent = data.text;
+        messageElement.appendChild(textElement);
+    }
+
+    // Create audio element (will be populated as chunks arrive)
+    const audioElement = document.createElement('audio');
+    audioElement.controls = true;
+    audioElement.className = 'audio-player';
+    audioElement.textContent = 'Audio is being generated...';
+    messageElement.appendChild(audioElement);
+
+    // Add timestamp
+    const timeElement = document.createElement('span');
+    timeElement.className = 'message-time';
+    timeElement.textContent = new Date().toLocaleTimeString();
+    messageElement.appendChild(timeElement);
+
+    // Add loading indicator
+    const loadingElement = document.createElement('div');
+    loadingElement.className = 'loading-indicator';
+    loadingElement.innerHTML = '<div class="loading-spinner"></div><span>Generating audio response...</span>';
+    messageElement.appendChild(loadingElement);
+
+    // Add to conversation
+    elements.conversation.appendChild(messageElement);
+
+    // Auto-scroll to bottom
+    elements.conversation.scrollTop = elements.conversation.scrollHeight;
+
+    // Store elements for later updates
+    streamingAudio.messageElement = messageElement;
+    streamingAudio.audioElement = audioElement;
+}
+
+// Handle an incoming audio chunk
+function handleAudioResponseChunk(data) {
+    console.log(`Received audio chunk ${data.chunk_index + 1}/${data.total_chunks}`);
+
+    // Store the chunk
+    streamingAudio.chunks[data.chunk_index] = data.audio;
+    streamingAudio.receivedChunks++;
+
+    // Update progress in the UI
+    if (streamingAudio.messageElement) {
+        const loadingElement = streamingAudio.messageElement.querySelector('.loading-indicator span');
+        if (loadingElement) {
+            loadingElement.textContent = `Generating audio response... ${Math.round((streamingAudio.receivedChunks / data.total_chunks) * 100)}%`;
+        }
+    }
+
+    // If this is the first chunk, start playing it immediately for faster response
+    if (data.chunk_index === 0 && streamingAudio.audioElement && elements.autoPlayResponses && elements.autoPlayResponses.checked) {
+        try {
+            streamingAudio.audioElement.src = data.audio;
+            streamingAudio.audioElement.play().catch(err => console.warn('Auto-play failed:', err));
+        } catch (e) {
+            console.error('Error playing first chunk:', e);
+        }
+    }
+
+    // If this is the last chunk or we've received all chunks, finalize the audio
+    if (data.is_last || streamingAudio.receivedChunks >= data.total_chunks) {
+        finalizeStreamingAudio();
+    }
+}
+
+// Handle completion of audio streaming
+function handleAudioResponseComplete(data) {
+    console.log('Audio response complete:', data);
+    streamingAudio.complete = true;
+
+    // Make sure we finalize the audio even if some chunks were missed
+    finalizeStreamingAudio();
+
+    // Update UI to normal state
+    if (state.isStreaming) {
+        elements.streamButton.innerHTML = ' Listening...';
+        elements.streamButton.classList.add('recording');
+        elements.streamButton.classList.remove('processing');
+    }
+}
+
+// Finalize streaming audio by combining chunks and updating the UI
+function finalizeStreamingAudio() {
+    if (!streamingAudio.messageElement || streamingAudio.chunks.length === 0) {
+        return;
+    }
+
+    try {
+        // For more sophisticated audio streaming, you would need to properly concatenate
+        // the WAV files, but for now we'll use the last chunk as the complete audio
+        // since it should contain the entire response due to how the server is implementing it
+        const lastChunkIndex = streamingAudio.chunks.length - 1;
+        const audioData = streamingAudio.chunks[lastChunkIndex] || streamingAudio.chunks[0];
+
+        // Update the audio element with the complete audio
+        if (streamingAudio.audioElement) {
+            streamingAudio.audioElement.src = audioData;
+
+            // Auto-play if enabled and not already playing
+            if (elements.autoPlayResponses && elements.autoPlayResponses.checked &&
+                streamingAudio.audioElement.paused) {
+                streamingAudio.audioElement.play()
+                    .catch(err => {
+                        console.warn('Auto-play failed:', err);
+                        addSystemMessage('Auto-play failed. Please click play to hear the response.');
+                    });
+            }
+        }
+
+        // Remove loading indicator and processing class
+        if (streamingAudio.messageElement) {
+            const loadingElement = streamingAudio.messageElement.querySelector('.loading-indicator');
+            if (loadingElement) {
+                streamingAudio.messageElement.removeChild(loadingElement);
+            }
+            streamingAudio.messageElement.classList.remove('processing');
+        }
+
+        console.log('Audio response finalized and ready for playback');
+    } catch (e) {
+        console.error('Error finalizing streaming audio:', e);
+    }
+
+    // Reset streaming audio state
+    streamingAudio.chunks = [];
+    streamingAudio.totalChunks = 0;
+    streamingAudio.receivedChunks = 0;
+    streamingAudio.messageElement = null;
+    streamingAudio.audioElement = null;
+}
+
+// Add CSS styles for new UI elements
+document.addEventListener('DOMContentLoaded', function() {
+    // Add styles for processing state
+    const style = document.createElement('style');
+    style.textContent = `
+        .message.processing {
+            opacity: 0.8;
+        }
+
+        .loading-indicator {
+            display: flex;
+            align-items: center;
+            margin-top: 8px;
+            font-size: 0.9em;
+            color: #666;
+        }
+
+        .loading-spinner {
+            width: 16px;
+            height: 16px;
+            border: 2px solid #ddd;
+            border-top: 2px solid var(--primary-color);
+            border-radius: 50%;
+            margin-right: 8px;
+            animation: spin 1s linear infinite;
+        }
+
+        @keyframes spin {
+            0% { transform: rotate(0deg); }
+            100% { transform: rotate(360deg); }
+        }
+    `;
+    document.head.appendChild(style);
+});
+
 // Initialize the application when DOM is fully loaded
 document.addEventListener('DOMContentLoaded', initializeApp);
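
Note on finalizeStreamingAudio(): as its inline comment says, the client currently plays
only the last received chunk rather than stitching the chunks together. Below is a minimal
sketch of proper client-side concatenation, assuming each entry in streamingAudio.chunks is
a base64 (or data-URL) encoded 16-bit PCM WAV with a standard 44-byte header and a shared
sample rate; the helper name concatenateWavChunks is hypothetical and not part of this patch.

// Hypothetical helper (not in this patch): merge base64 WAV chunks into one playable Blob URL.
// Assumes 16-bit PCM WAV chunks with uncompressed 44-byte RIFF headers and identical formats.
function concatenateWavChunks(base64Chunks) {
    if (!base64Chunks || base64Chunks.length === 0) return null;

    // Decode each chunk ("data:audio/wav;base64,..." or bare base64) into raw bytes
    const buffers = base64Chunks.map(chunk => {
        const binary = atob(chunk.split(',').pop());
        const bytes = new Uint8Array(binary.length);
        for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
        return bytes;
    });

    const HEADER_SIZE = 44; // canonical PCM WAV header length (assumption)
    const pcmLength = buffers.reduce((sum, b) => sum + (b.length - HEADER_SIZE), 0);
    const output = new Uint8Array(HEADER_SIZE + pcmLength);

    // Reuse the first chunk's header, then append every chunk's PCM payload
    output.set(buffers[0].subarray(0, HEADER_SIZE), 0);
    let offset = HEADER_SIZE;
    for (const b of buffers) {
        output.set(b.subarray(HEADER_SIZE), offset);
        offset += b.length - HEADER_SIZE;
    }

    // Patch the RIFF size (bytes 4-7) and data chunk size (bytes 40-43), little-endian
    const view = new DataView(output.buffer);
    view.setUint32(4, 36 + pcmLength, true);
    view.setUint32(40, pcmLength, true);

    return URL.createObjectURL(new Blob([output], { type: 'audio/wav' }));
}

With such a helper, finalizeStreamingAudio() could set
streamingAudio.audioElement.src = concatenateWavChunks(streamingAudio.chunks)
instead of reusing the last chunk as the complete audio.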