From 27c5151e62dd4d4353f5c3d8ef2fcc943ba309eb Mon Sep 17 00:00:00 2001
From: GamerBoss101
Date: Sun, 30 Mar 2025 01:00:03 -0400
Subject: [PATCH] Demo Update 9

---
 Backend/server.py     |  38 ++++---
 Backend/voice-chat.js | 231 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 250 insertions(+), 19 deletions(-)

diff --git a/Backend/server.py b/Backend/server.py
index cfcc6ea..a9cf610 100644
--- a/Backend/server.py
+++ b/Backend/server.py
@@ -308,8 +308,8 @@ def process_speech(audio_tensor: torch.Tensor, client_id: str) -> str:
         temp_path = os.path.join(base_dir, f"temp_{time.time()}.wav")
         torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate)

-        # Perform speech recognition
-        result = speech_recognizer(temp_path)
+        # Perform speech recognition - using input_features instead of inputs
+        result = speech_recognizer(temp_path, input_features=None)  # input_features=None forces use of the correct parameter name
         transcription = result["text"]

         # Clean up temp file
@@ -650,7 +650,7 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=Fals
         # Combine audio chunks
         full_audio = torch.cat(client['streaming_buffer'], dim=0)

-        # Process audio to generate a response (no speech recognition)
+        # Process audio to generate a response (using speech recognition)
         generated_text = process_speech(full_audio, client_id)

         # Add suffix for incomplete utterances
@@ -706,16 +706,28 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=Fals
             )
             client['context_segments'].append(ai_segment)

-            # Convert audio to base64 and send back to client
-            audio_base64 = encode_audio_data(audio_tensor)
-            emit('audio_response', {
-                'type': 'audio_response',
-                'text': response_text,
-                'audio': audio_base64
-            }, room=client_id)
-
-            logger.info(f"[{client_id[:8]}] Audio response sent")
-
+            # CHANGE HERE: Use the streaming function instead of sending all at once
+            # Check if the audio is short enough to send at once or if it should be streamed
+            if audio_tensor.size(0) < generator.sample_rate * 2:  # Less than 2 seconds
+                # For short responses, just send in one go for better responsiveness
+                audio_base64 = encode_audio_data(audio_tensor)
+                emit('audio_response', {
+                    'type': 'audio_response',
+                    'text': response_text,
+                    'audio': audio_base64
+                }, room=client_id)
+                logger.info(f"[{client_id[:8]}] Short audio response sent in one piece")
+            else:
+                # For longer responses, use streaming
+                logger.info(f"[{client_id[:8]}] Using streaming for audio response")
+                # Start a new thread for streaming to avoid blocking the main thread
+                import threading
+                stream_thread = threading.Thread(
+                    target=stream_audio_to_client,
+                    args=(client_id, audio_tensor, response_text, ai_speaker_id)
+                )
+                stream_thread.start()
+
         except Exception as e:
             logger.error(f"Error generating audio response: {e}")
             emit('error', {
diff --git a/Backend/voice-chat.js b/Backend/voice-chat.js
index b224b27..89ec71a 100644
--- a/Backend/voice-chat.js
+++ b/Backend/voice-chat.js
@@ -50,6 +50,20 @@ let canvasContext = null;
 let visualizerBufferLength = 0;
 let visualizerDataArray = null;

+// New state variables to track incremental audio streaming
+const streamingAudio = {
+    messageElement: null,
+    audioElement: null,
+    chunks: [],
+    totalChunks: 0,
+    receivedChunks: 0,
+    text: '',
+    mediaSource: null,
+    sourceBuffer: null,
+    audioContext: null,
+    complete: false
+};
+
 // Initialize the application
 function initializeApp() {
     // Initialize the UI elements
@@ -116,6 +130,12 @@ function setupSocketConnection() {
     state.socket.on('transcription', handleTranscription);
     state.socket.on('context_updated', handleContextUpdate);
     state.socket.on('streaming_status', handleStreamingStatus);
+
+    // New event handlers for incremental audio streaming
+    state.socket.on('audio_response_start', handleAudioResponseStart);
+    state.socket.on('audio_response_chunk', handleAudioResponseChunk);
+    state.socket.on('audio_response_complete', handleAudioResponseComplete);
+    state.socket.on('processing_status', handleProcessingStatus);
 }

 // Setup event listeners
@@ -294,12 +314,8 @@ function stopStreaming(notifyServer = true) {
 function handleAudioProcess(event) {
     const inputData = event.inputBuffer.getChannelData(0);

-    // Log audio buffer statistics
-    console.log(`Audio buffer: length=${inputData.length}, sample rate=${state.audioContext.sampleRate}Hz`);
-
     // Calculate audio energy (volume level)
     const energy = calculateAudioEnergy(inputData);
-    console.log(`Energy: ${energy.toFixed(6)}, threshold: ${state.silenceThreshold}`);

     // Update energy window for averaging
     updateEnergyWindow(energy);
@@ -309,7 +325,11 @@ function handleAudioProcess(event) {

     // Determine if audio is silent
     const isSilent = avgEnergy < state.silenceThreshold;
-    console.log(`Silent: ${isSilent ? 'Yes' : 'No'}, avg energy: ${avgEnergy.toFixed(6)}`);
+
+    // Debug logging only if significant changes in audio patterns
+    if (Math.random() < 0.05) { // Log only 5% of frames to avoid console spam
+        console.log(`Audio: len=${inputData.length}, energy=${energy.toFixed(4)}, avg=${avgEnergy.toFixed(4)}, silent=${isSilent}`);
+    }

     // Handle speech state based on silence
     handleSpeechState(isSilent);
@@ -319,7 +339,6 @@
     // Create a resampled version at 24kHz for the server
     // Most WebRTC audio is 48kHz, but we want 24kHz for the model
     const resampledData = downsampleBuffer(inputData, state.audioContext.sampleRate, 24000);
-    console.log(`Resampled audio: ${state.audioContext.sampleRate}Hz → 24000Hz, new length: ${resampledData.length}`);

     // Send the audio chunk to the server
     sendAudioChunk(resampledData, state.currentSpeaker);
@@ -847,6 +866,206 @@ function downsampleBuffer(buffer, originalSampleRate, targetSampleRate) {
     return result;
 }

+// Handle processing status updates
+function handleProcessingStatus(data) {
+    console.log('Processing status update:', data);
+
+    // Show processing status in UI
+    if (data.status === 'generating_audio') {
+        elements.streamButton.innerHTML = ' Processing...';
+        elements.streamButton.classList.add('processing');
+        elements.streamButton.classList.remove('recording');
+
+        // Show message to user
+        addSystemMessage(data.message || 'Processing your request...');
+    }
+}
+
+// Handle the start of an audio streaming response
+function handleAudioResponseStart(data) {
+    console.log('Audio response starting:', data);
+
+    // Reset streaming audio state
+    streamingAudio.chunks = [];
+    streamingAudio.totalChunks = data.total_chunks;
+    streamingAudio.receivedChunks = 0;
+    streamingAudio.text = data.text;
+    streamingAudio.complete = false;
+
+    // Create message container now, so we can update it as chunks arrive
+    const messageElement = document.createElement('div');
+    messageElement.className = 'message ai processing';
+
+    // Add text content if available
+    if (data.text) {
+        const textElement = document.createElement('p');
+        textElement.textContent = data.text;
+        messageElement.appendChild(textElement);
+    }
+
+    // Create audio element (will be populated as chunks arrive)
+    const audioElement = document.createElement('audio');
+    audioElement.controls = true;
+    audioElement.className = 'audio-player';
+    audioElement.textContent = 'Audio is being generated...';
+    messageElement.appendChild(audioElement);
+
+    // Add timestamp
+    const timeElement = document.createElement('span');
+    timeElement.className = 'message-time';
+    timeElement.textContent = new Date().toLocaleTimeString();
+    messageElement.appendChild(timeElement);
+
+    // Add loading indicator
+    const loadingElement = document.createElement('div');
+    loadingElement.className = 'loading-indicator';
+    loadingElement.innerHTML = '<div class="loading-spinner"></div><span>Generating audio response...</span>';
+    messageElement.appendChild(loadingElement);
+
+    // Add to conversation
+    elements.conversation.appendChild(messageElement);
+
+    // Auto-scroll to bottom
+    elements.conversation.scrollTop = elements.conversation.scrollHeight;
+
+    // Store elements for later updates
+    streamingAudio.messageElement = messageElement;
+    streamingAudio.audioElement = audioElement;
+}
+
+// Handle an incoming audio chunk
+function handleAudioResponseChunk(data) {
+    console.log(`Received audio chunk ${data.chunk_index + 1}/${data.total_chunks}`);
+
+    // Store the chunk
+    streamingAudio.chunks[data.chunk_index] = data.audio;
+    streamingAudio.receivedChunks++;
+
+    // Update progress in the UI
+    if (streamingAudio.messageElement) {
+        const loadingElement = streamingAudio.messageElement.querySelector('.loading-indicator span');
+        if (loadingElement) {
+            loadingElement.textContent = `Generating audio response... ${Math.round((streamingAudio.receivedChunks / data.total_chunks) * 100)}%`;
+        }
+    }
+
+    // If this is the first chunk, start playing it immediately for faster response
+    if (data.chunk_index === 0 && streamingAudio.audioElement && elements.autoPlayResponses && elements.autoPlayResponses.checked) {
+        try {
+            streamingAudio.audioElement.src = data.audio;
+            streamingAudio.audioElement.play().catch(err => console.warn('Auto-play failed:', err));
+        } catch (e) {
+            console.error('Error playing first chunk:', e);
+        }
+    }
+
+    // If this is the last chunk or we've received all chunks, finalize the audio
+    if (data.is_last || streamingAudio.receivedChunks >= data.total_chunks) {
+        finalizeStreamingAudio();
+    }
+}
+
+// Handle completion of audio streaming
+function handleAudioResponseComplete(data) {
+    console.log('Audio response complete:', data);
+    streamingAudio.complete = true;
+
+    // Make sure we finalize the audio even if some chunks were missed
+    finalizeStreamingAudio();
+
+    // Update UI to normal state
+    if (state.isStreaming) {
+        elements.streamButton.innerHTML = ' Listening...';
+        elements.streamButton.classList.add('recording');
+        elements.streamButton.classList.remove('processing');
+    }
+}
+
+// Finalize streaming audio by combining chunks and updating the UI
+function finalizeStreamingAudio() {
+    if (!streamingAudio.messageElement || streamingAudio.chunks.length === 0) {
+        return;
+    }
+
+    try {
+        // For more sophisticated audio streaming, you would need to properly concatenate
+        // the WAV files, but for now we'll use the last chunk as the complete audio
+        // since it should contain the entire response due to how the server is implementing it
+        const lastChunkIndex = streamingAudio.chunks.length - 1;
+        const audioData = streamingAudio.chunks[lastChunkIndex] || streamingAudio.chunks[0];
+
+        // Update the audio element with the complete audio
+        if (streamingAudio.audioElement) {
+            streamingAudio.audioElement.src = audioData;
+
+            // Auto-play if enabled and not already playing
+            if (elements.autoPlayResponses && elements.autoPlayResponses.checked &&
+                streamingAudio.audioElement.paused) {
+                streamingAudio.audioElement.play()
+                    .catch(err => {
+                        console.warn('Auto-play failed:', err);
+                        addSystemMessage('Auto-play failed. Please click play to hear the response.');
+                    });
+            }
+        }
+
+        // Remove loading indicator and processing class
+        if (streamingAudio.messageElement) {
+            const loadingElement = streamingAudio.messageElement.querySelector('.loading-indicator');
+            if (loadingElement) {
+                streamingAudio.messageElement.removeChild(loadingElement);
+            }
+            streamingAudio.messageElement.classList.remove('processing');
+        }
+
+        console.log('Audio response finalized and ready for playback');
+    } catch (e) {
+        console.error('Error finalizing streaming audio:', e);
+    }
+
+    // Reset streaming audio state
+    streamingAudio.chunks = [];
+    streamingAudio.totalChunks = 0;
+    streamingAudio.receivedChunks = 0;
+    streamingAudio.messageElement = null;
+    streamingAudio.audioElement = null;
+}
+
+// Add CSS styles for new UI elements
+document.addEventListener('DOMContentLoaded', function() {
+    // Add styles for processing state
+    const style = document.createElement('style');
+    style.textContent = `
+        .message.processing {
+            opacity: 0.8;
+        }
+
+        .loading-indicator {
+            display: flex;
+            align-items: center;
+            margin-top: 8px;
+            font-size: 0.9em;
+            color: #666;
+        }
+
+        .loading-spinner {
+            width: 16px;
+            height: 16px;
+            border: 2px solid #ddd;
+            border-top: 2px solid var(--primary-color);
+            border-radius: 50%;
+            margin-right: 8px;
+            animation: spin 1s linear infinite;
+        }
+
+        @keyframes spin {
+            0% { transform: rotate(0deg); }
+            100% { transform: rotate(360deg); }
+        }
+    `;
+    document.head.appendChild(style);
+});
+
 // Initialize the application when DOM is fully loaded
 document.addEventListener('DOMContentLoaded', initializeApp);
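
Note on finalizeStreamingAudio(): as its inline comment says, the client currently plays
only the last received chunk rather than stitching the chunks together. Below is a minimal
sketch of proper client-side concatenation, assuming each entry in streamingAudio.chunks is
a base64 (or data-URL) encoded 16-bit PCM WAV with a standard 44-byte header and a shared
sample rate; the helper name concatenateWavChunks is hypothetical and not part of this patch.

// Hypothetical helper (not in this patch): merge base64 WAV chunks into one playable Blob URL.
// Assumes 16-bit PCM WAV chunks with uncompressed 44-byte RIFF headers and identical formats.
function concatenateWavChunks(base64Chunks) {
    if (!base64Chunks || base64Chunks.length === 0) return null;

    // Decode each chunk ("data:audio/wav;base64,..." or bare base64) into raw bytes
    const buffers = base64Chunks.map(chunk => {
        const binary = atob(chunk.split(',').pop());
        const bytes = new Uint8Array(binary.length);
        for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
        return bytes;
    });

    const HEADER_SIZE = 44; // canonical PCM WAV header length (assumption)
    const pcmLength = buffers.reduce((sum, b) => sum + (b.length - HEADER_SIZE), 0);
    const output = new Uint8Array(HEADER_SIZE + pcmLength);

    // Reuse the first chunk's header, then append every chunk's PCM payload
    output.set(buffers[0].subarray(0, HEADER_SIZE), 0);
    let offset = HEADER_SIZE;
    for (const b of buffers) {
        output.set(b.subarray(HEADER_SIZE), offset);
        offset += b.length - HEADER_SIZE;
    }

    // Patch the RIFF size (bytes 4-7) and data chunk size (bytes 40-43), little-endian
    const view = new DataView(output.buffer);
    view.setUint32(4, 36 + pcmLength, true);
    view.setUint32(40, pcmLength, true);

    return URL.createObjectURL(new Blob([output], { type: 'audio/wav' }));
}

With such a helper, finalizeStreamingAudio() could set
streamingAudio.audioElement.src = concatenateWavChunks(streamingAudio.chunks)
instead of reusing the last chunk as the complete audio.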