From b74ae2dbfc449913e669e2c54e76e973ad63eb6f Mon Sep 17 00:00:00 2001
From: GamerBoss101
Date: Sat, 29 Mar 2025 23:43:16 -0400
Subject: [PATCH] Demo Update 3

---
 Backend/server.py     |  62 ++++++++--
 Backend/voice-chat.js | 275 +++++++++++++++++++++---------------------
 2 files changed, 188 insertions(+), 149 deletions(-)

diff --git a/Backend/server.py b/Backend/server.py
index 4e60aa7..bacf793 100644
--- a/Backend/server.py
+++ b/Backend/server.py
@@ -55,27 +55,71 @@ active_clients = {}  # Map client_id to client context
 
 def decode_audio_data(audio_data: str) -> torch.Tensor:
     """Decode base64 audio data to a torch tensor"""
     try:
+        # Skip empty audio data
+        if not audio_data:
+            print("Empty audio data received")
+            return torch.zeros(generator.sample_rate // 2)  # 0.5 seconds of silence
+
         # Extract the actual base64 content
         if ',' in audio_data:
             audio_data = audio_data.split(',')[1]
-
+
         # Decode base64 audio data
-        binary_data = base64.b64decode(audio_data)
+        try:
+            binary_data = base64.b64decode(audio_data)
+            print(f"Decoded base64 data: {len(binary_data)} bytes")
+        except Exception as e:
+            print(f"Base64 decoding error: {str(e)}")
+            return torch.zeros(generator.sample_rate // 2)
+
+        # Debug: save the raw binary data to examine with external tools
+        debug_path = os.path.join(base_dir, "debug_incoming.wav")
+        with open(debug_path, 'wb') as f:
+            f.write(binary_data)
+        print(f"Saved debug file to {debug_path}")
+
         # Load audio from binary data
-        with BytesIO(binary_data) as temp_file:
-            audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")
+        try:
+            with BytesIO(binary_data) as temp_file:
+                audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")
+            print(f"Loaded audio: shape={audio_tensor.shape}, sample_rate={sample_rate}Hz")
+
+            # Check if audio is valid
+            if audio_tensor.numel() == 0 or torch.isnan(audio_tensor).any():
+                print("Warning: Empty or invalid audio data detected")
+                return torch.zeros(generator.sample_rate // 2)
+        except Exception as e:
+            print(f"Audio loading error: {str(e)}")
+            # Try saving to a temporary file instead of loading from BytesIO
+            try:
+                temp_path = os.path.join(base_dir, "temp_incoming.wav")
+                with open(temp_path, 'wb') as f:
+                    f.write(binary_data)
+                print(f"Trying to load from file: {temp_path}")
+                audio_tensor, sample_rate = torchaudio.load(temp_path, format="wav")
+                print(f"Loaded from file: shape={audio_tensor.shape}, sample_rate={sample_rate}Hz")
+                os.remove(temp_path)
+            except Exception as e2:
+                print(f"Secondary audio loading error: {str(e2)}")
+                return torch.zeros(generator.sample_rate // 2)
 
         # Resample if needed
         if sample_rate != generator.sample_rate:
-            audio_tensor = torchaudio.functional.resample(
-                audio_tensor.squeeze(0),
-                orig_freq=sample_rate,
-                new_freq=generator.sample_rate
-            )
+            try:
+                print(f"Resampling from {sample_rate}Hz to {generator.sample_rate}Hz")
+                audio_tensor = torchaudio.functional.resample(
+                    audio_tensor.squeeze(0),
+                    orig_freq=sample_rate,
+                    new_freq=generator.sample_rate
+                )
+                print(f"Resampled audio shape: {audio_tensor.shape}")
+            except Exception as e:
+                print(f"Resampling error: {str(e)}")
+                return torch.zeros(generator.sample_rate // 2)
         else:
             audio_tensor = audio_tensor.squeeze(0)
 
+        print(f"Final audio tensor shape: {audio_tensor.shape}")
         return audio_tensor
     except Exception as e:
         print(f"Error decoding audio: {str(e)}")
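
Note on the server-side change above: decode_audio_data() now tolerates empty payloads, logs each stage, and falls back to half a second of silence (generator.sample_rate // 2 samples) whenever base64 decoding, WAV loading, or resampling fails. The payload it expects is exactly what sendAudioChunk() in voice-chat.js produces: a data URL whose base64 body is a complete RIFF/WAV file. A minimal round-trip sketch, not part of the patch, assuming torch and torchaudio are importable in the server environment and that decode_audio_data can be imported from Backend/server.py; the tone and variable names are illustrative only:

    import base64
    from io import BytesIO

    import torch
    import torchaudio

    # Half a second of a 440 Hz tone at 24 kHz, shaped (channels, samples)
    sample_rate = 24000
    t = torch.arange(sample_rate // 2) / sample_rate
    tone = (0.1 * torch.sin(2 * torch.pi * 440 * t)).unsqueeze(0)

    # Serialize to an in-memory WAV, then wrap it the way the browser does:
    # a data URL whose payload is the base64-encoded RIFF/WAV bytes
    buf = BytesIO()
    torchaudio.save(buf, tone, sample_rate, format="wav")
    payload = "data:audio/wav;base64," + base64.b64encode(buf.getvalue()).decode("ascii")

    # decode_audio_data(payload) should return a 1-D tensor of ~12000 samples;
    # decode_audio_data("") should return 0.5 s of silence instead of raising
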
diff --git a/Backend/voice-chat.js b/Backend/voice-chat.js
index a4e10f5..c85da8a 100644
--- a/Backend/voice-chat.js
+++ b/Backend/voice-chat.js
@@ -70,88 +70,18 @@ function initializeApp()
 {
     // Initialize UI elements
     function initializeUIElements() {
-        // Main UI containers
-        const chatContainer = document.querySelector('.chat-container');
-        const controlPanel = document.querySelector('.control-panel');
-
-        // Create conversation section
-        chatContainer.innerHTML = `
-            <div class="conversation-header">
-                <span>Conversation</span>
-                <div class="connection-status">
-                    <span class="status-dot"></span>
-                    <span class="status-text">Disconnected</span>
-                </div>
-            </div>
-            <div class="conversation"></div>
-        `;
-
-        // Create control panel
-        controlPanel.innerHTML = `
-            <div class="visualizer-container">
-                <canvas id="audioVisualizer"></canvas>
-                <div class="visualizer-label">Speak to see audio visualization</div>
-                <div class="volume-meter">
-                    <div class="volume-level"></div>
-                </div>
-            </div>
-
-            <div class="control-section">
-                <div class="section-title">Voice Controls</div>
-                <select id="speakerSelection"></select>
-                <div class="button-row">
-                    <button id="streamButton">Start Streaming</button>
-                    <button id="clearButton">Clear</button>
-                </div>
-                <div class="threshold-row">
-                    <span>Silence Threshold</span>
-                    <span id="thresholdValue">0.01</span>
-                </div>
-                <input type="range" id="thresholdSlider"
-                       min="0.001" max="0.1" step="0.001" value="0.01">
-            </div>
-
-            <div class="control-section">
-                <div class="section-title">Settings</div>
-                <label>
-                    <input type="checkbox" id="autoPlayResponses" checked>
-                    Auto-play responses
-                </label>
-                <label>
-                    <input type="checkbox" id="showVisualizer" checked>
-                    Show visualizer
-                </label>
-            </div>
-        `;
-
         // Store references to UI elements
-        elements.conversation = document.querySelector('.conversation');
+        elements.conversation = document.getElementById('conversation');
         elements.streamButton = document.getElementById('streamButton');
         elements.clearButton = document.getElementById('clearButton');
         elements.thresholdSlider = document.getElementById('thresholdSlider');
         elements.thresholdValue = document.getElementById('thresholdValue');
         elements.visualizerCanvas = document.getElementById('audioVisualizer');
-        elements.visualizerLabel = document.querySelector('.visualizer-label');
-        elements.volumeLevel = document.querySelector('.volume-level');
-        elements.statusDot = document.querySelector('.status-dot');
-        elements.statusText = document.querySelector('.status-text');
-        elements.speakerSelection = document.getElementById('speakerSelection');
+        elements.visualizerLabel = document.getElementById('visualizerLabel');
+        elements.volumeLevel = document.getElementById('volumeLevel');
+        elements.statusDot = document.getElementById('statusDot');
+        elements.statusText = document.getElementById('statusText');
+        elements.speakerSelection = document.getElementById('speakerSelect'); // Changed to match HTML
         elements.autoPlayResponses = document.getElementById('autoPlayResponses');
         elements.showVisualizer = document.getElementById('showVisualizer');
     }
@@ -364,8 +294,12 @@ function stopStreaming(notifyServer = true) {
 function handleAudioProcess(event) {
     const inputData = event.inputBuffer.getChannelData(0);
 
+    // Log audio buffer statistics
+    console.log(`Audio buffer: length=${inputData.length}, sample rate=${state.audioContext.sampleRate}Hz`);
+
     // Calculate audio energy (volume level)
     const energy = calculateAudioEnergy(inputData);
+    console.log(`Energy: ${energy.toFixed(6)}, threshold: ${state.silenceThreshold}`);
 
     // Update energy window for averaging
     updateEnergyWindow(energy);
@@ -375,6 +309,7 @@ function handleAudioProcess(event) {
 
     // Determine if audio is silent
     const isSilent = avgEnergy < state.silenceThreshold;
+    console.log(`Silent: ${isSilent ? 'Yes' : 'No'}, avg energy: ${avgEnergy.toFixed(6)}`);
 
     // Handle speech state based on silence
     handleSpeechState(isSilent);
@@ -384,6 +319,7 @@ function handleAudioProcess(event) {
     // Create a resampled version at 24kHz for the server
     // Most WebRTC audio is 48kHz, but we want 24kHz for the model
    const resampledData = downsampleBuffer(inputData, state.audioContext.sampleRate, 24000);
+    console.log(`Resampled audio: ${state.audioContext.sampleRate}Hz → 24000Hz, new length: ${resampledData.length}`);
 
     // Send the audio chunk to the server
     sendAudioChunk(resampledData, state.currentSpeaker);
@@ -530,20 +466,132 @@ function sendAudioChunk(audioData, speaker) {
         return;
     }
 
-    const wavData = createWavBlob(audioData, 24000);
-    const reader = new FileReader();
+    console.log(`Creating WAV from audio data: length=${audioData.length}`);
 
-    reader.onloadend = function() {
-        const base64data = reader.result;
+    // Check for NaN or invalid values
+    let hasNaN = false;
+    let min = Infinity;
+    let max = -Infinity;
+    let sum = 0;
+
+    for (let i = 0; i < audioData.length; i++) {
+        if (isNaN(audioData[i]) || !isFinite(audioData[i])) {
+            hasNaN = true;
+            console.warn(`Invalid audio value at index ${i}: ${audioData[i]}`);
+            break;
+        }
+        min = Math.min(min, audioData[i]);
+        max = Math.max(max, audioData[i]);
+        sum += audioData[i];
+    }
+
+    if (hasNaN) {
+        console.warn('Audio data contains NaN or Infinity values. Creating silent audio instead.');
+        audioData = new Float32Array(audioData.length).fill(0);
+    } else {
+        const avg = sum / audioData.length;
+        console.log(`Audio stats: min=${min.toFixed(4)}, max=${max.toFixed(4)}, avg=${avg.toFixed(4)}`);
+    }
+
+    try {
+        // Create WAV blob with proper format
+        const wavData = createWavBlob(audioData, 24000);
+        console.log(`WAV blob created: size=${wavData.size} bytes, type=${wavData.type}`);
-
-        // Send the audio chunk to the server
-        state.socket.emit('stream_audio', {
-            audio: base64data,
-            speaker: speaker
-        });
-    };
+        const reader = new FileReader();
+
+        reader.onloadend = function() {
+            try {
+                // Get base64 data
+                const base64data = reader.result;
+                console.log(`Base64 data created: length=${base64data.length}`);
+
+                // Validate the base64 data before sending
+                if (!base64data || base64data.length < 100) {
+                    console.warn('Generated base64 data is too small or invalid');
+                    return;
+                }
+
+                // Send the audio chunk to the server
+                console.log('Sending audio data to server...');
+                state.socket.emit('stream_audio', {
+                    audio: base64data,
+                    speaker: speaker
+                });
+                console.log('Audio data sent successfully');
+            } catch (err) {
+                console.error('Error preparing audio data:', err);
+            }
+        };
+
+        reader.onerror = function(err) {
+            console.error('Error reading audio data:', err);
+        };
+
+        reader.readAsDataURL(wavData);
+    } catch (err) {
+        console.error('Error creating WAV data:', err);
+    }
+}
+
+// Create WAV blob from audio data with validation
+function createWavBlob(audioData, sampleRate) {
+    // Check if audio data is valid
+    if (!audioData || audioData.length === 0) {
+        console.warn('Empty audio data received');
+        // Return a tiny silent audio snippet instead
+        audioData = new Float32Array(100).fill(0);
+    }
-
-    reader.readAsDataURL(wavData);
+    // Function to convert Float32Array to Int16Array for WAV format
+    function floatTo16BitPCM(output, offset, input) {
+        for (let i = 0; i < input.length; i++, offset += 2) {
+            const s = Math.max(-1, Math.min(1, input[i]));
+            output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
+        }
+    }
+
+    // Create WAV header
+    function writeString(view, offset, string) {
+        for (let i = 0; i < string.length; i++) {
+            view.setUint8(offset + i, string.charCodeAt(i));
+        }
+    }
+
+    // Create WAV file with header
+    function encodeWAV(samples) {
+        const buffer = new ArrayBuffer(44 + samples.length * 2);
+        const view = new DataView(buffer);
+
+        // RIFF chunk descriptor
+        writeString(view, 0, 'RIFF');
+        view.setUint32(4, 36 + samples.length * 2, true);
+        writeString(view, 8, 'WAVE');
+
+        // fmt sub-chunk
+        writeString(view, 12, 'fmt ');
+        view.setUint32(16, 16, true);
+        view.setUint16(20, 1, true); // PCM format
+        view.setUint16(22, 1, true); // Mono channel
+        view.setUint32(24, sampleRate, true);
+        view.setUint32(28, sampleRate * 2, true); // Byte rate
+        view.setUint16(32, 2, true); // Block align
+        view.setUint16(34, 16, true); // Bits per sample
+
+        // data sub-chunk
+        writeString(view, 36, 'data');
+        view.setUint32(40, samples.length * 2, true);
+        floatTo16BitPCM(view, 44, samples);
+
+        return buffer;
+    }
+
+    // Convert audio data to TypedArray if it's a regular Array
+    const samples = Array.isArray(audioData) ? new Float32Array(audioData) : audioData;
+
+    // Create WAV blob
+    const wavBuffer = encodeWAV(samples);
+    return new Blob([wavBuffer], { type: 'audio/wav' });
 }
 
 // Draw audio visualizer
@@ -757,59 +805,6 @@ function addSystemMessage(message) {
     elements.conversation.scrollTop = elements.conversation.scrollHeight;
 }
 
-// Create WAV blob from audio data
-function createWavBlob(audioData, sampleRate) {
-    // Function to convert Float32Array to Int16Array for WAV format
-    function floatTo16BitPCM(output, offset, input) {
-        for (let i = 0; i < input.length; i++, offset += 2) {
-            const s = Math.max(-1, Math.min(1, input[i]));
-            output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
-        }
-    }
-
-    // Create WAV header
-    function writeString(view, offset, string) {
-        for (let i = 0; i < string.length; i++) {
-            view.setUint8(offset + i, string.charCodeAt(i));
-        }
-    }
-
-    // Create WAV file with header
-    function encodeWAV(samples) {
-        const buffer = new ArrayBuffer(44 + samples.length * 2);
-        const view = new DataView(buffer);
-
-        // RIFF chunk descriptor
-        writeString(view, 0, 'RIFF');
-        view.setUint32(4, 36 + samples.length * 2, true);
-        writeString(view, 8, 'WAVE');
-
-        // fmt sub-chunk
-        writeString(view, 12, 'fmt ');
-        view.setUint32(16, 16, true);
-        view.setUint16(20, 1, true); // PCM format
-        view.setUint16(22, 1, true); // Mono channel
-        view.setUint32(24, sampleRate, true);
-        view.setUint32(28, sampleRate * 2, true); // Byte rate
-        view.setUint16(32, 2, true); // Block align
-        view.setUint16(34, 16, true); // Bits per sample
-
-        // data sub-chunk
-        writeString(view, 36, 'data');
-        view.setUint32(40, samples.length * 2, true);
-        floatTo16BitPCM(view, 44, samples);
-
-        return buffer;
-    }
-
-    // Convert audio data to TypedArray if it's a regular Array
-    const samples = Array.isArray(audioData) ? new Float32Array(audioData) : audioData;
-
-    // Create WAV blob
-    const wavBuffer = encodeWAV(samples);
-    return new Blob([wavBuffer], { type: 'audio/wav' });
-}
-
 // Downsample audio buffer to target sample rate
 function downsampleBuffer(buffer, originalSampleRate, targetSampleRate) {
     if (originalSampleRate === targetSampleRate) {
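
A note on the WAV framing that the relocated createWavBlob() owns: its header constants are internally consistent for mono 16-bit PCM, since block align = channels x bytes per sample = 1 x 2 = 2 and byte rate = sampleRate x block align = 24000 x 2 = 48000, which is exactly what offsets 32 and 28 receive. One quick way to confirm a capture end to end is to parse the 44-byte header of the debug_incoming.wav dump that server.py now writes; this is a sketch only, and the file path and expected 24 kHz mono values are assumptions from this patch:

    import struct
    import wave

    # Parse the fixed 44-byte RIFF header written by encodeWAV()
    with open("debug_incoming.wav", "rb") as f:
        header = f.read(44)

    riff, _, wave_id = struct.unpack("<4sI4s", header[:12])
    audio_fmt, channels, rate, byte_rate, block_align, bits = struct.unpack(
        "<HHIIHH", header[20:36]
    )
    assert (riff, wave_id) == (b"RIFF", b"WAVE")
    print(audio_fmt, channels, rate, byte_rate, block_align, bits)
    # expected: 1 1 24000 48000 2 16

    # The stdlib wave module applies the same sanity checks while opening
    with wave.open("debug_incoming.wav", "rb") as w:
        print(w.getnchannels(), w.getframerate(), w.getsampwidth() * 8)
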