Demo Update 3

2025-03-29 23:54:02 -04:00
parent b74ae2dbfc
commit eef7da454a
2 changed files with 320 additions and 112 deletions

View File

@@ -16,6 +16,28 @@ import gc
from collections import deque
from threading import Lock
# Add these lines right after your imports
import torch
import os
# Handle CUDA issues
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit to first GPU only
torch.backends.cudnn.benchmark = True
# Set CUDA settings to avoid TF32 warnings
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Set compute type based on available hardware
if torch.cuda.is_available():
device = "cuda"
compute_type = "float16" # Faster for CUDA
else:
device = "cpu"
compute_type = "int8" # Better for CPU
print(f"Using device: {device} with compute type: {compute_type}")
# Select device
if torch.cuda.is_available():
device = "cuda"
@@ -28,9 +50,22 @@ generator = load_csm_1b(device=device)
# Initialize WhisperX for ASR
print("Loading WhisperX model...")
try:
# Try to load a smaller model for faster response times
asr_model = whisperx.load_model("small", device, compute_type=compute_type)
print("WhisperX 'small' model loaded successfully")
except Exception as e:
print(f"Error loading 'small' model: {str(e)}")
try:
# Fall back to tiny model if small fails
asr_model = whisperx.load_model("tiny", device, compute_type=compute_type)
print("WhisperX 'tiny' model loaded as fallback")
except Exception as e2:
print(f"Error loading fallback model: {str(e2)}")
print("Trying CPU model as last resort")
# Last resort - try CPU
asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8")
print("WhisperX loaded on CPU as last resort")
# Silence detection parameters
SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization
@@ -53,76 +88,130 @@ active_clients = {} # Map client_id to client context
# Helper function to convert audio data
def decode_audio_data(audio_data: str) -> torch.Tensor:
"""Decode base64 audio data to a torch tensor with improved error handling"""
try:
# Skip empty audio data
if not audio_data or len(audio_data) < 100:
print("Empty or too short audio data received")
return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence
# Extract the actual base64 content
if ',' in audio_data:
# Handle data URL format (data:audio/wav;base64,...)
audio_data = audio_data.split(',')[1]
# Decode base64 audio data
try:
binary_data = base64.b64decode(audio_data)
print(f"Decoded base64 data: {len(binary_data)} bytes")
# Check if we have enough data for a valid WAV
if len(binary_data) < 44: # WAV header is 44 bytes
print("Data too small to be a valid WAV file")
return torch.zeros(generator.sample_rate // 2)
except Exception as e:
print(f"Base64 decoding error: {str(e)}")
return torch.zeros(generator.sample_rate // 2)
# Save for debugging
debug_path = os.path.join(base_dir, "debug_incoming.wav")
with open(debug_path, 'wb') as f:
f.write(binary_data)
print(f"Saved debug file: {debug_path}")
# Approach 1: Load directly with torchaudio
try:
with BytesIO(binary_data) as temp_file:
temp_file.seek(0) # Ensure we're at the start of the buffer
audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")
print(f"Direct loading success: shape={audio_tensor.shape}, rate={sample_rate}Hz")
# Check if audio is valid
if audio_tensor.numel() == 0 or torch.isnan(audio_tensor).any():
raise ValueError("Empty or invalid audio tensor detected")
except Exception as e:
print(f"Direct loading failed: {str(e)}")
# Approach 2: Try to fix/normalize the WAV data
try:
# Sometimes WAV headers can be malformed, attempt to fix
temp_path = os.path.join(base_dir, "temp_fixing.wav")
with open(temp_path, 'wb') as f:
f.write(binary_data)
# Use a simpler numpy approach as backup
import numpy as np
import wave
try:
with wave.open(temp_path, 'rb') as wf:
n_channels = wf.getnchannels()
sample_width = wf.getsampwidth()
sample_rate = wf.getframerate()
n_frames = wf.getnframes()
# Read the frames
frames = wf.readframes(n_frames)
print(f"Wave reading: channels={n_channels}, rate={sample_rate}Hz, frames={n_frames}")
# Convert to numpy and then to torch
if sample_width == 2: # 16-bit audio
data = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0
elif sample_width == 1: # 8-bit audio
data = np.frombuffer(frames, dtype=np.uint8).astype(np.float32) / 128.0 - 1.0
else:
raise ValueError(f"Unsupported sample width: {sample_width}")
# Convert to mono if needed
if n_channels > 1:
data = data.reshape(-1, n_channels)
data = data.mean(axis=1)
# Convert to torch tensor
audio_tensor = torch.from_numpy(data)
print(f"Successfully converted with numpy: shape={audio_tensor.shape}")
except Exception as wave_error:
print(f"Wave processing failed: {str(wave_error)}")
# Try with torchaudio as last resort
audio_tensor, sample_rate = torchaudio.load(temp_path, format="wav")
# Clean up
if os.path.exists(temp_path):
os.remove(temp_path)
except Exception as e2:
print(f"All WAV loading methods failed: {str(e2)}")
print("Returning silence as fallback")
return torch.zeros(generator.sample_rate // 2)
# Ensure audio is the right shape (mono)
if len(audio_tensor.shape) > 1 and audio_tensor.shape[0] > 1:
audio_tensor = torch.mean(audio_tensor, dim=0)
# Ensure we have a 1D tensor
audio_tensor = audio_tensor.squeeze()
# Resample if needed
if sample_rate != generator.sample_rate:
try:
print(f"Resampling from {sample_rate}Hz to {generator.sample_rate}Hz")
resampler = torchaudio.transforms.Resample(
orig_freq=sample_rate,
new_freq=generator.sample_rate
)
audio_tensor = resampler(audio_tensor)
except Exception as e:
print(f"Resampling error: {str(e)}")
# If resampling fails, just return the original audio
# The model can often handle different sample rates
# Normalize audio to avoid issues
if torch.abs(audio_tensor).max() > 0:
audio_tensor = audio_tensor / torch.abs(audio_tensor).max()
print(f"Final audio tensor: shape={audio_tensor.shape}, min={audio_tensor.min().item():.4f}, max={audio_tensor.max().item():.4f}")
return audio_tensor
except Exception as e:
print(f"Unhandled error in decode_audio_data: {str(e)}")
# Return a small silent audio segment as fallback
return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence
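
For reference, the happy path of decode_audio_data can be exercised end to end without a browser. The sketch below is standalone and assumes only torchaudio: it builds a short sine wave, encodes it as a base64 data URL the way the client would, and decodes it back with the same BytesIO + torchaudio.load approach, followed by the mono collapse and peak normalization used above.

import base64
from io import BytesIO

import torch
import torchaudio

sample_rate = 24000
t = torch.arange(sample_rate) / sample_rate
tone = 0.5 * torch.sin(2 * torch.pi * 440.0 * t)  # 1 second of a 440 Hz tone

# Encode: tensor -> WAV bytes -> base64 data URL (what the client sends)
buf = BytesIO()
torchaudio.save(buf, tone.unsqueeze(0), sample_rate, format="wav")
data_url = "data:audio/wav;base64," + base64.b64encode(buf.getvalue()).decode("ascii")

# Decode: mirrors the happy path of decode_audio_data
payload = data_url.split(",")[1]
wav_bytes = base64.b64decode(payload)
decoded, rate = torchaudio.load(BytesIO(wav_bytes), format="wav")
decoded = decoded.mean(dim=0)            # collapse to mono
decoded = decoded / decoded.abs().max()  # peak-normalize
print(decoded.shape, rate)               # ~24000 samples at 24000 Hz
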
@@ -143,6 +232,8 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str:
temp_path = os.path.join(base_dir, "temp_audio.wav")
torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate)
print(f"Transcribing audio file: {temp_path} (size: {os.path.getsize(temp_path)} bytes)")
# Load and transcribe the audio
audio = whisperx.load_audio(temp_path)
result = asr_model.transcribe(audio, batch_size=16)
@@ -155,11 +246,15 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str:
if result["segments"] and len(result["segments"]) > 0: if result["segments"] and len(result["segments"]) > 0:
# Combine all segments # Combine all segments
transcription = " ".join([segment["text"] for segment in result["segments"]]) transcription = " ".join([segment["text"] for segment in result["segments"]])
print(f"Transcription successful: '{transcription.strip()}'")
return transcription.strip()
else:
print("Transcription returned no segments")
return "" return ""
except Exception as e: except Exception as e:
print(f"Error in transcription: {str(e)}") print(f"Error in transcription: {str(e)}")
import traceback
traceback.print_exc()
if os.path.exists("temp_audio.wav"):
os.remove("temp_audio.wav")
return ""
@@ -385,43 +480,73 @@ def handle_stream_audio(data):
# Log the transcription
print(f"[{client_id}] Transcribed text: '{transcribed_text}'")
# Handle the transcription result
if transcribed_text:
# Add user message to context
user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
client['context_segments'].append(user_segment)
# Send the transcribed text to client
emit('transcription', {
'type': 'transcription',
'text': transcribed_text
})
# Generate a contextual response
response_text = generate_response(transcribed_text, client['context_segments'])
print(f"[{client_id}] Generating audio response: '{response_text}'")
# Let the client know we're processing
emit('processing_status', {
'type': 'processing_status',
'status': 'generating_audio',
'message': 'Generating audio response...'
})
# Generate audio for the response
try:
# Use a different speaker than the user
ai_speaker_id = 1 if speaker_id == 0 else 0
# Start audio generation with streaming (chunk by chunk)
audio_chunks = []
# This version tries to stream the audio generation in smaller chunks
# Note: CSM model doesn't natively support incremental generation,
# so we're simulating it here for a more responsive UI experience
# Generate the full response
audio_tensor = generator.generate(
text=response_text,
speaker=ai_speaker_id,
context=client['context_segments'],
max_audio_length_ms=10_000,
)
# Add response to context
ai_segment = Segment(
text=response_text,
speaker=ai_speaker_id,
audio=audio_tensor
)
client['context_segments'].append(ai_segment)
# Convert audio to base64 and send back to client
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'text': response_text,
'audio': audio_base64
})
print(f"[{client_id}] Audio response sent: {len(audio_base64)} bytes")
except Exception as gen_error:
print(f"Error generating audio response: {str(gen_error)}")
emit('error', {
'type': 'error',
'message': "Sorry, there was an error generating the audio response."
})
else:
# If transcription failed, send a generic response
emit('error', {
@@ -437,6 +562,7 @@ def handle_stream_audio(data):
# If buffer gets too large without silence, process it anyway
elif len(client['streaming_buffer']) >= 30: # ~6 seconds of audio at 5 chunks/sec
print(f"[{client_id}] Processing long audio segment without silence")
full_audio = torch.cat(client['streaming_buffer'], dim=0)
# Process with WhisperX speech-to-text
@@ -453,7 +579,9 @@ def handle_stream_audio(data):
'text': transcribed_text + " (processing continued speech...)"
})
# Keep half of the buffer for context (sliding window approach)
half_point = len(client['streaming_buffer']) // 2
client['streaming_buffer'] = client['streaming_buffer'][half_point:]
except Exception as e:
import traceback
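
The sliding-window trim above keeps the most recent half of the chunk buffer so the next transcription still carries some leading context. A tiny sketch of the same idea, using integers in place of audio chunk tensors.

buffer = list(range(30))        # stand-in for ~6 seconds of audio chunks
half_point = len(buffer) // 2
buffer = buffer[half_point:]    # keep the most recent half as context
print(len(buffer))              # 15 chunks carried into the next window
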
@@ -497,6 +625,62 @@ def handle_stop_streaming(data):
'status': 'stopped'
})
def stream_audio_to_client(client_id, audio_tensor, text, speaker_id, chunk_size_ms=500):
"""Stream audio to client in chunks to simulate real-time generation"""
try:
if client_id not in active_clients:
print(f"Client {client_id} not found for streaming")
return
# Calculate chunk size in samples
chunk_size = int(generator.sample_rate * chunk_size_ms / 1000)
total_chunks = math.ceil(audio_tensor.size(0) / chunk_size)
print(f"Streaming audio in {total_chunks} chunks of {chunk_size_ms}ms each")
# Send initial response with text but no audio yet
socketio.emit('audio_response_start', {
'type': 'audio_response_start',
'text': text,
'total_chunks': total_chunks
}, room=client_id)
# Stream each chunk
for i in range(total_chunks):
start_idx = i * chunk_size
end_idx = min(start_idx + chunk_size, audio_tensor.size(0))
# Extract chunk
chunk = audio_tensor[start_idx:end_idx]
# Encode chunk
chunk_base64 = encode_audio_data(chunk)
# Send chunk
socketio.emit('audio_response_chunk', {
'type': 'audio_response_chunk',
'chunk_index': i,
'total_chunks': total_chunks,
'audio': chunk_base64,
'is_last': i == total_chunks - 1
}, room=client_id)
# Brief pause between chunks to simulate streaming
time.sleep(0.1)
# Send completion message
socketio.emit('audio_response_complete', {
'type': 'audio_response_complete',
'text': text
}, room=client_id)
print(f"Audio streaming complete: {total_chunks} chunks sent")
except Exception as e:
print(f"Error streaming audio to client: {str(e)}")
import traceback
traceback.print_exc()
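
The chunk arithmetic in stream_audio_to_client is worth a quick worked example. Assuming the 24 kHz output rate used elsewhere in this demo and the default 500 ms chunks, a 3.2 s response splits into 7 chunks, the last one partial.

import math
import torch

sample_rate = 24000                                    # assumed CSM output rate
chunk_size_ms = 500
chunk_size = int(sample_rate * chunk_size_ms / 1000)   # 12000 samples per chunk
audio = torch.zeros(int(sample_rate * 3.2))            # a 3.2 s response = 76800 samples
total_chunks = math.ceil(audio.size(0) / chunk_size)   # ceil(76800 / 12000) = 7
chunks = [audio[i * chunk_size:(i + 1) * chunk_size] for i in range(total_chunks)]
print(total_chunks, [c.size(0) for c in chunks])       # 7, last chunk has 4800 samples
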
if __name__ == "__main__":
print(f"\n{'='*60}")
print(f"🔊 Sesame AI Voice Chat Server (Flask Implementation)")

View File

@@ -466,37 +466,27 @@ function sendAudioChunk(audioData, speaker) {
return;
}
console.log(`Preparing audio chunk: length=${audioData.length}, speaker=${speaker}`);
// Check for NaN or invalid values
let hasInvalidValues = false;
for (let i = 0; i < audioData.length; i++) {
if (isNaN(audioData[i]) || !isFinite(audioData[i])) {
hasInvalidValues = true;
console.warn(`Invalid audio value at index ${i}: ${audioData[i]}`);
break;
}
}
if (hasInvalidValues) {
console.warn('Audio data contains invalid values. Creating silent audio.');
audioData = new Float32Array(audioData.length).fill(0);
}
try {
// Create WAV blob
const wavData = createWavBlob(audioData, 24000);
console.log(`WAV blob created: ${wavData.size} bytes`);
const reader = new FileReader();
@@ -504,28 +494,21 @@ function sendAudioChunk(audioData, speaker) {
try {
// Get base64 data
const base64data = reader.result;
console.log(`Base64 data created: ${base64data.length} bytes`);
// Send to server
state.socket.emit('stream_audio', {
audio: base64data,
speaker: speaker
});
console.log('Audio chunk sent to server');
} catch (err) {
console.error('Error preparing audio data:', err);
}
};
reader.onerror = function() {
console.error('Error reading audio data as base64');
};
reader.readAsDataURL(wavData);
@@ -534,19 +517,20 @@ function sendAudioChunk(audioData, speaker) {
}
}
// Create WAV blob from audio data with improved error handling
function createWavBlob(audioData, sampleRate) {
// Validate input
if (!audioData || audioData.length === 0) {
console.warn('Empty audio data provided to createWavBlob');
audioData = new Float32Array(1024).fill(0); // Create 1024 samples of silence
}
// Function to convert Float32Array to Int16Array for WAV format
function floatTo16BitPCM(output, offset, input) {
for (let i = 0; i < input.length; i++, offset += 2) {
// Ensure values are in -1 to 1 range
const s = Math.max(-1, Math.min(1, input[i]));
// Convert to 16-bit PCM
output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
}
}
@@ -558,40 +542,80 @@ function createWavBlob(audioData, sampleRate) {
}
}
try {
// Create WAV file with header - careful with buffer sizes
const buffer = new ArrayBuffer(44 + audioData.length * 2);
const view = new DataView(buffer);
// RIFF identifier
writeString(view, 0, 'RIFF');
// File length (will be filled later)
view.setUint32(4, 36 + audioData.length * 2, true);
// WAVE identifier
writeString(view, 8, 'WAVE');
// fmt chunk identifier
writeString(view, 12, 'fmt ');
// fmt chunk length
view.setUint32(16, 16, true);
// Sample format (1 is PCM)
view.setUint16(20, 1, true);
// Mono channel
view.setUint16(22, 1, true);
// Sample rate
view.setUint32(24, sampleRate, true);
// Byte rate (sample rate * block align)
view.setUint32(28, sampleRate * 2, true);
// Block align (channels * bytes per sample)
view.setUint16(32, 2, true);
// Bits per sample
view.setUint16(34, 16, true);
// data chunk identifier
writeString(view, 36, 'data');
// data chunk length
view.setUint32(40, audioData.length * 2, true);
// Write the PCM samples
floatTo16BitPCM(view, 44, audioData);
// Create and return blob
return new Blob([view], { type: 'audio/wav' });
} catch (err) {
console.error('Error in createWavBlob:', err);
// Create a minimal valid WAV file with silence as fallback
const fallbackSamples = new Float32Array(1024).fill(0);
const fallbackBuffer = new ArrayBuffer(44 + fallbackSamples.length * 2);
const fallbackView = new DataView(fallbackBuffer);
writeString(fallbackView, 0, 'RIFF');
fallbackView.setUint32(4, 36 + fallbackSamples.length * 2, true);
writeString(fallbackView, 8, 'WAVE');
writeString(fallbackView, 12, 'fmt ');
fallbackView.setUint32(16, 16, true);
fallbackView.setUint16(20, 1, true);
fallbackView.setUint16(22, 1, true);
fallbackView.setUint32(24, sampleRate, true);
fallbackView.setUint32(28, sampleRate * 2, true);
fallbackView.setUint16(32, 2, true);
fallbackView.setUint16(34, 16, true);
writeString(fallbackView, 36, 'data');
fallbackView.setUint32(40, fallbackSamples.length * 2, true);
floatTo16BitPCM(fallbackView, 44, fallbackSamples);
return new Blob([fallbackView], { type: 'audio/wav' });
}
}
// Draw audio visualizer