diff --git a/Backend/index.html b/Backend/index.html
index 539fe09..309364f 100644
--- a/Backend/index.html
+++ b/Backend/index.html
@@ -1,4 +1,3 @@
-/index.html
@@ -93,6 +92,16 @@
         let mediaRecorder;
         let audioChunks = [];
         let isRecording = false;
+        let audioContext;
+        let streamProcessor;
+        let isStreaming = false;
+        let streamButton;
+        let isSpeaking = false;
+        let silenceTimer = null;
+        let energyWindow = [];
+        const ENERGY_WINDOW_SIZE = 10;
+        const CLIENT_SILENCE_THRESHOLD = 0.01;
+        const CLIENT_SILENCE_DURATION_MS = 1000; // 1 second
 
         // DOM elements
         const conversationEl = document.getElementById('conversation');
@@ -102,6 +111,271 @@
         const recordAudioBtn = document.getElementById('recordAudio');
         const clearContextBtn = document.getElementById('clearContext');
 
+        // Add streaming button to the input row
+        window.addEventListener('load', () => {
+            const inputRow = document.querySelector('.input-row:nth-child(2)');
+            streamButton = document.createElement('button');
+            streamButton.id = 'streamAudio';
+            streamButton.textContent = 'Start Streaming';
+            streamButton.addEventListener('click', toggleStreaming);
+            inputRow.appendChild(streamButton);
+
+            connectWebSocket();
+            setupRecording();
+            setupAudioContext();
+        });
+
+        // Setup audio context for streaming
+        function setupAudioContext() {
+            try {
+                audioContext = new (window.AudioContext || window.webkitAudioContext)();
+                console.log('Audio context setup completed');
+            } catch (err) {
+                console.error('Error setting up audio context:', err);
+                addSystemMessage(`Audio context error: ${err.message}`);
+            }
+        }
+
+        // Toggle audio streaming
+        async function toggleStreaming() {
+            if (isStreaming) {
+                stopStreaming();
+            } else {
+                startStreaming();
+            }
+        }
+
+        // Start audio streaming with silence detection
+        async function startStreaming() {
+            try {
+                const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+                const speaker = parseInt(speakerSelectEl.value);
+
+                isStreaming = true;
+                isSpeaking = false;
+                energyWindow = [];
+
+                streamButton.textContent = 'Speaking...';
+                streamButton.classList.add('recording');
+
+                // Create audio processor node
+                const source = audioContext.createMediaStreamSource(stream);
+                streamProcessor = audioContext.createScriptProcessor(4096, 1, 1);
+
+                // Process and send audio data
+                streamProcessor.onaudioprocess = function(e) {
+                    const audioData = e.inputBuffer.getChannelData(0);
+
+                    // Calculate energy (volume) for silence detection
+                    const energy = calculateAudioEnergy(audioData);
+                    updateEnergyWindow(energy);
+
+                    // Check if currently silent
+                    const avgEnergy = calculateAverageEnergy();
+                    const isSilent = avgEnergy < CLIENT_SILENCE_THRESHOLD;
+
+                    // Handle silence/speech transitions for visual feedback
+                    handleSpeechState(isSilent);
+
+                    // Continue processing audio regardless of silence state
+                    const downsampled = downsampleBuffer(audioData, audioContext.sampleRate, 24000);
+                    sendAudioChunk(downsampled, speaker);
+                };
+
+                // Connect the nodes
+                source.connect(streamProcessor);
+                streamProcessor.connect(audioContext.destination);
+
+                addSystemMessage('Audio streaming started - speak naturally and pause when finished');
+
+            } catch (err) {
+                console.error('Error starting audio stream:', err);
+                addSystemMessage(`Streaming error: ${err.message}`);
+                isStreaming = false;
+                streamButton.textContent = 'Start Streaming';
+                streamButton.classList.remove('recording');
+            }
+        }
+
+        // Calculate audio energy (volume)
+        function calculateAudioEnergy(buffer) {
+            let sum = 0;
+            for (let i = 0; i < buffer.length; i++) {
+                sum += Math.abs(buffer[i]);
+            }
+            return sum / buffer.length;
+        }
+
+        // Update the sliding energy window
+        function updateEnergyWindow(energy) {
+            energyWindow.push(energy);
+            if (energyWindow.length > ENERGY_WINDOW_SIZE) {
+                energyWindow.shift();
+            }
+        }
+
+        // Calculate average energy from the window
+        function calculateAverageEnergy() {
+            if (energyWindow.length === 0) return 0;
+            return energyWindow.reduce((sum, val) => sum + val, 0) / energyWindow.length;
+        }
+
+        // Handle speech state changes and visual feedback
+        function handleSpeechState(isSilent) {
+            if (isSpeaking && isSilent) {
+                // Transition from speaking to silence
+                if (!silenceTimer) {
+                    silenceTimer = setTimeout(() => {
+                        // Silence persisted long enough
+                        streamButton.textContent = 'Processing...';
+                        streamButton.style.backgroundColor = '#FFA500'; // Orange
+                        addSystemMessage('Detected pause in speech, processing response...');
+                    }, CLIENT_SILENCE_DURATION_MS);
+                }
+            } else if (!isSpeaking && !isSilent) {
+                // Transition from silence to speaking
+                isSpeaking = true;
+                streamButton.textContent = 'Speaking...';
+                streamButton.style.backgroundColor = '#f44336'; // Red
+
+                // Clear any pending silence timer
+                if (silenceTimer) {
+                    clearTimeout(silenceTimer);
+                    silenceTimer = null;
+                }
+            } else if (isSpeaking && !isSilent) {
+                // Still speaking, reset any silence timer
+                if (silenceTimer) {
+                    clearTimeout(silenceTimer);
+                    silenceTimer = null;
+                }
+            }
+
+            // Update speaking state
+            if (!isSilent) {
+                isSpeaking = true;
+            }
+        }
+
+        // Send audio chunk to server
+        function sendAudioChunk(audioData, speaker) {
+            const wavData = createWavBlob(audioData, 24000);
+            const reader = new FileReader();
+
+            reader.onloadend = function() {
+                const base64data = reader.result;
+
+                // Send to server
+                ws.send(JSON.stringify({
+                    action: 'stream_audio',
+                    speaker: speaker,
+                    audio: base64data
+                }));
+            };
+
+            reader.readAsDataURL(wavData);
+        }
+
+        // Stop audio streaming
+        function stopStreaming() {
+            if (streamProcessor) {
+                streamProcessor.disconnect();
+                streamProcessor = null;
+            }
+
+            // Clear any pending silence timer
+            if (silenceTimer) {
+                clearTimeout(silenceTimer);
+                silenceTimer = null;
+            }
+
+            isStreaming = false;
+            isSpeaking = false;
+            energyWindow = [];
+
+            streamButton.textContent = 'Start Streaming';
+            streamButton.classList.remove('recording');
+            streamButton.style.backgroundColor = ''; // Reset to default
+
+            addSystemMessage('Audio streaming stopped');
+
+            // Send stop streaming signal to server
+            ws.send(JSON.stringify({
+                action: 'stop_streaming',
+                speaker: parseInt(speakerSelectEl.value)
+            }));
+        }
+
+        // Downsample audio buffer to target sample rate
+        function downsampleBuffer(buffer, sampleRate, targetSampleRate) {
+            if (targetSampleRate === sampleRate) {
+                return buffer;
+            }
+
+            const sampleRateRatio = sampleRate / targetSampleRate;
+            const newLength = Math.round(buffer.length / sampleRateRatio);
+            const result = new Float32Array(newLength);
+
+            let offsetResult = 0;
+            let offsetBuffer = 0;
+
+            while (offsetResult < result.length) {
+                const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
+                let accum = 0, count = 0;
+
+                for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
+                    accum += buffer[i];
+                    count++;
+                }
+
+                result[offsetResult] = accum / count;
+                offsetResult++;
+                offsetBuffer = nextOffsetBuffer;
+            }
+
+            return result;
+        }
+
+        // Create WAV blob from Float32Array
+        function createWavBlob(samples, sampleRate) {
+            const buffer = new ArrayBuffer(44 + samples.length * 2);
+            const view = new DataView(buffer);
+
+            // RIFF chunk descriptor
+            writeString(view, 0, 'RIFF');
+            view.setUint32(4, 36 + samples.length * 2, true);
+            writeString(view, 8, 'WAVE');
+
+            // fmt sub-chunk
+            writeString(view, 12, 'fmt ');
+            view.setUint32(16, 16, true);
+            view.setUint16(20, 1, true); // PCM format
+            view.setUint16(22, 1, true); // Mono channel
+            view.setUint32(24, sampleRate, true);
+            view.setUint32(28, sampleRate * 2, true);
+            view.setUint16(32, 2, true);
+            view.setUint16(34, 16, true);
+
+            // data sub-chunk
+            writeString(view, 36, 'data');
+            view.setUint32(40, samples.length * 2, true);
+
+            // Write the PCM samples
+            const volume = 0.5;
+            for (let i = 0; i < samples.length; i++) {
+                const sample = Math.max(-1, Math.min(1, samples[i]));
+                view.setInt16(44 + i * 2, sample < 0 ? sample * 0x8000 : sample * 0x7FFF, true);
+            }
+
+            return new Blob([buffer], { type: 'audio/wav' });
+        }
+
+        function writeString(view, offset, string) {
+            for (let i = 0; i < string.length; i++) {
+                view.setUint8(offset + i, string.charCodeAt(i));
+            }
+        }
+
         // Connect to WebSocket
         function connectWebSocket() {
             const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
@@ -125,10 +399,19 @@
                     // Add message to conversation
                    addAIMessage(response.audio);
+
+                    // Reset the streaming button if we're still in streaming mode
+                    if (isStreaming) {
+                        streamButton.textContent = 'Speaking...';
+                        streamButton.style.backgroundColor = '#f44336'; // Back to red
+                        isSpeaking = false; // Reset speaking state
+                    }
                 } else if (response.type === 'error') {
                     addSystemMessage(`Error: ${response.message}`);
                 } else if (response.type === 'context_updated') {
                     addSystemMessage(response.message);
+                } else if (response.type === 'streaming_status') {
+                    addSystemMessage(`Streaming ${response.status}`);
                 }
             };
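Note on the chunk format: `sendAudioChunk` forwards `reader.result` from `FileReader.readAsDataURL`, so every `stream_audio` message carries a full data URL (`data:audio/wav;base64,...`), not bare base64. The repository's `decode_audio_data` helper is defined earlier in server.py and is not shown in this diff, so whether it already strips that prefix is unverified here. Below is a minimal sketch of a decoder that would accept these chunks, assuming torchaudio is available; the function body is illustrative, not the repo's actual implementation.

```python
import base64
import io

import torch
import torchaudio


async def decode_audio_data(audio_data: str) -> torch.Tensor:
    """Illustrative decoder for the data-URL WAV chunks produced by sendAudioChunk."""
    # Strip the "data:audio/wav;base64," prefix added by readAsDataURL.
    if "," in audio_data:
        audio_data = audio_data.split(",", 1)[1]
    wav_bytes = base64.b64decode(audio_data)
    # torchaudio.load accepts a file-like object and returns (channels, samples).
    waveform, sample_rate = torchaudio.load(io.BytesIO(wav_bytes))
    waveform = waveform.squeeze(0)  # the client sends mono 16-bit PCM
    if sample_rate != 24000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 24000)
    return waveform
```

The client already downsamples to 24 kHz before encoding, so the resample branch is only a safety net.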
diff --git a/Backend/server.py b/Backend/server.py
index e8ed1ae..bfdc590 100644
--- a/Backend/server.py
+++ b/Backend/server.py
@@ -12,6 +12,8 @@ from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from generator import load_csm_1b, Segment
 import uvicorn
+import time
+from collections import deque
 
 # Select device
 if torch.cuda.is_available():
@@ -48,6 +50,9 @@ class ConnectionManager:
 
 manager = ConnectionManager()
 
+# Silence detection parameters
+SILENCE_THRESHOLD = 0.01  # Adjust based on your audio normalization
+SILENCE_DURATION_SEC = 1.0  # How long silence must persist to be considered "stopped talking"
 
 # Helper function to convert audio data
 async def decode_audio_data(audio_data: str) -> torch.Tensor:
@@ -92,6 +97,13 @@ async def encode_audio_data(audio_tensor: torch.Tensor) -> str:
 async def websocket_endpoint(websocket: WebSocket):
     await manager.connect(websocket)
     context_segments = []  # Store conversation context
+    streaming_buffer = []  # Buffer for streaming audio chunks
+    is_streaming = False
+
+    # Variables for silence detection
+    last_active_time = time.time()
+    is_silence = False
+    energy_window = deque(maxlen=10)  # For tracking recent audio energy
 
     try:
         while True:
@@ -160,6 +172,114 @@ async def websocket_endpoint(websocket: WebSocket):
                     "type": "context_updated",
                     "message": "Context cleared"
                 })
+
+            elif action == "stream_audio":
+                try:
+                    speaker_id = request.get("speaker", 0)
+                    audio_data = request.get("audio", "")
+
+                    # Convert received audio to tensor
+                    audio_chunk = await decode_audio_data(audio_data)
+
+                    # Start streaming mode if not already started
+                    if not is_streaming:
+                        is_streaming = True
+                        streaming_buffer = []
+                        energy_window.clear()
+                        is_silence = False
+                        last_active_time = time.time()
+                        await websocket.send_json({
+                            "type": "streaming_status",
+                            "status": "started"
+                        })
+
+                    # Calculate audio energy for silence detection
+                    chunk_energy = torch.mean(torch.abs(audio_chunk)).item()
+                    energy_window.append(chunk_energy)
+                    avg_energy = sum(energy_window) / len(energy_window)
+
+                    # Check if audio is silent
+                    current_silence = avg_energy < SILENCE_THRESHOLD
+
+                    # Track silence transition
+                    if not is_silence and current_silence:
+                        # Transition to silence
+                        is_silence = True
+                        last_active_time = time.time()
+                    elif is_silence and not current_silence:
+                        # User started talking again
+                        is_silence = False
+
+                    # Add chunk to buffer regardless of silence state
+                    streaming_buffer.append(audio_chunk)
+
+                    # Check if silence has persisted long enough to consider "stopped talking"
+                    silence_elapsed = time.time() - last_active_time
+
+                    if is_silence and silence_elapsed >= SILENCE_DURATION_SEC and len(streaming_buffer) > 0:
+                        # User has stopped talking - process the collected audio
+                        full_audio = torch.cat(streaming_buffer, dim=0)
+
+                        # Process with speech-to-text (you would need to implement this)
+                        # For now, just use a placeholder text
+                        text = f"User audio from speaker {speaker_id}"
+
+                        print(f"Detected end of speech, processing {len(streaming_buffer)} chunks")
+
+                        # Add to conversation context
+                        context_segments.append(Segment(text=text, speaker=speaker_id, audio=full_audio))
+
+                        # Generate response
+                        response_text = "This is a response to what you just said"
+                        audio_tensor = generator.generate(
+                            text=response_text,
+                            speaker=1 if speaker_id == 0 else 0,  # Use opposite speaker
+                            context=context_segments,
+                            max_audio_length_ms=10_000,
+                        )
+
+                        # Convert audio to base64 and send back to client
+                        audio_base64 = await encode_audio_data(audio_tensor)
+                        await websocket.send_json({
+                            "type": "audio_response",
+                            "audio": audio_base64
+                        })
+
+                        # Clear buffer and reset silence detection
+                        streaming_buffer = []
+                        energy_window.clear()
+                        is_silence = False
+                        last_active_time = time.time()
+
+                    # If buffer gets too large without silence, process it anyway
+                    # This prevents memory issues with very long streams
+                    elif len(streaming_buffer) >= 30:  # ~6 seconds of audio at 5 chunks/sec
+                        print("Buffer limit reached, processing audio")
+                        full_audio = torch.cat(streaming_buffer, dim=0)
+                        text = f"Continued speech from speaker {speaker_id}"
+                        context_segments.append(Segment(text=text, speaker=speaker_id, audio=full_audio))
+                        streaming_buffer = []
+
+                except Exception as e:
+                    print(f"Error processing streaming audio: {str(e)}")
+                    await websocket.send_json({
+                        "type": "error",
+                        "message": f"Error processing streaming audio: {str(e)}"
+                    })
+
+            elif action == "stop_streaming":
+                is_streaming = False
+                if streaming_buffer:
+                    # Process any remaining audio in the buffer
+                    full_audio = torch.cat(streaming_buffer, dim=0)
+                    text = f"Final streaming audio from speaker {request.get('speaker', 0)}"
+                    context_segments.append(Segment(text=text, speaker=request.get("speaker", 0), audio=full_audio))
+
+                    streaming_buffer = []
+                await websocket.send_json({
+                    "type": "streaming_status",
+                    "status": "stopped"
+                })
 
     except WebSocketDisconnect:
         manager.disconnect(websocket)
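Note on the speech-to-text placeholder: inside the `stream_audio` handler the transcript is hard-coded (`text = f"User audio from speaker {speaker_id}"`), so the conversation context never records what the user actually said. One way to close that gap, sketched here with the openai-whisper package (an assumption, not a dependency introduced by this diff), is to transcribe the concatenated buffer before appending the Segment:

```python
import torch
import torchaudio
import whisper  # assumption: pip install openai-whisper

# Load once at startup, next to the CSM generator, not per request.
stt_model = whisper.load_model("base")


def transcribe_buffer(full_audio: torch.Tensor, sample_rate: int = 24000) -> str:
    """Turn the concatenated streaming buffer into text for the conversation context."""
    # Whisper expects 16 kHz mono float32 when handed a raw waveform.
    audio_16k = torchaudio.functional.resample(full_audio, sample_rate, 16000)
    result = stt_model.transcribe(audio_16k.float().cpu().numpy(), fp16=False)
    return result["text"].strip()


# In the handler, replace the placeholder:
#     text = transcribe_buffer(full_audio)
```

The same call would fit the placeholder texts in the buffer-limit and stop_streaming branches; the hard-coded response_text is a separate gap and still needs a real dialogue step to produce a meaningful reply.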