merged call page

idler-wheel
2025-03-30 07:38:22 -04:00
4 changed files with 337 additions and 44 deletions

View File

@@ -93,15 +93,49 @@ def load_speech_models():
# Load Whisper model for speech recognition
try:
logger.info(f"Loading speech recognition model on {device}...")
speech_recognizer = pipeline("automatic-speech-recognition",
model="openai/whisper-small",
device=device)
# Try with newer API first
try:
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
model_id = "openai/whisper-small"
# Load model and processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(
model_id,
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
device_map=device,
)
processor = AutoProcessor.from_pretrained(model_id)
# Create pipeline with specific parameters
speech_recognizer = pipeline(
"automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
max_new_tokens=128,
chunk_length_s=30,
batch_size=16,
device=device,
)
except Exception as api_error:
logger.warning(f"Newer API loading failed: {api_error}, trying simpler approach")
# Fallback to simpler API
speech_recognizer = pipeline(
"automatic-speech-recognition",
model="openai/whisper-small",
device=device
)
logger.info("Speech recognition model loaded successfully") logger.info("Speech recognition model loaded successfully")
return generator, speech_recognizer
except Exception as e: except Exception as e:
logger.error(f"Error loading speech recognition model: {e}") logger.error(f"Error loading speech recognition model: {e}")
speech_recognizer = None return generator, None
return generator, speech_recognizer
# Unpack both models # Unpack both models
generator, speech_recognizer = load_speech_models() generator, speech_recognizer = load_speech_models()
@@ -308,9 +342,28 @@ def process_speech(audio_tensor: torch.Tensor, client_id: str) -> str:
temp_path = os.path.join(base_dir, f"temp_{time.time()}.wav")
torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate)
# Perform speech recognition
result = speech_recognizer(temp_path)
transcription = result["text"]
# Perform speech recognition - handle the warning differently
# Just pass the path without any additional parameters
try:
# First try - use default parameters
result = speech_recognizer(temp_path)
transcription = result["text"]
except Exception as whisper_error:
logger.warning(f"First transcription attempt failed: {whisper_error}")
# Try with explicit parameters for older versions of transformers
import numpy as np
import soundfile as sf
# Load audio as numpy array
audio_np, sr = sf.read(temp_path)
if sr != 16000:
# Whisper expects 16kHz audio
from scipy import signal
audio_np = signal.resample(audio_np, int(len(audio_np) * 16000 / sr))
# Try with numpy array directly
result = speech_recognizer(audio_np)
transcription = result["text"]
# Clean up temp file
if os.path.exists(temp_path):
@@ -320,6 +373,7 @@ def process_speech(audio_tensor: torch.Tensor, client_id: str) -> str:
if not transcription or transcription.isspace():
return "I didn't detect any speech. Could you please try again?"
logger.info(f"Transcription successful: '{transcription}'")
return transcription
except Exception as e:
@@ -650,7 +704,7 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=Fals
# Combine audio chunks
full_audio = torch.cat(client['streaming_buffer'], dim=0)
# Process audio to generate a response (no speech recognition)
# Process audio to generate a response (using speech recognition)
generated_text = process_speech(full_audio, client_id)
# Add suffix for incomplete utterances
@@ -706,16 +760,28 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=Fals
)
client['context_segments'].append(ai_segment)
# Convert audio to base64 and send back to client
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'text': response_text,
'audio': audio_base64
}, room=client_id)
logger.info(f"[{client_id[:8]}] Audio response sent")
# CHANGE HERE: Use the streaming function instead of sending all at once
# Check if the audio is short enough to send at once or if it should be streamed
if audio_tensor.size(0) < generator.sample_rate * 2: # Less than 2 seconds
# For short responses, just send in one go for better responsiveness
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'text': response_text,
'audio': audio_base64
}, room=client_id)
logger.info(f"[{client_id[:8]}] Short audio response sent in one piece")
else:
# For longer responses, use streaming
logger.info(f"[{client_id[:8]}] Using streaming for audio response")
# Start a new thread for streaming to avoid blocking the main thread
import threading
stream_thread = threading.Thread(
target=stream_audio_to_client,
args=(client_id, audio_tensor, response_text, ai_speaker_id)
)
stream_thread.start()
except Exception as e:
logger.error(f"Error generating audio response: {e}")
emit('error', {
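The streaming branch above hands the work to a `stream_audio_to_client` helper that this hunk does not show. The sketch below is not part of the commit; it is a minimal guess at its shape, assuming a module-level Flask-SocketIO instance named `socketio` (used because the helper runs in a background thread, where the request-bound `emit` is unavailable) and the existing `encode_audio_data` helper. Chunks are sent cumulatively, so the final chunk carries the whole response, which is the behaviour the client-side `finalizeStreamingAudio()` relies on.

def stream_audio_to_client(client_id, audio_tensor, response_text, speaker_id, chunk_seconds=2):
    """Hypothetical sketch: emit a long audio response to one client in chunks.
    `speaker_id` is accepted to match the call site but unused here."""
    chunk_samples = generator.sample_rate * chunk_seconds
    total_chunks = (audio_tensor.size(0) + chunk_samples - 1) // chunk_samples

    # Tell the client how many chunks to expect and which text accompanies them
    socketio.emit('audio_response_start', {
        'text': response_text,
        'total_chunks': total_chunks,
    }, room=client_id)

    for i in range(total_chunks):
        end = min((i + 1) * chunk_samples, audio_tensor.size(0))
        # Cumulative slice: chunk i contains everything up to this point, so the
        # last chunk is the complete response the client keeps for playback.
        chunk_b64 = encode_audio_data(audio_tensor[:end])
        socketio.emit('audio_response_chunk', {
            'chunk_index': i,
            'total_chunks': total_chunks,
            'audio': chunk_b64,
            'is_last': i == total_chunks - 1,
        }, room=client_id)

    socketio.emit('audio_response_complete', {'total_chunks': total_chunks}, room=client_id)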

View File

@@ -50,6 +50,20 @@ let canvasContext = null;
let visualizerBufferLength = 0;
let visualizerDataArray = null;
// New state variables to track incremental audio streaming
const streamingAudio = {
messageElement: null,
audioElement: null,
chunks: [],
totalChunks: 0,
receivedChunks: 0,
text: '',
mediaSource: null,
sourceBuffer: null,
audioContext: null,
complete: false
};
// Initialize the application
function initializeApp() {
// Initialize the UI elements
@@ -116,6 +130,12 @@ function setupSocketConnection() {
state.socket.on('transcription', handleTranscription);
state.socket.on('context_updated', handleContextUpdate);
state.socket.on('streaming_status', handleStreamingStatus);
// New event handlers for incremental audio streaming
state.socket.on('audio_response_start', handleAudioResponseStart);
state.socket.on('audio_response_chunk', handleAudioResponseChunk);
state.socket.on('audio_response_complete', handleAudioResponseComplete);
state.socket.on('processing_status', handleProcessingStatus);
}
// Setup event listeners
@@ -294,12 +314,8 @@ function stopStreaming(notifyServer = true) {
function handleAudioProcess(event) {
const inputData = event.inputBuffer.getChannelData(0);
// Log audio buffer statistics
console.log(`Audio buffer: length=${inputData.length}, sample rate=${state.audioContext.sampleRate}Hz`);
// Calculate audio energy (volume level)
const energy = calculateAudioEnergy(inputData);
console.log(`Energy: ${energy.toFixed(6)}, threshold: ${state.silenceThreshold}`);
// Update energy window for averaging
updateEnergyWindow(energy);
@@ -309,7 +325,11 @@ function handleAudioProcess(event) {
// Determine if audio is silent
const isSilent = avgEnergy < state.silenceThreshold;
console.log(`Silent: ${isSilent ? 'Yes' : 'No'}, avg energy: ${avgEnergy.toFixed(6)}`);
// Debug logging only if significant changes in audio patterns
if (Math.random() < 0.05) { // Log only 5% of frames to avoid console spam
console.log(`Audio: len=${inputData.length}, energy=${energy.toFixed(4)}, avg=${avgEnergy.toFixed(4)}, silent=${isSilent}`);
}
// Handle speech state based on silence
handleSpeechState(isSilent);
@@ -319,7 +339,6 @@ function handleAudioProcess(event) {
// Create a resampled version at 24kHz for the server
// Most WebRTC audio is 48kHz, but we want 24kHz for the model
const resampledData = downsampleBuffer(inputData, state.audioContext.sampleRate, 24000);
console.log(`Resampled audio: ${state.audioContext.sampleRate}Hz → 24000Hz, new length: ${resampledData.length}`);
// Send the audio chunk to the server
sendAudioChunk(resampledData, state.currentSpeaker);
@@ -847,6 +866,206 @@ function downsampleBuffer(buffer, originalSampleRate, targetSampleRate) {
return result;
}
// Handle processing status updates
function handleProcessingStatus(data) {
console.log('Processing status update:', data);
// Show processing status in UI
if (data.status === 'generating_audio') {
elements.streamButton.innerHTML = '<i class="fas fa-cog fa-spin"></i> Processing...';
elements.streamButton.classList.add('processing');
elements.streamButton.classList.remove('recording');
// Show message to user
addSystemMessage(data.message || 'Processing your request...');
}
}
// Handle the start of an audio streaming response
function handleAudioResponseStart(data) {
console.log('Audio response starting:', data);
// Reset streaming audio state
streamingAudio.chunks = [];
streamingAudio.totalChunks = data.total_chunks;
streamingAudio.receivedChunks = 0;
streamingAudio.text = data.text;
streamingAudio.complete = false;
// Create message container now, so we can update it as chunks arrive
const messageElement = document.createElement('div');
messageElement.className = 'message ai processing';
// Add text content if available
if (data.text) {
const textElement = document.createElement('p');
textElement.textContent = data.text;
messageElement.appendChild(textElement);
}
// Create audio element (will be populated as chunks arrive)
const audioElement = document.createElement('audio');
audioElement.controls = true;
audioElement.className = 'audio-player';
audioElement.textContent = 'Audio is being generated...';
messageElement.appendChild(audioElement);
// Add timestamp
const timeElement = document.createElement('span');
timeElement.className = 'message-time';
timeElement.textContent = new Date().toLocaleTimeString();
messageElement.appendChild(timeElement);
// Add loading indicator
const loadingElement = document.createElement('div');
loadingElement.className = 'loading-indicator';
loadingElement.innerHTML = '<div class="loading-spinner"></div><span>Generating audio response...</span>';
messageElement.appendChild(loadingElement);
// Add to conversation
elements.conversation.appendChild(messageElement);
// Auto-scroll to bottom
elements.conversation.scrollTop = elements.conversation.scrollHeight;
// Store elements for later updates
streamingAudio.messageElement = messageElement;
streamingAudio.audioElement = audioElement;
}
// Handle an incoming audio chunk
function handleAudioResponseChunk(data) {
console.log(`Received audio chunk ${data.chunk_index + 1}/${data.total_chunks}`);
// Store the chunk
streamingAudio.chunks[data.chunk_index] = data.audio;
streamingAudio.receivedChunks++;
// Update progress in the UI
if (streamingAudio.messageElement) {
const loadingElement = streamingAudio.messageElement.querySelector('.loading-indicator span');
if (loadingElement) {
loadingElement.textContent = `Generating audio response... ${Math.round((streamingAudio.receivedChunks / data.total_chunks) * 100)}%`;
}
}
// If this is the first chunk, start playing it immediately for faster response
if (data.chunk_index === 0 && streamingAudio.audioElement && elements.autoPlayResponses && elements.autoPlayResponses.checked) {
try {
streamingAudio.audioElement.src = data.audio;
streamingAudio.audioElement.play().catch(err => console.warn('Auto-play failed:', err));
} catch (e) {
console.error('Error playing first chunk:', e);
}
}
// If this is the last chunk or we've received all chunks, finalize the audio
if (data.is_last || streamingAudio.receivedChunks >= data.total_chunks) {
finalizeStreamingAudio();
}
}
// Handle completion of audio streaming
function handleAudioResponseComplete(data) {
console.log('Audio response complete:', data);
streamingAudio.complete = true;
// Make sure we finalize the audio even if some chunks were missed
finalizeStreamingAudio();
// Update UI to normal state
if (state.isStreaming) {
elements.streamButton.innerHTML = '<i class="fas fa-microphone"></i> Listening...';
elements.streamButton.classList.add('recording');
elements.streamButton.classList.remove('processing');
}
}
// Finalize streaming audio by combining chunks and updating the UI
function finalizeStreamingAudio() {
if (!streamingAudio.messageElement || streamingAudio.chunks.length === 0) {
return;
}
try {
// For more sophisticated audio streaming, you would need to properly concatenate
// the WAV files, but for now we'll use the last chunk as the complete audio
// since it should contain the entire response due to how the server is implementing it
const lastChunkIndex = streamingAudio.chunks.length - 1;
const audioData = streamingAudio.chunks[lastChunkIndex] || streamingAudio.chunks[0];
// Update the audio element with the complete audio
if (streamingAudio.audioElement) {
streamingAudio.audioElement.src = audioData;
// Auto-play if enabled and not already playing
if (elements.autoPlayResponses && elements.autoPlayResponses.checked &&
streamingAudio.audioElement.paused) {
streamingAudio.audioElement.play()
.catch(err => {
console.warn('Auto-play failed:', err);
addSystemMessage('Auto-play failed. Please click play to hear the response.');
});
}
}
// Remove loading indicator and processing class
if (streamingAudio.messageElement) {
const loadingElement = streamingAudio.messageElement.querySelector('.loading-indicator');
if (loadingElement) {
streamingAudio.messageElement.removeChild(loadingElement);
}
streamingAudio.messageElement.classList.remove('processing');
}
console.log('Audio response finalized and ready for playback');
} catch (e) {
console.error('Error finalizing streaming audio:', e);
}
// Reset streaming audio state
streamingAudio.chunks = [];
streamingAudio.totalChunks = 0;
streamingAudio.receivedChunks = 0;
streamingAudio.messageElement = null;
streamingAudio.audioElement = null;
}
// Add CSS styles for new UI elements
document.addEventListener('DOMContentLoaded', function() {
// Add styles for processing state
const style = document.createElement('style');
style.textContent = `
.message.processing {
opacity: 0.8;
}
.loading-indicator {
display: flex;
align-items: center;
margin-top: 8px;
font-size: 0.9em;
color: #666;
}
.loading-spinner {
width: 16px;
height: 16px;
border: 2px solid #ddd;
border-top: 2px solid var(--primary-color);
border-radius: 50%;
margin-right: 8px;
animation: spin 1s linear infinite;
}
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
`;
document.head.appendChild(style);
});
// Initialize the application when DOM is fully loaded
document.addEventListener('DOMContentLoaded', initializeApp);

View File

@@ -69,6 +69,25 @@ function CallPage() {
audio.play();
};
const handleEmergency = async () => {
// send texts
const response = await fetch("/api/sendMessage", {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({
message: `yo i need help`,
}),
});
if (!response.ok) {
console.error("Error sending message:", response.statusText);
return;
}
}
return (
<div className="grid grid-rows-[20px_1fr_20px] items-center justify-items-center min-h-screen p-8 pb-20 gap-16 sm:p-20 font-[family-name:var(--font-geist-sans)]">
<main className="flex flex-col gap-[32px] row-start-2 items-center sm:items-start">
@@ -94,7 +113,7 @@ function CallPage() {
</>
)}
<button className="bg-red-500 text-white rounded-md p-2">Emergency</button>
<button onClick={handleEmergency} className="bg-red-500 text-white rounded-md p-2">Emergency</button>
<button className="bg-blue-500 text-white rounded-md p-2"
onClick={() => {
window.location.href = '/';

View File

@@ -1,8 +1,14 @@
"use client"; "use client";
import { useState } from "react"; import { useState } from "react";
import { auth0 } from "../lib/auth0"; import { auth0 } from "../lib/auth0";
import { NextApiRequest, NextApiResponse } from "next";
export default async function Home() {
const [contacts, setContacts] = useState<string[]>([]);
const [codeword, setCodeword] = useState("");
@@ -10,23 +16,6 @@ export default async function Home() {
console.log("Session:", session?.user); console.log("Session:", session?.user);
const handleEmergency = async () => {
// send texts
const response = await fetch("/api/sendMessage", {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({
message: `yo i need help`,
}),
});
if (!response.ok) {
console.error("Error sending message:", response.statusText);
return;
}
}
// If no session, show sign-up and login buttons
if (!session) {