merged call page

idler-wheel
2025-03-30 07:38:22 -04:00
4 changed files with 337 additions and 44 deletions

View File

@@ -93,15 +93,49 @@ def load_speech_models():
# Load Whisper model for speech recognition
try:
logger.info(f"Loading speech recognition model on {device}...")
speech_recognizer = pipeline("automatic-speech-recognition",
model="openai/whisper-small",
device=device)
# Try with newer API first
try:
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
model_id = "openai/whisper-small"
# Load model and processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(
model_id,
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
device_map=device,
)
processor = AutoProcessor.from_pretrained(model_id)
# Create pipeline with specific parameters
speech_recognizer = pipeline(
"automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
max_new_tokens=128,
chunk_length_s=30,
batch_size=16,
device=device,
)
except Exception as api_error:
logger.warning(f"Newer API loading failed: {api_error}, trying simpler approach")
# Fallback to simpler API
speech_recognizer = pipeline(
"automatic-speech-recognition",
model="openai/whisper-small",
device=device
)
logger.info("Speech recognition model loaded successfully") logger.info("Speech recognition model loaded successfully")
return generator, speech_recognizer
except Exception as e: except Exception as e:
logger.error(f"Error loading speech recognition model: {e}") logger.error(f"Error loading speech recognition model: {e}")
speech_recognizer = None return generator, None
return generator, speech_recognizer
# Unpack both models # Unpack both models
generator, speech_recognizer = load_speech_models() generator, speech_recognizer = load_speech_models()
@@ -308,9 +342,28 @@ def process_speech(audio_tensor: torch.Tensor, client_id: str) -> str:
temp_path = os.path.join(base_dir, f"temp_{time.time()}.wav")
torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate)
# Perform speech recognition
result = speech_recognizer(temp_path)
transcription = result["text"]
# Perform speech recognition - handle the warning differently
# Just pass the path without any additional parameters
try:
# First try - use default parameters
result = speech_recognizer(temp_path)
transcription = result["text"]
except Exception as whisper_error:
logger.warning(f"First transcription attempt failed: {whisper_error}")
# Try with explicit parameters for older versions of transformers
import numpy as np
import soundfile as sf
# Load audio as numpy array
audio_np, sr = sf.read(temp_path)
if sr != 16000:
# Whisper expects 16kHz audio
from scipy import signal
audio_np = signal.resample(audio_np, int(len(audio_np) * 16000 / sr))
# Try with numpy array directly
result = speech_recognizer(audio_np)
transcription = result["text"]
# Clean up temp file
if os.path.exists(temp_path):
@@ -320,6 +373,7 @@ def process_speech(audio_tensor: torch.Tensor, client_id: str) -> str:
if not transcription or transcription.isspace():
return "I didn't detect any speech. Could you please try again?"
logger.info(f"Transcription successful: '{transcription}'")
return transcription
except Exception as e:
@@ -650,7 +704,7 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=Fals
# Combine audio chunks
full_audio = torch.cat(client['streaming_buffer'], dim=0)
# Process audio to generate a response (no speech recognition)
# Process audio to generate a response (using speech recognition)
generated_text = process_speech(full_audio, client_id)
# Add suffix for incomplete utterances
@@ -706,16 +760,28 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=Fals
)
client['context_segments'].append(ai_segment)
# Convert audio to base64 and send back to client
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'text': response_text,
'audio': audio_base64
}, room=client_id)
logger.info(f"[{client_id[:8]}] Audio response sent")
# CHANGE HERE: Use the streaming function instead of sending all at once
# Check if the audio is short enough to send at once or if it should be streamed
if audio_tensor.size(0) < generator.sample_rate * 2: # Less than 2 seconds
# For short responses, just send in one go for better responsiveness
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'text': response_text,
'audio': audio_base64
}, room=client_id)
logger.info(f"[{client_id[:8]}] Short audio response sent in one piece")
else:
# For longer responses, use streaming
logger.info(f"[{client_id[:8]}] Using streaming for audio response")
# Start a new thread for streaming to avoid blocking the main thread
import threading
stream_thread = threading.Thread(
target=stream_audio_to_client,
args=(client_id, audio_tensor, response_text, ai_speaker_id)
)
stream_thread.start()
except Exception as e:
logger.error(f"Error generating audio response: {e}")
emit('error', {
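The streaming branch above hands the work to a `stream_audio_to_client` helper that this hunk does not show. The sketch below is not part of the commit; it is a minimal guess at its shape, assuming a module-level Flask-SocketIO instance named `socketio` (used because the helper runs in a background thread, where the request-bound `emit` is unavailable) and the existing `encode_audio_data` helper. Chunks are sent cumulatively, so the final chunk carries the whole response, which is the behaviour the client-side `finalizeStreamingAudio()` relies on.

def stream_audio_to_client(client_id, audio_tensor, response_text, speaker_id, chunk_seconds=2):
    """Hypothetical sketch: emit a long audio response to one client in chunks.
    `speaker_id` is accepted to match the call site but unused here."""
    chunk_samples = generator.sample_rate * chunk_seconds
    total_chunks = (audio_tensor.size(0) + chunk_samples - 1) // chunk_samples

    # Tell the client how many chunks to expect and which text accompanies them
    socketio.emit('audio_response_start', {
        'text': response_text,
        'total_chunks': total_chunks,
    }, room=client_id)

    for i in range(total_chunks):
        end = min((i + 1) * chunk_samples, audio_tensor.size(0))
        # Cumulative slice: chunk i contains everything up to this point, so the
        # last chunk is the complete response the client keeps for playback.
        chunk_b64 = encode_audio_data(audio_tensor[:end])
        socketio.emit('audio_response_chunk', {
            'chunk_index': i,
            'total_chunks': total_chunks,
            'audio': chunk_b64,
            'is_last': i == total_chunks - 1,
        }, room=client_id)

    socketio.emit('audio_response_complete', {'total_chunks': total_chunks}, room=client_id)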

View File

@@ -50,6 +50,20 @@ let canvasContext = null;
let visualizerBufferLength = 0;
let visualizerDataArray = null;
// New state variables to track incremental audio streaming
const streamingAudio = {
messageElement: null,
audioElement: null,
chunks: [],
totalChunks: 0,
receivedChunks: 0,
text: '',
mediaSource: null,
sourceBuffer: null,
audioContext: null,
complete: false
};
// Initialize the application
function initializeApp() {
// Initialize the UI elements
@@ -116,6 +130,12 @@ function setupSocketConnection() {
state.socket.on('transcription', handleTranscription);
state.socket.on('context_updated', handleContextUpdate);
state.socket.on('streaming_status', handleStreamingStatus);
// New event handlers for incremental audio streaming
state.socket.on('audio_response_start', handleAudioResponseStart);
state.socket.on('audio_response_chunk', handleAudioResponseChunk);
state.socket.on('audio_response_complete', handleAudioResponseComplete);
state.socket.on('processing_status', handleProcessingStatus);
}
// Setup event listeners
@@ -294,12 +314,8 @@ function stopStreaming(notifyServer = true) {
function handleAudioProcess(event) {
const inputData = event.inputBuffer.getChannelData(0);
// Log audio buffer statistics
console.log(`Audio buffer: length=${inputData.length}, sample rate=${state.audioContext.sampleRate}Hz`);
// Calculate audio energy (volume level)
const energy = calculateAudioEnergy(inputData);
console.log(`Energy: ${energy.toFixed(6)}, threshold: ${state.silenceThreshold}`);
// Update energy window for averaging
updateEnergyWindow(energy);
@@ -309,7 +325,11 @@ function handleAudioProcess(event) {
// Determine if audio is silent
const isSilent = avgEnergy < state.silenceThreshold;
console.log(`Silent: ${isSilent ? 'Yes' : 'No'}, avg energy: ${avgEnergy.toFixed(6)}`);
// Debug logging only if significant changes in audio patterns
if (Math.random() < 0.05) { // Log only 5% of frames to avoid console spam
console.log(`Audio: len=${inputData.length}, energy=${energy.toFixed(4)}, avg=${avgEnergy.toFixed(4)}, silent=${isSilent}`);
}
// Handle speech state based on silence
handleSpeechState(isSilent);
@@ -319,7 +339,6 @@ function handleAudioProcess(event) {
// Create a resampled version at 24kHz for the server
// Most WebRTC audio is 48kHz, but we want 24kHz for the model
const resampledData = downsampleBuffer(inputData, state.audioContext.sampleRate, 24000);
console.log(`Resampled audio: ${state.audioContext.sampleRate}Hz → 24000Hz, new length: ${resampledData.length}`);
// Send the audio chunk to the server
sendAudioChunk(resampledData, state.currentSpeaker);
@@ -847,6 +866,206 @@ function downsampleBuffer(buffer, originalSampleRate, targetSampleRate) {
return result;
}
// Handle processing status updates
function handleProcessingStatus(data) {
console.log('Processing status update:', data);
// Show processing status in UI
if (data.status === 'generating_audio') {
elements.streamButton.innerHTML = '<i class="fas fa-cog fa-spin"></i> Processing...';
elements.streamButton.classList.add('processing');
elements.streamButton.classList.remove('recording');
// Show message to user
addSystemMessage(data.message || 'Processing your request...');
}
}
// Handle the start of an audio streaming response
function handleAudioResponseStart(data) {
console.log('Audio response starting:', data);
// Reset streaming audio state
streamingAudio.chunks = [];
streamingAudio.totalChunks = data.total_chunks;
streamingAudio.receivedChunks = 0;
streamingAudio.text = data.text;
streamingAudio.complete = false;
// Create message container now, so we can update it as chunks arrive
const messageElement = document.createElement('div');
messageElement.className = 'message ai processing';
// Add text content if available
if (data.text) {
const textElement = document.createElement('p');
textElement.textContent = data.text;
messageElement.appendChild(textElement);
}
// Create audio element (will be populated as chunks arrive)
const audioElement = document.createElement('audio');
audioElement.controls = true;
audioElement.className = 'audio-player';
audioElement.textContent = 'Audio is being generated...';
messageElement.appendChild(audioElement);
// Add timestamp
const timeElement = document.createElement('span');
timeElement.className = 'message-time';
timeElement.textContent = new Date().toLocaleTimeString();
messageElement.appendChild(timeElement);
// Add loading indicator
const loadingElement = document.createElement('div');
loadingElement.className = 'loading-indicator';
loadingElement.innerHTML = '<div class="loading-spinner"></div><span>Generating audio response...</span>';
messageElement.appendChild(loadingElement);
// Add to conversation
elements.conversation.appendChild(messageElement);
// Auto-scroll to bottom
elements.conversation.scrollTop = elements.conversation.scrollHeight;
// Store elements for later updates
streamingAudio.messageElement = messageElement;
streamingAudio.audioElement = audioElement;
}
// Handle an incoming audio chunk
function handleAudioResponseChunk(data) {
console.log(`Received audio chunk ${data.chunk_index + 1}/${data.total_chunks}`);
// Store the chunk
streamingAudio.chunks[data.chunk_index] = data.audio;
streamingAudio.receivedChunks++;
// Update progress in the UI
if (streamingAudio.messageElement) {
const loadingElement = streamingAudio.messageElement.querySelector('.loading-indicator span');
if (loadingElement) {
loadingElement.textContent = `Generating audio response... ${Math.round((streamingAudio.receivedChunks / data.total_chunks) * 100)}%`;
}
}
// If this is the first chunk, start playing it immediately for faster response
if (data.chunk_index === 0 && streamingAudio.audioElement && elements.autoPlayResponses && elements.autoPlayResponses.checked) {
try {
streamingAudio.audioElement.src = data.audio;
streamingAudio.audioElement.play().catch(err => console.warn('Auto-play failed:', err));
} catch (e) {
console.error('Error playing first chunk:', e);
}
}
// If this is the last chunk or we've received all chunks, finalize the audio
if (data.is_last || streamingAudio.receivedChunks >= data.total_chunks) {
finalizeStreamingAudio();
}
}
// Handle completion of audio streaming
function handleAudioResponseComplete(data) {
console.log('Audio response complete:', data);
streamingAudio.complete = true;
// Make sure we finalize the audio even if some chunks were missed
finalizeStreamingAudio();
// Update UI to normal state
if (state.isStreaming) {
elements.streamButton.innerHTML = '<i class="fas fa-microphone"></i> Listening...';
elements.streamButton.classList.add('recording');
elements.streamButton.classList.remove('processing');
}
}
// Finalize streaming audio by combining chunks and updating the UI
function finalizeStreamingAudio() {
if (!streamingAudio.messageElement || streamingAudio.chunks.length === 0) {
return;
}
try {
// For more sophisticated audio streaming, you would need to properly concatenate
// the WAV files, but for now we'll use the last chunk as the complete audio
// since it should contain the entire response due to how the server is implementing it
const lastChunkIndex = streamingAudio.chunks.length - 1;
const audioData = streamingAudio.chunks[lastChunkIndex] || streamingAudio.chunks[0];
// Update the audio element with the complete audio
if (streamingAudio.audioElement) {
streamingAudio.audioElement.src = audioData;
// Auto-play if enabled and not already playing
if (elements.autoPlayResponses && elements.autoPlayResponses.checked &&
streamingAudio.audioElement.paused) {
streamingAudio.audioElement.play()
.catch(err => {
console.warn('Auto-play failed:', err);
addSystemMessage('Auto-play failed. Please click play to hear the response.');
});
}
}
// Remove loading indicator and processing class
if (streamingAudio.messageElement) {
const loadingElement = streamingAudio.messageElement.querySelector('.loading-indicator');
if (loadingElement) {
streamingAudio.messageElement.removeChild(loadingElement);
}
streamingAudio.messageElement.classList.remove('processing');
}
console.log('Audio response finalized and ready for playback');
} catch (e) {
console.error('Error finalizing streaming audio:', e);
}
// Reset streaming audio state
streamingAudio.chunks = [];
streamingAudio.totalChunks = 0;
streamingAudio.receivedChunks = 0;
streamingAudio.messageElement = null;
streamingAudio.audioElement = null;
}
// Add CSS styles for new UI elements
document.addEventListener('DOMContentLoaded', function() {
// Add styles for processing state
const style = document.createElement('style');
style.textContent = `
.message.processing {
opacity: 0.8;
}
.loading-indicator {
display: flex;
align-items: center;
margin-top: 8px;
font-size: 0.9em;
color: #666;
}
.loading-spinner {
width: 16px;
height: 16px;
border: 2px solid #ddd;
border-top: 2px solid var(--primary-color);
border-radius: 50%;
margin-right: 8px;
animation: spin 1s linear infinite;
}
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
`;
document.head.appendChild(style);
});
// Initialize the application when DOM is fully loaded
document.addEventListener('DOMContentLoaded', initializeApp);

View File

@@ -69,6 +69,25 @@ function CallPage() {
audio.play();
};
const handleEmergency = async () => {
// send texts
const response = await fetch("/api/sendMessage", {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({
message: `yo i need help`,
}),
});
if (!response.ok) {
console.error("Error sending message:", response.statusText);
return;
}
}
return (
<div className="grid grid-rows-[20px_1fr_20px] items-center justify-items-center min-h-screen p-8 pb-20 gap-16 sm:p-20 font-[family-name:var(--font-geist-sans)]">
<main className="flex flex-col gap-[32px] row-start-2 items-center sm:items-start">
@@ -94,7 +113,7 @@ function CallPage() {
</>
)}
<button className="bg-red-500 text-white rounded-md p-2">Emergency</button>
<button onClick={handleEmergency} className="bg-red-500 text-white rounded-md p-2">Emergency</button>
<button className="bg-blue-500 text-white rounded-md p-2"
onClick={() => {
window.location.href = '/';

View File

@@ -1,8 +1,14 @@
"use client"; "use client";
import { useState } from "react"; import { useState } from "react";
import { auth0 } from "../lib/auth0"; import { auth0 } from "../lib/auth0";
import { NextApiRequest, NextApiResponse } from "next";
export default async function Home() {
const [contacts, setContacts] = useState<string[]>([]);
const [codeword, setCodeword] = useState("");
@@ -10,23 +16,6 @@ export default async function Home() {
console.log("Session:", session?.user); console.log("Session:", session?.user);
const handleEmergency = async () => {
// send texts
const response = await fetch("/api/sendMessage", {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({
message: `yo i need help`,
}),
});
if (!response.ok) {
console.error("Error sending message:", response.statusText);
return;
}
}
// If no session, show sign-up and login buttons
if (!session) {