Demo Update 9
@@ -308,8 +308,8 @@ def process_speech(audio_tensor: torch.Tensor, client_id: str) -> str:
     temp_path = os.path.join(base_dir, f"temp_{time.time()}.wav")
     torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate)
 
-    # Perform speech recognition
-    result = speech_recognizer(temp_path)
+    # Perform speech recognition - using input_features instead of inputs
+    result = speech_recognizer(temp_path, input_features=None) # input_features=None forces use of the correct parameter name
     transcription = result["text"]
 
     # Clean up temp file
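The hunk above changes how `process_speech` calls the transformers ASR pipeline. A minimal sketch of the same save-and-transcribe flow in plain Python follows; the pipeline construction and the Whisper model name are assumptions, since the diff only shows the call site. A pipeline invoked with a file path performs its own loading and feature extraction, which usually sidesteps the `inputs` vs `input_features` keyword rename that the commit comment refers to; whether the explicit `input_features=None` workaround is needed depends on the installed transformers version.

import os
import time

import torch
import torchaudio
from transformers import pipeline

# Assumed construction; the diff only shows the call site.
speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-base")

def transcribe(audio_tensor: torch.Tensor, base_dir: str, sample_rate: int) -> str:
    """Write a mono tensor to a temp WAV, transcribe it, and clean up."""
    temp_path = os.path.join(base_dir, f"temp_{time.time()}.wav")
    torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), sample_rate)
    try:
        # Calling the pipeline with a file path lets it do its own loading and
        # feature extraction, avoiding the inputs/input_features keyword issue.
        result = speech_recognizer(temp_path)
        return result["text"]
    finally:
        os.remove(temp_path)  # remove the temp file even if recognition fails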
@@ -650,7 +650,7 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=False):
     # Combine audio chunks
     full_audio = torch.cat(client['streaming_buffer'], dim=0)
 
-    # Process audio to generate a response (no speech recognition)
+    # Process audio to generate a response (using speech recognition)
     generated_text = process_speech(full_audio, client_id)
 
     # Add suffix for incomplete utterances
@@ -706,15 +706,27 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=False):
         )
         client['context_segments'].append(ai_segment)
 
-        # Convert audio to base64 and send back to client
-        audio_base64 = encode_audio_data(audio_tensor)
-        emit('audio_response', {
-            'type': 'audio_response',
-            'text': response_text,
-            'audio': audio_base64
-        }, room=client_id)
-
-        logger.info(f"[{client_id[:8]}] Audio response sent")
+        # CHANGE HERE: Use the streaming function instead of sending all at once
+        # Check if the audio is short enough to send at once or if it should be streamed
+        if audio_tensor.size(0) < generator.sample_rate * 2: # Less than 2 seconds
+            # For short responses, just send in one go for better responsiveness
+            audio_base64 = encode_audio_data(audio_tensor)
+            emit('audio_response', {
+                'type': 'audio_response',
+                'text': response_text,
+                'audio': audio_base64
+            }, room=client_id)
+            logger.info(f"[{client_id[:8]}] Short audio response sent in one piece")
+        else:
+            # For longer responses, use streaming
+            logger.info(f"[{client_id[:8]}] Using streaming for audio response")
+            # Start a new thread for streaming to avoid blocking the main thread
+            import threading
+            stream_thread = threading.Thread(
+                target=stream_audio_to_client,
+                args=(client_id, audio_tensor, response_text, ai_speaker_id)
+            )
+            stream_thread.start()
 
     except Exception as e:
         logger.error(f"Error generating audio response: {e}")
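The `else` branch above hands the tensor to `stream_audio_to_client` on a background thread, but that function's body is not part of this diff. The following is a hypothetical sketch of its likely shape, inferred from the client handlers added further down: it announces the stream, emits numbered chunks, and signals completion. The chunk size and the module-level `socketio` server object are assumptions; `socketio.emit` (rather than the request-scoped `emit`) is used because the function runs outside a request context. Sending cumulative audio is inferred from the client-side comment that the last chunk "should contain the entire response".

import torch

# `generator`, `encode_audio_data`, and the Flask-SocketIO server object
# `socketio` come from the surrounding app and are assumptions here.
CHUNK_SECONDS = 1.0  # assumed chunk size; the real value is not in the diff

def stream_audio_to_client(client_id, audio_tensor, response_text, ai_speaker_id):
    chunk_len = int(generator.sample_rate * CHUNK_SECONDS)
    chunks = torch.split(audio_tensor, chunk_len)

    # Announce the stream so the client can build its message UI up front.
    socketio.emit('audio_response_start', {
        'text': response_text,
        'total_chunks': len(chunks),
    }, room=client_id)

    sent = audio_tensor[:0]  # empty tensor with matching dtype/device
    for i, chunk in enumerate(chunks):
        # Send cumulative audio: the client keeps only the last chunk it
        # received, so each payload must contain the entire response so far.
        sent = torch.cat([sent, chunk], dim=0)
        socketio.emit('audio_response_chunk', {
            'chunk_index': i,
            'total_chunks': len(chunks),
            'audio': encode_audio_data(sent),
            'is_last': i == len(chunks) - 1,
        }, room=client_id)

    socketio.emit('audio_response_complete', {'text': response_text}, room=client_id)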
@@ -50,6 +50,20 @@ let canvasContext = null;
 let visualizerBufferLength = 0;
 let visualizerDataArray = null;
 
+// New state variables to track incremental audio streaming
+const streamingAudio = {
+    messageElement: null,
+    audioElement: null,
+    chunks: [],
+    totalChunks: 0,
+    receivedChunks: 0,
+    text: '',
+    mediaSource: null,
+    sourceBuffer: null,
+    audioContext: null,
+    complete: false
+};
+
 // Initialize the application
 function initializeApp() {
     // Initialize the UI elements
@@ -116,6 +130,12 @@ function setupSocketConnection() {
     state.socket.on('transcription', handleTranscription);
     state.socket.on('context_updated', handleContextUpdate);
     state.socket.on('streaming_status', handleStreamingStatus);
+
+    // New event handlers for incremental audio streaming
+    state.socket.on('audio_response_start', handleAudioResponseStart);
+    state.socket.on('audio_response_chunk', handleAudioResponseChunk);
+    state.socket.on('audio_response_complete', handleAudioResponseComplete);
+    state.socket.on('processing_status', handleProcessingStatus);
 }
 
 // Setup event listeners
@@ -294,12 +314,8 @@ function stopStreaming(notifyServer = true) {
 function handleAudioProcess(event) {
     const inputData = event.inputBuffer.getChannelData(0);
 
-    // Log audio buffer statistics
-    console.log(`Audio buffer: length=${inputData.length}, sample rate=${state.audioContext.sampleRate}Hz`);
-
     // Calculate audio energy (volume level)
     const energy = calculateAudioEnergy(inputData);
-    console.log(`Energy: ${energy.toFixed(6)}, threshold: ${state.silenceThreshold}`);
 
     // Update energy window for averaging
     updateEnergyWindow(energy);
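The silence gate these log lines were cluttering compares a moving average of frame energy against a threshold; `calculateAudioEnergy` and `updateEnergyWindow` are defined elsewhere in the file, so their exact definitions are not shown here. A compact Python sketch of the same energy-plus-moving-average scheme, with illustrative parameter values and mean squared amplitude as one common energy definition:

from collections import deque

import numpy as np

SILENCE_THRESHOLD = 0.01   # illustrative; plays the role of state.silenceThreshold
WINDOW_FRAMES = 10         # illustrative moving-average window

energy_window = deque(maxlen=WINDOW_FRAMES)

def frame_energy(samples: np.ndarray) -> float:
    """Mean squared amplitude of one audio frame."""
    return float(np.mean(samples ** 2))

def is_silent(samples: np.ndarray) -> bool:
    """Push this frame's energy and compare the running average to the gate."""
    energy_window.append(frame_energy(samples))
    avg_energy = sum(energy_window) / len(energy_window)
    return avg_energy < SILENCE_THRESHOLD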
@@ -309,7 +325,11 @@ function handleAudioProcess(event) {
 
     // Determine if audio is silent
     const isSilent = avgEnergy < state.silenceThreshold;
-    console.log(`Silent: ${isSilent ? 'Yes' : 'No'}, avg energy: ${avgEnergy.toFixed(6)}`);
+
+    // Debug logging only if significant changes in audio patterns
+    if (Math.random() < 0.05) { // Log only 5% of frames to avoid console spam
+        console.log(`Audio: len=${inputData.length}, energy=${energy.toFixed(4)}, avg=${avgEnergy.toFixed(4)}, silent=${isSilent}`);
+    }
 
     // Handle speech state based on silence
     handleSpeechState(isSilent);
@@ -319,7 +339,6 @@ function handleAudioProcess(event) {
     // Create a resampled version at 24kHz for the server
     // Most WebRTC audio is 48kHz, but we want 24kHz for the model
    const resampledData = downsampleBuffer(inputData, state.audioContext.sampleRate, 24000);
-    console.log(`Resampled audio: ${state.audioContext.sampleRate}Hz → 24000Hz, new length: ${resampledData.length}`);
 
     // Send the audio chunk to the server
     sendAudioChunk(resampledData, state.currentSpeaker);
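`downsampleBuffer` itself is outside this hunk: the capture runs at the browser's native rate (typically 48 kHz) and is reduced to the model's 24 kHz before upload. The Python sketch below shows the linear-interpolation approach such a helper commonly takes; it illustrates the technique and is not a transcription of the browser code.

import numpy as np

def downsample_buffer(buffer: np.ndarray, original_rate: int, target_rate: int) -> np.ndarray:
    """Linear-interpolation downsampling of a mono float buffer."""
    if target_rate >= original_rate:
        return buffer  # nothing to do; upsampling is out of scope here
    ratio = original_rate / target_rate
    new_length = int(len(buffer) / ratio)
    # Fractional positions in the source buffer for each output sample.
    positions = np.arange(new_length) * ratio
    return np.interp(positions, np.arange(len(buffer)), buffer).astype(np.float32)

# A 1024-sample block captured at 48 kHz becomes 512 samples at 24 kHz.
block = np.random.randn(1024).astype(np.float32)
assert downsample_buffer(block, 48000, 24000).shape == (512,)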
@@ -847,6 +866,206 @@ function downsampleBuffer(buffer, originalSampleRate, targetSampleRate) {
     return result;
 }
 
+// Handle processing status updates
+function handleProcessingStatus(data) {
+    console.log('Processing status update:', data);
+
+    // Show processing status in UI
+    if (data.status === 'generating_audio') {
+        elements.streamButton.innerHTML = '<i class="fas fa-cog fa-spin"></i> Processing...';
+        elements.streamButton.classList.add('processing');
+        elements.streamButton.classList.remove('recording');
+
+        // Show message to user
+        addSystemMessage(data.message || 'Processing your request...');
+    }
+}
+
+// Handle the start of an audio streaming response
+function handleAudioResponseStart(data) {
+    console.log('Audio response starting:', data);
+
+    // Reset streaming audio state
+    streamingAudio.chunks = [];
+    streamingAudio.totalChunks = data.total_chunks;
+    streamingAudio.receivedChunks = 0;
+    streamingAudio.text = data.text;
+    streamingAudio.complete = false;
+
+    // Create message container now, so we can update it as chunks arrive
+    const messageElement = document.createElement('div');
+    messageElement.className = 'message ai processing';
+
+    // Add text content if available
+    if (data.text) {
+        const textElement = document.createElement('p');
+        textElement.textContent = data.text;
+        messageElement.appendChild(textElement);
+    }
+
+    // Create audio element (will be populated as chunks arrive)
+    const audioElement = document.createElement('audio');
+    audioElement.controls = true;
+    audioElement.className = 'audio-player';
+    audioElement.textContent = 'Audio is being generated...';
+    messageElement.appendChild(audioElement);
+
+    // Add timestamp
+    const timeElement = document.createElement('span');
+    timeElement.className = 'message-time';
+    timeElement.textContent = new Date().toLocaleTimeString();
+    messageElement.appendChild(timeElement);
+
+    // Add loading indicator
+    const loadingElement = document.createElement('div');
+    loadingElement.className = 'loading-indicator';
+    loadingElement.innerHTML = '<div class="loading-spinner"></div><span>Generating audio response...</span>';
+    messageElement.appendChild(loadingElement);
+
+    // Add to conversation
+    elements.conversation.appendChild(messageElement);
+
+    // Auto-scroll to bottom
+    elements.conversation.scrollTop = elements.conversation.scrollHeight;
+
+    // Store elements for later updates
+    streamingAudio.messageElement = messageElement;
+    streamingAudio.audioElement = audioElement;
+}
+
+// Handle an incoming audio chunk
+function handleAudioResponseChunk(data) {
+    console.log(`Received audio chunk ${data.chunk_index + 1}/${data.total_chunks}`);
+
+    // Store the chunk
+    streamingAudio.chunks[data.chunk_index] = data.audio;
+    streamingAudio.receivedChunks++;
+
+    // Update progress in the UI
+    if (streamingAudio.messageElement) {
+        const loadingElement = streamingAudio.messageElement.querySelector('.loading-indicator span');
+        if (loadingElement) {
+            loadingElement.textContent = `Generating audio response... ${Math.round((streamingAudio.receivedChunks / data.total_chunks) * 100)}%`;
+        }
+    }
+
+    // If this is the first chunk, start playing it immediately for faster response
+    if (data.chunk_index === 0 && streamingAudio.audioElement && elements.autoPlayResponses && elements.autoPlayResponses.checked) {
+        try {
+            streamingAudio.audioElement.src = data.audio;
+            streamingAudio.audioElement.play().catch(err => console.warn('Auto-play failed:', err));
+        } catch (e) {
+            console.error('Error playing first chunk:', e);
+        }
+    }
+
+    // If this is the last chunk or we've received all chunks, finalize the audio
+    if (data.is_last || streamingAudio.receivedChunks >= data.total_chunks) {
+        finalizeStreamingAudio();
+    }
+}
+
+// Handle completion of audio streaming
+function handleAudioResponseComplete(data) {
+    console.log('Audio response complete:', data);
+    streamingAudio.complete = true;
+
+    // Make sure we finalize the audio even if some chunks were missed
+    finalizeStreamingAudio();
+
+    // Update UI to normal state
+    if (state.isStreaming) {
+        elements.streamButton.innerHTML = '<i class="fas fa-microphone"></i> Listening...';
+        elements.streamButton.classList.add('recording');
+        elements.streamButton.classList.remove('processing');
+    }
+}
+
+// Finalize streaming audio by combining chunks and updating the UI
+function finalizeStreamingAudio() {
+    if (!streamingAudio.messageElement || streamingAudio.chunks.length === 0) {
+        return;
+    }
+
+    try {
+        // For more sophisticated audio streaming, you would need to properly concatenate
+        // the WAV files, but for now we'll use the last chunk as the complete audio
+        // since it should contain the entire response due to how the server is implementing it
+        const lastChunkIndex = streamingAudio.chunks.length - 1;
+        const audioData = streamingAudio.chunks[lastChunkIndex] || streamingAudio.chunks[0];
+
+        // Update the audio element with the complete audio
+        if (streamingAudio.audioElement) {
+            streamingAudio.audioElement.src = audioData;
+
+            // Auto-play if enabled and not already playing
+            if (elements.autoPlayResponses && elements.autoPlayResponses.checked &&
+                streamingAudio.audioElement.paused) {
+                streamingAudio.audioElement.play()
+                    .catch(err => {
+                        console.warn('Auto-play failed:', err);
+                        addSystemMessage('Auto-play failed. Please click play to hear the response.');
+                    });
+            }
+        }
+
+        // Remove loading indicator and processing class
+        if (streamingAudio.messageElement) {
+            const loadingElement = streamingAudio.messageElement.querySelector('.loading-indicator');
+            if (loadingElement) {
+                streamingAudio.messageElement.removeChild(loadingElement);
+            }
+            streamingAudio.messageElement.classList.remove('processing');
+        }
+
+        console.log('Audio response finalized and ready for playback');
+    } catch (e) {
+        console.error('Error finalizing streaming audio:', e);
+    }
+
+    // Reset streaming audio state
+    streamingAudio.chunks = [];
+    streamingAudio.totalChunks = 0;
+    streamingAudio.receivedChunks = 0;
+    streamingAudio.messageElement = null;
+    streamingAudio.audioElement = null;
+}
+
+// Add CSS styles for new UI elements
+document.addEventListener('DOMContentLoaded', function() {
+    // Add styles for processing state
+    const style = document.createElement('style');
+    style.textContent = `
+        .message.processing {
+            opacity: 0.8;
+        }
+
+        .loading-indicator {
+            display: flex;
+            align-items: center;
+            margin-top: 8px;
+            font-size: 0.9em;
+            color: #666;
+        }
+
+        .loading-spinner {
+            width: 16px;
+            height: 16px;
+            border: 2px solid #ddd;
+            border-top: 2px solid var(--primary-color);
+            border-radius: 50%;
+            margin-right: 8px;
+            animation: spin 1s linear infinite;
+        }
+
+        @keyframes spin {
+            0% { transform: rotate(0deg); }
+            100% { transform: rotate(360deg); }
+        }
+    `;
+    document.head.appendChild(style);
+});
+
 // Initialize the application when DOM is fully loaded
 document.addEventListener('DOMContentLoaded', initializeApp);
 
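As the comments in `finalizeStreamingAudio` admit, the client does not really merge chunks; it keeps the last (cumulative) one. If the server ever switched to sending disjoint chunks, the WAV payloads would need real concatenation: identical format parameters, per-file headers stripped, PCM frames joined. Below is a server-side sketch using the standard-library `wave` module, assuming each decoded chunk is a complete WAV blob with matching parameters:

import io
import wave

def concat_wav_bytes(wav_blobs: list) -> bytes:
    """Join complete WAV files into one by stripping per-file headers."""
    params = None
    frames = bytearray()
    for blob in wav_blobs:
        with wave.open(io.BytesIO(blob), 'rb') as w:
            if params is None:
                params = w.getparams()  # channels, sample width, rate, ...
            frames.extend(w.readframes(w.getnframes()))
    out = io.BytesIO()
    with wave.open(out, 'wb') as w:
        w.setparams(params)
        w.writeframes(bytes(frames))
    return out.getvalue()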