Demo Update 9

This commit is contained in:
2025-03-30 01:00:03 -04:00
parent 1c5a9431dc
commit 27c5151e62
2 changed files with 250 additions and 19 deletions

View File

@@ -308,8 +308,8 @@ def process_speech(audio_tensor: torch.Tensor, client_id: str) -> str:
temp_path = os.path.join(base_dir, f"temp_{time.time()}.wav")
torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate)
# Perform speech recognition
result = speech_recognizer(temp_path)
# Perform speech recognition - using input_features instead of inputs
result = speech_recognizer(temp_path, input_features=None) # input_features=None forces use of the correct parameter name
transcription = result["text"]
# Clean up temp file
@@ -650,7 +650,7 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=Fals
# Combine audio chunks
full_audio = torch.cat(client['streaming_buffer'], dim=0)
# Process audio to generate a response (no speech recognition)
# Process audio to generate a response (using speech recognition)
generated_text = process_speech(full_audio, client_id)
# Add suffix for incomplete utterances
@@ -706,16 +706,28 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=Fals
)
client['context_segments'].append(ai_segment)
# Convert audio to base64 and send back to client
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'text': response_text,
'audio': audio_base64
}, room=client_id)
logger.info(f"[{client_id[:8]}] Audio response sent")
# CHANGE HERE: Use the streaming function instead of sending all at once
# Check if the audio is short enough to send at once or if it should be streamed
if audio_tensor.size(0) < generator.sample_rate * 2: # Less than 2 seconds
# For short responses, just send in one go for better responsiveness
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'text': response_text,
'audio': audio_base64
}, room=client_id)
logger.info(f"[{client_id[:8]}] Short audio response sent in one piece")
else:
# For longer responses, use streaming
logger.info(f"[{client_id[:8]}] Using streaming for audio response")
# Start a new thread for streaming to avoid blocking the main thread
import threading
stream_thread = threading.Thread(
target=stream_audio_to_client,
args=(client_id, audio_tensor, response_text, ai_speaker_id)
)
stream_thread.start()
except Exception as e:
logger.error(f"Error generating audio response: {e}")
emit('error', {