BGV
2025-03-30 02:23:33 -04:00
2 changed files with 50 additions and 33 deletions

View File

@@ -87,6 +87,7 @@ class Conversation:
self.session_id = session_id
self.segments: List[Segment] = []
self.current_speaker = 0
self.ai_speaker_id = 1 # Speaker ID used for the AI's voice (mirrored as aiSpeakerId on the client)
self.last_activity = time.time()
self.is_processing = False
@@ -209,6 +210,8 @@ def process_audio_queue(session_id, q):
continue
except Exception as e:
logger.error(f"Error processing audio for {session_id}: {str(e)}")
# Create an app context for the socket emit
with app.app_context():
socketio.emit('error', {'message': str(e)}, room=session_id)
finally:
logger.info(f"Ending processing thread for session: {session_id}")
@@ -222,6 +225,7 @@ def process_audio_queue(session_id, q):
def process_audio_and_respond(session_id, data):
"""Process audio data and generate a response"""
if models.generator is None or models.asr is None or models.llm is None:
with app.app_context():
socketio.emit('error', {'message': 'Models still loading, please wait'}, room=session_id)
return
@@ -260,6 +264,7 @@ def process_audio_and_respond(session_id, data):
)
# Transcribe audio
with app.app_context():
socketio.emit('processing_status', {'status': 'transcribing'}, room=session_id)
# Use the ASR pipeline to transcribe
@@ -271,6 +276,7 @@ def process_audio_and_respond(session_id, data):
# If no text was recognized, don't process further
if not user_text:
with app.app_context():
socketio.emit('error', {'message': 'No speech detected'}, room=session_id)
return
@@ -282,12 +288,14 @@ def process_audio_and_respond(session_id, data):
)
# Send transcription to client
with app.app_context():
socketio.emit('transcription', {
'text': user_text,
'speaker': speaker_id
}, room=session_id)
# Generate AI response using Llama
with app.app_context():
socketio.emit('processing_status', {'status': 'generating'}, room=session_id)
# Create prompt from conversation history
@@ -319,11 +327,9 @@ def process_audio_and_respond(session_id, data):
).strip()
# Synthesize speech
with app.app_context():
socketio.emit('processing_status', {'status': 'synthesizing'}, room=session_id)
# Generate audio with CSM
ai_speaker_id = 1 # Use speaker 1 for AI responses
# Start sending the audio response
socketio.emit('audio_response_start', {
'text': response_text,
@@ -331,10 +337,13 @@ def process_audio_and_respond(session_id, data):
'chunk_index': 0
}, room=session_id)
# Define AI speaker ID (use a consistent value for the AI's voice)
ai_speaker_id = 1 # Use speaker 1 for AI responses
# Generate audio
audio_tensor = models.generator.generate(
text=response_text,
speaker=ai_speaker_id,
speaker=ai_speaker_id, # Use the local variable instead of conversation.ai_speaker_id
context=conversation.get_context(),
max_audio_length_ms=10_000,
temperature=0.9
@@ -343,7 +352,7 @@ def process_audio_and_respond(session_id, data):
# Add AI response to conversation history
ai_segment = conversation.add_segment(
text=response_text,
speaker=ai_speaker_id,
speaker=ai_speaker_id, # Also use the local variable here
audio=audio_tensor
)
@@ -362,6 +371,7 @@ def process_audio_and_respond(session_id, data):
audio_base64 = f"data:audio/wav;base64,{base64.b64encode(wav_data).decode('utf-8')}"
# Send audio chunk to client
with app.app_context():
socketio.emit('audio_response_chunk', {
'chunk': audio_base64,
'chunk_index': 0,
@@ -381,6 +391,7 @@ def process_audio_and_respond(session_id, data):
except Exception as e:
logger.error(f"Error processing audio: {str(e)}")
with app.app_context():
socketio.emit('error', {'message': f'Error: {str(e)}'}, room=session_id)
finally:
# Reset processing flag

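The recurring change in this file is that every socketio.emit issued from the per-session background worker is now wrapped in with app.app_context():. Below is a minimal sketch of that pattern under the same assumptions as the diff (Flask + Flask-SocketIO, one worker thread per session feeding off a queue). emit_to_session and worker are illustrative names; only the event names, payload shapes, and the room= keyword mirror the code above.

import logging
import queue

from flask import Flask
from flask_socketio import SocketIO

app = Flask(__name__)
socketio = SocketIO(app)
logger = logging.getLogger(__name__)

def emit_to_session(event, payload, session_id):
    """Emit to one client's room from a worker thread."""
    # Worker threads have no request/app context of their own,
    # so push an application context before emitting.
    with app.app_context():
        socketio.emit(event, payload, room=session_id)

def worker(session_id, q):
    """Illustrative per-session worker: drain queued audio and report status/errors."""
    try:
        while True:
            data = q.get(timeout=30)   # illustrative timeout
            if data is None:           # sentinel value: shut the worker down
                break
            try:
                emit_to_session('processing_status', {'status': 'transcribing'}, session_id)
                # ... ASR / LLM / speech-synthesis steps from the diff would run here ...
            except Exception as e:
                logger.error(f"Error processing audio for {session_id}: {str(e)}")
                emit_to_session('error', {'message': str(e)}, session_id)
    except queue.Empty:
        pass
    finally:
        logger.info(f"Ending processing thread for session: {session_id}")

The same wrapper appears at each emit site in process_audio_and_respond (transcription, processing_status, the audio_response_* events, and error reporting), which is why the hunks above touch every emit call rather than a single location.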
View File

@@ -42,7 +42,8 @@ const state = {
silenceTimer: null,
volumeUpdateInterval: null,
visualizerAnimationFrame: null,
currentSpeaker: 0
currentSpeaker: 0,
aiSpeakerId: 1 // Define the AI's speaker ID to match server.py
};
// Visualizer variables
@@ -674,10 +675,14 @@ function handleAudioResponseChunk(data) {
streamingAudio.chunks[data.chunk_index] = data.chunk;
streamingAudio.receivedChunks++;
// Store audio element reference for later use
streamingAudio.audioElement = audioElement;
// Add to the conversation
const messages = elements.conversation.querySelectorAll('.message.ai');
if (messages.length > 0) {
const lastAiMessage = messages[messages.length - 1];
streamingAudio.messageElement = lastAiMessage;
// Replace existing audio player if there is one
const existingPlayer = lastAiMessage.querySelector('.audio-player');
@@ -690,6 +695,7 @@ function handleAudioResponseChunk(data) {
// Create a new message for the AI response
const aiMessage = document.createElement('div');
aiMessage.className = 'message ai';
streamingAudio.messageElement = aiMessage;
if (streamingAudio.text) {
const textElement = document.createElement('p');
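For context on what handleAudioResponseChunk is assembling: the Python diff above emits an 'audio_response_start' event and then base64 data-URL 'audio_response_chunk' events, which the client indexes by chunk_index and attaches to the current .message.ai element. The sketch below shows the server side of that flow, assuming the synthesized audio arrives as a 1-D torch tensor and torchaudio is available; encode_wav_base64, the sample_rate parameter, and the single-chunk flow are illustrative, while the event names, the 'text'/'chunk'/'chunk_index' keys, and the data-URL encoding come from the diff.

import base64
import io

import torch
import torchaudio

def encode_wav_base64(audio_tensor: torch.Tensor, sample_rate: int) -> str:
    """Encode a 1-D audio tensor as a base64 WAV data URL (illustrative helper)."""
    buf = io.BytesIO()
    # torchaudio expects (channels, frames), so add a channel dimension.
    torchaudio.save(buf, audio_tensor.unsqueeze(0).cpu(), sample_rate, format="wav")
    wav_data = buf.getvalue()
    return f"data:audio/wav;base64,{base64.b64encode(wav_data).decode('utf-8')}"

def send_audio_response(app, socketio, session_id, response_text, audio_tensor, sample_rate):
    """Announce the response, then stream it as (here) a single base64 chunk."""
    with app.app_context():
        socketio.emit('audio_response_start', {
            'text': response_text,
            'chunk_index': 0,
        }, room=session_id)
        socketio.emit('audio_response_chunk', {
            'chunk': encode_wav_base64(audio_tensor, sample_rate),
            'chunk_index': 0,
        }, room=session_id)

On the client, state.aiSpeakerId mirrors the server's ai_speaker_id = 1, so segments tagged with that speaker value are rendered as AI messages, the same .message.ai elements that handleAudioResponseChunk selects when it attaches the audio player.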