Merge branch 'main' of https://github.com/GamerBoss101/HooHacks-12
@@ -87,6 +87,7 @@ class Conversation:
         self.session_id = session_id
         self.segments: List[Segment] = []
         self.current_speaker = 0
+        self.ai_speaker_id = 1  # Fixed speaker slot for AI-generated audio
         self.last_activity = time.time()
         self.is_processing = False
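This hunk fixes the AI's voice to speaker slot 1, with the human on slot 0. For reference, a minimal sketch of the class as the commit leaves it; the `Segment` stand-in here is hypothetical and reduced to the fields this diff touches:

```python
import time
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Segment:
    """Hypothetical stand-in for the repo's Segment type."""
    text: str
    speaker: int
    audio: Optional[object] = None  # a waveform tensor in the real code

class Conversation:
    def __init__(self, session_id: str):
        self.session_id = session_id
        self.segments: List[Segment] = []
        self.current_speaker = 0  # slot 0: the human user
        self.ai_speaker_id = 1    # slot 1: the AI, stable across turns
        self.last_activity = time.time()
        self.is_processing = False
```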
@@ -209,7 +210,9 @@ def process_audio_queue(session_id, q):
                 continue
             except Exception as e:
                 logger.error(f"Error processing audio for {session_id}: {str(e)}")
-                socketio.emit('error', {'message': str(e)}, room=session_id)
+                # Create an app context for the socket emit
+                with app.app_context():
+                    socketio.emit('error', {'message': str(e)}, room=session_id)
     finally:
         logger.info(f"Ending processing thread for session: {session_id}")
         # Clean up when thread is done
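This is the pattern the commit applies throughout server.py: any `socketio.emit` issued from a background worker thread is wrapped in `app.app_context()`, since those threads run outside Flask's request and application contexts. A minimal self-contained sketch, assuming Flask and Flask-SocketIO; the simulated failure is only there to exercise the error path:

```python
import threading

from flask import Flask
from flask_socketio import SocketIO

app = Flask(__name__)
socketio = SocketIO(app)

def process_audio_queue(session_id: str) -> None:
    """Background worker: no request or app context exists on this thread."""
    try:
        raise RuntimeError("simulated processing failure")  # hypothetical
    except Exception as e:
        # Push an app context before emitting, mirroring the diff above.
        with app.app_context():
            socketio.emit('error', {'message': str(e)}, room=session_id)

# Started the same way the server spawns its per-session workers:
threading.Thread(target=process_audio_queue, args=("demo",), daemon=True).start()
```

Flask-SocketIO also offers `socketio.start_background_task()`, its documented way to spawn such workers, but the explicit context push above matches what this commit does.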
@@ -222,7 +225,8 @@ def process_audio_queue(session_id, q):
 def process_audio_and_respond(session_id, data):
     """Process audio data and generate a response"""
     if models.generator is None or models.asr is None or models.llm is None:
-        socketio.emit('error', {'message': 'Models still loading, please wait'}, room=session_id)
+        with app.app_context():
+            socketio.emit('error', {'message': 'Models still loading, please wait'}, room=session_id)
         return

     conversation = active_conversations[session_id]
@@ -260,7 +264,8 @@ def process_audio_and_respond(session_id, data):
         )

         # Transcribe audio
-        socketio.emit('processing_status', {'status': 'transcribing'}, room=session_id)
+        with app.app_context():
+            socketio.emit('processing_status', {'status': 'transcribing'}, room=session_id)

         # Use the ASR pipeline to transcribe
         transcription_result = models.asr(
@@ -271,7 +276,8 @@ def process_audio_and_respond(session_id, data):

         # If no text was recognized, don't process further
         if not user_text:
-            socketio.emit('error', {'message': 'No speech detected'}, room=session_id)
+            with app.app_context():
+                socketio.emit('error', {'message': 'No speech detected'}, room=session_id)
             return

         # Add the user's message to conversation history
@@ -282,13 +288,15 @@ def process_audio_and_respond(session_id, data):
         )

         # Send transcription to client
-        socketio.emit('transcription', {
-            'text': user_text,
-            'speaker': speaker_id
-        }, room=session_id)
+        with app.app_context():
+            socketio.emit('transcription', {
+                'text': user_text,
+                'speaker': speaker_id
+            }, room=session_id)

         # Generate AI response using Llama
-        socketio.emit('processing_status', {'status': 'generating'}, room=session_id)
+        with app.app_context():
+            socketio.emit('processing_status', {'status': 'generating'}, room=session_id)

         # Create prompt from conversation history
         conversation_history = ""
@@ -319,22 +327,23 @@ def process_audio_and_respond(session_id, data):
         ).strip()

         # Synthesize speech
-        socketio.emit('processing_status', {'status': 'synthesizing'}, room=session_id)
-
-        # Generate audio with CSM
-        ai_speaker_id = 1  # Use speaker 1 for AI responses
-
-        # Start sending the audio response
-        socketio.emit('audio_response_start', {
-            'text': response_text,
-            'total_chunks': 1,
-            'chunk_index': 0
-        }, room=session_id)
+        with app.app_context():
+            socketio.emit('processing_status', {'status': 'synthesizing'}, room=session_id)
+
+            # Start sending the audio response
+            socketio.emit('audio_response_start', {
+                'text': response_text,
+                'total_chunks': 1,
+                'chunk_index': 0
+            }, room=session_id)
+
+        # Define AI speaker ID (use a consistent value for the AI's voice)
+        ai_speaker_id = 1  # Use speaker 1 for AI responses

         # Generate audio
         audio_tensor = models.generator.generate(
             text=response_text,
-            speaker=ai_speaker_id,
+            speaker=ai_speaker_id,  # Use the local variable instead of conversation.ai_speaker_id
             context=conversation.get_context(),
             max_audio_length_ms=10_000,
             temperature=0.9
@@ -343,7 +352,7 @@ def process_audio_and_respond(session_id, data):
         # Add AI response to conversation history
         ai_segment = conversation.add_segment(
             text=response_text,
-            speaker=ai_speaker_id,
+            speaker=ai_speaker_id,  # Also use the local variable here
             audio=audio_tensor
         )

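Together with the generation hunk above, this keeps a single speaker slot for the AI on both sides of the round trip: `generate()` is told to speak as slot 1, and the resulting segment is stored under slot 1, so the context fed into the next turn attributes the AI's audio consistently. A hedged sketch of that loop; the `respond` helper and module-level constant are illustrative, while the call signatures mirror the diff:

```python
AI_SPEAKER_ID = 1  # must agree with Conversation.ai_speaker_id and the client

def respond(generator, conversation, response_text: str):
    # Condition on all prior segments; AI turns stored under AI_SPEAKER_ID
    # keep the synthesized voice consistent from turn to turn.
    audio_tensor = generator.generate(
        text=response_text,
        speaker=AI_SPEAKER_ID,
        context=conversation.get_context(),
        max_audio_length_ms=10_000,
        temperature=0.9,
    )
    # Store the reply under the same slot so the next turn's context
    # attributes this audio to the AI as well.
    conversation.add_segment(
        text=response_text,
        speaker=AI_SPEAKER_ID,
        audio=audio_tensor,
    )
    return audio_tensor
```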
@@ -362,17 +371,18 @@ def process_audio_and_respond(session_id, data):
             audio_base64 = f"data:audio/wav;base64,{base64.b64encode(wav_data).decode('utf-8')}"

             # Send audio chunk to client
-            socketio.emit('audio_response_chunk', {
-                'chunk': audio_base64,
-                'chunk_index': 0,
-                'total_chunks': 1,
-                'is_last': True
-            }, room=session_id)
-
-            # Signal completion
-            socketio.emit('audio_response_complete', {
-                'text': response_text
-            }, room=session_id)
+            with app.app_context():
+                socketio.emit('audio_response_chunk', {
+                    'chunk': audio_base64,
+                    'chunk_index': 0,
+                    'total_chunks': 1,
+                    'is_last': True
+                }, room=session_id)
+
+                # Signal completion
+                socketio.emit('audio_response_complete', {
+                    'text': response_text
+                }, room=session_id)

         finally:
             # Clean up temp file
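The chunk payload above ships the whole WAV as a base64 `data:` URI. A sketch of producing that string from a mono waveform tensor, assuming torchaudio is in use and a 24 kHz output rate (both assumptions; the diff does not show how `wav_data` is produced):

```python
import base64
import io

import torch
import torchaudio

def tensor_to_wav_data_uri(audio_tensor: torch.Tensor,
                           sample_rate: int = 24_000) -> str:
    """Encode a 1-D float waveform as the data URI the client expects."""
    buf = io.BytesIO()
    # torchaudio.save wants (channels, frames), so add a channel dimension.
    torchaudio.save(buf, audio_tensor.unsqueeze(0).cpu(), sample_rate,
                    format="wav")
    wav_data = buf.getvalue()
    return f"data:audio/wav;base64,{base64.b64encode(wav_data).decode('utf-8')}"
```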
@@ -381,7 +391,8 @@ def process_audio_and_respond(session_id, data):

     except Exception as e:
         logger.error(f"Error processing audio: {str(e)}")
-        socketio.emit('error', {'message': f'Error: {str(e)}'}, room=session_id)
+        with app.app_context():
+            socketio.emit('error', {'message': f'Error: {str(e)}'}, room=session_id)
     finally:
         # Reset processing flag
         conversation.is_processing = False
@@ -42,7 +42,8 @@ const state = {
     silenceTimer: null,
     volumeUpdateInterval: null,
     visualizerAnimationFrame: null,
-    currentSpeaker: 0
+    currentSpeaker: 0,
+    aiSpeakerId: 1 // Define the AI's speaker ID to match server.py
 };

 // Visualizer variables
@@ -674,10 +675,14 @@ function handleAudioResponseChunk(data) {
     streamingAudio.chunks[data.chunk_index] = data.chunk;
     streamingAudio.receivedChunks++;

+    // Store audio element reference for later use
+    streamingAudio.audioElement = audioElement;
+
     // Add to the conversation
     const messages = elements.conversation.querySelectorAll('.message.ai');
     if (messages.length > 0) {
         const lastAiMessage = messages[messages.length - 1];
+        streamingAudio.messageElement = lastAiMessage;

         // Replace existing audio player if there is one
         const existingPlayer = lastAiMessage.querySelector('.audio-player');
@@ -690,6 +695,7 @@ function handleAudioResponseChunk(data) {
         // Create a new message for the AI response
         const aiMessage = document.createElement('div');
         aiMessage.className = 'message ai';
+        streamingAudio.messageElement = aiMessage;

         if (streamingAudio.text) {
             const textElement = document.createElement('p');