BGV
2025-03-30 02:23:33 -04:00
2 changed files with 50 additions and 33 deletions

View File

@@ -87,6 +87,7 @@ class Conversation:
self.session_id = session_id
self.segments: List[Segment] = []
self.current_speaker = 0
self.ai_speaker_id = 1 # Speaker ID used for the AI's voice (mirrored as aiSpeakerId on the client)
self.last_activity = time.time()
self.is_processing = False
@@ -209,6 +210,8 @@ def process_audio_queue(session_id, q):
continue
except Exception as e:
logger.error(f"Error processing audio for {session_id}: {str(e)}")
# Create an app context for the socket emit
with app.app_context():
socketio.emit('error', {'message': str(e)}, room=session_id)
finally:
logger.info(f"Ending processing thread for session: {session_id}")
@@ -222,6 +225,7 @@ def process_audio_queue(session_id, q):
def process_audio_and_respond(session_id, data):
"""Process audio data and generate a response"""
if models.generator is None or models.asr is None or models.llm is None:
with app.app_context():
socketio.emit('error', {'message': 'Models still loading, please wait'}, room=session_id)
return
@@ -260,6 +264,7 @@ def process_audio_and_respond(session_id, data):
)
# Transcribe audio
with app.app_context():
socketio.emit('processing_status', {'status': 'transcribing'}, room=session_id)
# Use the ASR pipeline to transcribe
@@ -271,6 +276,7 @@ def process_audio_and_respond(session_id, data):
# If no text was recognized, don't process further
if not user_text:
with app.app_context():
socketio.emit('error', {'message': 'No speech detected'}, room=session_id)
return
@@ -282,12 +288,14 @@ def process_audio_and_respond(session_id, data):
)
# Send transcription to client
with app.app_context():
socketio.emit('transcription', {
'text': user_text,
'speaker': speaker_id
}, room=session_id)
# Generate AI response using Llama
with app.app_context():
socketio.emit('processing_status', {'status': 'generating'}, room=session_id)
# Create prompt from conversation history
@@ -319,11 +327,9 @@ def process_audio_and_respond(session_id, data):
).strip()
# Synthesize speech
with app.app_context():
socketio.emit('processing_status', {'status': 'synthesizing'}, room=session_id)
# Generate audio with CSM
ai_speaker_id = 1 # Use speaker 1 for AI responses
# Start sending the audio response
socketio.emit('audio_response_start', {
'text': response_text,
@@ -331,10 +337,13 @@ def process_audio_and_respond(session_id, data):
'chunk_index': 0
}, room=session_id)
# Define AI speaker ID (use a consistent value for the AI's voice)
ai_speaker_id = 1 # Use speaker 1 for AI responses
# Generate audio
audio_tensor = models.generator.generate(
text=response_text,
speaker=ai_speaker_id,
speaker=ai_speaker_id, # Use the local variable instead of conversation.ai_speaker_id
context=conversation.get_context(),
max_audio_length_ms=10_000,
temperature=0.9
@@ -343,7 +352,7 @@ def process_audio_and_respond(session_id, data):
# Add AI response to conversation history
ai_segment = conversation.add_segment(
text=response_text,
speaker=ai_speaker_id,
speaker=ai_speaker_id, # Also use the local variable here
audio=audio_tensor
)
@@ -362,6 +371,7 @@ def process_audio_and_respond(session_id, data):
audio_base64 = f"data:audio/wav;base64,{base64.b64encode(wav_data).decode('utf-8')}"
# Send audio chunk to client
with app.app_context():
socketio.emit('audio_response_chunk', {
'chunk': audio_base64,
'chunk_index': 0,
@@ -381,6 +391,7 @@ def process_audio_and_respond(session_id, data):
except Exception as e:
logger.error(f"Error processing audio: {str(e)}")
with app.app_context():
socketio.emit('error', {'message': f'Error: {str(e)}'}, room=session_id)
finally:
# Reset processing flag

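The recurring change in this file is that every socketio.emit issued from the per-session background worker is now wrapped in with app.app_context():. Below is a minimal sketch of that pattern under the same assumptions as the diff (Flask + Flask-SocketIO, one worker thread per session feeding off a queue). emit_to_session and worker are illustrative names; only the event names, payload shapes, and the room= keyword mirror the code above.

import logging
import queue

from flask import Flask
from flask_socketio import SocketIO

app = Flask(__name__)
socketio = SocketIO(app)
logger = logging.getLogger(__name__)

def emit_to_session(event, payload, session_id):
    """Emit to one client's room from a worker thread."""
    # Worker threads have no request/app context of their own,
    # so push an application context before emitting.
    with app.app_context():
        socketio.emit(event, payload, room=session_id)

def worker(session_id, q):
    """Illustrative per-session worker: drain queued audio and report status/errors."""
    try:
        while True:
            data = q.get(timeout=30)   # illustrative timeout
            if data is None:           # sentinel value: shut the worker down
                break
            try:
                emit_to_session('processing_status', {'status': 'transcribing'}, session_id)
                # ... ASR / LLM / speech-synthesis steps from the diff would run here ...
            except Exception as e:
                logger.error(f"Error processing audio for {session_id}: {str(e)}")
                emit_to_session('error', {'message': str(e)}, session_id)
    except queue.Empty:
        pass
    finally:
        logger.info(f"Ending processing thread for session: {session_id}")

The same wrapper appears at each emit site in process_audio_and_respond (transcription, processing_status, the audio_response_* events, and error reporting), which is why the hunks above touch every emit call rather than a single location.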
View File

@@ -42,7 +42,8 @@ const state = {
silenceTimer: null,
volumeUpdateInterval: null,
visualizerAnimationFrame: null,
currentSpeaker: 0
currentSpeaker: 0,
aiSpeakerId: 1 // Define the AI's speaker ID to match server.py
};
// Visualizer variables
@@ -674,10 +675,14 @@ function handleAudioResponseChunk(data) {
streamingAudio.chunks[data.chunk_index] = data.chunk;
streamingAudio.receivedChunks++;
// Store audio element reference for later use
streamingAudio.audioElement = audioElement;
// Add to the conversation
const messages = elements.conversation.querySelectorAll('.message.ai');
if (messages.length > 0) {
const lastAiMessage = messages[messages.length - 1];
streamingAudio.messageElement = lastAiMessage;
// Replace existing audio player if there is one
const existingPlayer = lastAiMessage.querySelector('.audio-player');
@@ -690,6 +695,7 @@ function handleAudioResponseChunk(data) {
// Create a new message for the AI response
const aiMessage = document.createElement('div');
aiMessage.className = 'message ai';
streamingAudio.messageElement = aiMessage;
if (streamingAudio.text) {
const textElement = document.createElement('p');
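For context on what handleAudioResponseChunk is assembling: the Python diff above emits an 'audio_response_start' event and then base64 data-URL 'audio_response_chunk' events, which the client indexes by chunk_index and attaches to the current .message.ai element. The sketch below shows the server side of that flow, assuming the synthesized audio arrives as a 1-D torch tensor and torchaudio is available; encode_wav_base64, the sample_rate parameter, and the single-chunk flow are illustrative, while the event names, the 'text'/'chunk'/'chunk_index' keys, and the data-URL encoding come from the diff.

import base64
import io

import torch
import torchaudio

def encode_wav_base64(audio_tensor: torch.Tensor, sample_rate: int) -> str:
    """Encode a 1-D audio tensor as a base64 WAV data URL (illustrative helper)."""
    buf = io.BytesIO()
    # torchaudio expects (channels, frames), so add a channel dimension.
    torchaudio.save(buf, audio_tensor.unsqueeze(0).cpu(), sample_rate, format="wav")
    wav_data = buf.getvalue()
    return f"data:audio/wav;base64,{base64.b64encode(wav_data).decode('utf-8')}"

def send_audio_response(app, socketio, session_id, response_text, audio_tensor, sample_rate):
    """Announce the response, then stream it as (here) a single base64 chunk."""
    with app.app_context():
        socketio.emit('audio_response_start', {
            'text': response_text,
            'chunk_index': 0,
        }, room=session_id)
        socketio.emit('audio_response_chunk', {
            'chunk': encode_wav_base64(audio_tensor, sample_rate),
            'chunk_index': 0,
        }, room=session_id)

On the client, state.aiSpeakerId mirrors the server's ai_speaker_id = 1, so segments tagged with that speaker value are rendered as AI messages, the same .message.ai elements that handleAudioResponseChunk selects when it attaches the audio player.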