Commit by BGV on 2025-03-30 11:48:23 -04:00
3 changed files with 33 additions and 135 deletions

View File

@@ -274,7 +274,6 @@
 const whisperStatus = document.getElementById('whisper-status');
 const csmStatus = document.getElementById('csm-status');
 const llmStatus = document.getElementById('llm-status');
-const webrtcStatus = document.getElementById('webrtc-status');
 const micAnimation = document.getElementById('mic-animation');
 const loadingDiv = document.getElementById('loading');
 const loadingText = document.getElementById('loading-text');
@@ -286,14 +285,7 @@
 let isAiSpeaking = false;
 let audioContext;
 let mediaStream;
-let audioRecorder;
 let audioProcessor;
-const audioChunks = [];
-// WebRTC variables
-let peerConnection;
-let dataChannel;
-let hasActiveConnection = false;
 // Audio playback
 let audioQueue = [];
@@ -302,7 +294,6 @@
 // Configuration variables
 let serverSampleRate = 24000;
 let clientSampleRate = 44100;
-let iceServers = [];
 // Initialize the application
 initApp();
@@ -329,7 +320,6 @@
     updateConnectionStatus('disconnected');
     isConnected = false;
     cleanupAudio();
-    cleanupWebRTC();
 });
 socket.on('session_ready', (data) => {
@@ -337,11 +327,13 @@
     updateModelStatus(data);
     clientSampleRate = data.client_sample_rate;
     serverSampleRate = data.server_sample_rate;
-    iceServers = data.ice_servers;
-    // Initialize WebRTC if models are available
-    if (data.whisper_available && data.llm_available) {
-        initializeWebRTC();
+    // Enable start button if models are available
+    if (data.whisper_available && data.csm_available) {
+        startButton.disabled = false;
+        addInfoMessage('Ready for conversation. Click "Start Listening" to begin.');
+    } else {
+        addInfoMessage('Some models are not available. Voice chat might not work properly.');
     }
 });
@@ -351,10 +343,6 @@
     addInfoMessage('Ready for conversation. Click "Start Listening" to begin.');
 });
-socket.on('webrtc_signal', (data) => {
-    handleWebRTCSignal(data);
-});
 socket.on('transcription', (data) => {
     console.log('Transcription:', data);
     addUserMessage(data.text);
@@ -460,98 +448,6 @@
     llmStatus.style.color = data.llm_available ? 'green' : 'red';
 }
-// Initialize WebRTC connection
-function initializeWebRTC() {
-    if (!isConnected) return;
-    const configuration = {
-        iceServers: iceServers
-    };
-    peerConnection = new RTCPeerConnection(configuration);
-    // Create data channel for WebRTC communication
-    dataChannel = peerConnection.createDataChannel('audioData', {
-        ordered: true
-    });
-    dataChannel.onopen = () => {
-        console.log('WebRTC data channel open');
-        hasActiveConnection = true;
-        webrtcStatus.textContent = 'Connected';
-        webrtcStatus.style.color = 'green';
-        socket.emit('webrtc_connected', { status: 'connected' });
-    };
-    dataChannel.onclose = () => {
-        console.log('WebRTC data channel closed');
-        hasActiveConnection = false;
-        webrtcStatus.textContent = 'Disconnected';
-        webrtcStatus.style.color = 'red';
-    };
-    // Handle ICE candidates
-    peerConnection.onicecandidate = (event) => {
-        if (event.candidate) {
-            socket.emit('webrtc_signal', {
-                type: 'ice_candidate',
-                candidate: event.candidate
-            });
-        }
-    };
-    // Log ICE connection state changes
-    peerConnection.oniceconnectionstatechange = () => {
-        console.log('ICE connection state:', peerConnection.iceConnectionState);
-    };
-    // Create offer
-    peerConnection.createOffer()
-        .then(offer => peerConnection.setLocalDescription(offer))
-        .then(() => {
-            socket.emit('webrtc_signal', {
-                type: 'offer',
-                sdp: peerConnection.localDescription
-            });
-        })
-        .catch(error => {
-            console.error('Error creating WebRTC offer:', error);
-            webrtcStatus.textContent = 'Failed to Connect';
-            webrtcStatus.style.color = 'red';
-        });
-}
-// Handle WebRTC signals from the server
-function handleWebRTCSignal(data) {
-    if (!peerConnection) return;
-    if (data.type === 'answer') {
-        peerConnection.setRemoteDescription(new RTCSessionDescription(data.sdp))
-            .catch(error => console.error('Error setting remote description:', error));
-    }
-    else if (data.type === 'ice_candidate') {
-        peerConnection.addIceCandidate(new RTCIceCandidate(data.candidate))
-            .catch(error => console.error('Error adding ICE candidate:', error));
-    }
-}
-// Clean up WebRTC connection
-function cleanupWebRTC() {
-    if (dataChannel) {
-        dataChannel.close();
-    }
-    if (peerConnection) {
-        peerConnection.close();
-    }
-    dataChannel = null;
-    peerConnection = null;
-    hasActiveConnection = false;
-    webrtcStatus.textContent = 'Not Connected';
-    webrtcStatus.style.color = 'red';
-}
 // Toggle audio listening
 function toggleListening() {
     if (isListening) {
@@ -648,8 +544,6 @@
     if (audioContext && audioContext.state !== 'closed') {
         audioContext.close().catch(error => console.error('Error closing AudioContext:', error));
     }
-    audioChunks.length = 0;
 }
 // Convert Float32Array to Int16Array for sending to server
@@ -669,7 +563,7 @@
     // Convert to base64 for transmission
     const base64Audio = arrayBufferToBase64(audioData.buffer);
-    // Send via Socket.IO (could use WebRTC's DataChannel for lower latency in production)
+    // Send via Socket.IO
     socket.emit('audio_stream', { audio: base64Audio });
 }
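For context on the Socket.IO-only path kept above: the client converts Float32 samples to Int16 PCM, base64-encodes the buffer, and emits it on `audio_stream`. The sketch below shows one way the receiving side could decode such a payload; it is illustrative only, and the function name, normalization, and resampling note are assumptions rather than code from this repository.

```python
import base64
import numpy as np
import torch

def decode_audio_stream(payload: dict, client_sample_rate: int = 44100) -> torch.Tensor:
    """Hypothetical decoder for a base64 Int16 PCM chunk sent via socket.emit('audio_stream', ...)."""
    raw = base64.b64decode(payload["audio"])      # bytes of little-endian int16 samples
    pcm = np.frombuffer(raw, dtype=np.int16)      # shape: (num_samples,)
    # Normalize to float32 in [-1, 1], matching the client's original Float32Array range
    audio = torch.from_numpy(pcm.astype(np.float32) / 32768.0)
    # A real server would resample from client_sample_rate to the model's rate here.
    return audio
```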

View File

@@ -152,7 +152,10 @@ def index():
     """Serve the main interface"""
     return render_template('index.html')
+@app.route('/static/js/voice-chat.js')
+def serve_voice_chat_js():
+    """Serve the JavaScript file"""
+    return app.send_static_file('js/voice-chat.js')
 @socketio.on('connect')
 def handle_connect():
@@ -180,10 +183,6 @@ def handle_connect():
         'should_interrupt_ai': False,
         'ai_stream_queue': queue.Queue(),
-        # WebRTC status
-        'webrtc_connected': False,
-        'webrtc_peer_id': None,
         # Processing flags
         'is_processing': False,
         'pending_user_audio': None
@@ -195,9 +194,10 @@ def handle_connect():
         'csm_available': csm_generator is not None,
         'llm_available': llm_model is not None,
         'client_sample_rate': CLIENT_SAMPLE_RATE,
-        'server_sample_rate': getattr(csm_generator, 'sample_rate', 24000) if csm_generator else 24000,
-        'ice_servers': ICE_SERVERS
+        'server_sample_rate': getattr(csm_generator, 'sample_rate', 24000) if csm_generator else 24000
     })
+    emit('ready_for_speech', {'message': 'Ready to start conversation'})
 @socketio.on('disconnect')
 def handle_disconnect():
@@ -341,10 +341,10 @@ def on_speech_started(session_id):
     # If AI is speaking, we need to interrupt it
     if session['is_ai_speaking']:
         session['should_interrupt_ai'] = True
-        emit('ai_interrupted_by_user', room=session_id)
+        socketio.emit('ai_interrupted_by_user', room=session_id)
     # Notify client that we detected speech
-    emit('user_speech_start', room=session_id)
+    socketio.emit('user_speech_start', room=session_id)
 def on_speech_ended(session_id):
     """Handle end of user speech segment"""
@@ -399,12 +399,12 @@ def on_speech_ended(session_id):
         ).start()
         # Notify client that processing has started
-        emit('processing_speech', room=session_id)
+        socketio.emit('processing_speech', room=session_id)
     except Exception as e:
         print(f"Error preparing audio: {e}")
         session['is_processing'] = False
-        emit('error', {'message': f'Error processing audio: {str(e)}'}, room=session_id)
+        socketio.emit('error', {'message': f'Error processing audio: {str(e)}'}, room=session_id)
 def process_user_utterance(session_id, audio_path, audio_tensor):
     """Process user utterance, transcribe and generate response"""
@@ -427,7 +427,7 @@ def process_user_utterance(session_id, audio_path, audio_tensor):
     # Check if we got meaningful text
     if not user_text or len(user_text.strip()) < 2:
-        emit('no_speech_detected', room=session_id)
+        socketio.emit('no_speech_detected', room=session_id)  # CHANGED: emit → socketio.emit
         session['is_processing'] = False
         return
@@ -448,13 +448,13 @@ def process_user_utterance(session_id, audio_path, audio_tensor):
     })
     # Send transcription to client
-    emit('transcription', {'text': user_text}, room=session_id)
+    socketio.emit('transcription', {'text': user_text}, room=session_id)  # CHANGED: emit → socketio.emit
     # Generate AI response
     ai_response = generate_ai_response(user_text, session_id)
     # Send text response to client
-    emit('ai_response_text', {'text': ai_response}, room=session_id)
+    socketio.emit('ai_response_text', {'text': ai_response}, room=session_id)  # CHANGED: emit → socketio.emit
     # Update conversation history
     session['conversation_history'].append({
@@ -476,7 +476,7 @@ def process_user_utterance(session_id, audio_path, audio_tensor):
     except Exception as e:
         print(f"Error processing utterance: {e}")
-        emit('error', {'message': f'Error: {str(e)}'}, room=session_id)
+        socketio.emit('error', {'message': f'Error: {str(e)}'}, room=session_id)  # CHANGED: emit → socketio.emit
     finally:
         # Clear processing flag
@@ -540,11 +540,13 @@ def generate_ai_response(user_text, session_id):
     # Generate response
     inputs = llm_tokenizer(prompt, return_tensors="pt").to(device)
     output = llm_model.generate(
         inputs.input_ids,
+        attention_mask=inputs.attention_mask,  # Add attention mask
         max_new_tokens=100,  # Keep responses shorter for voice
         temperature=0.7,
         top_p=0.9,
-        do_sample=True
+        do_sample=True,
+        pad_token_id=llm_tokenizer.eos_token_id  # Explicitly set pad_token_id
     )
     response = llm_tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
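The two arguments added to `generate()` above address the usual Hugging Face warnings for GPT-style models: passing `attention_mask` explicitly removes ambiguity when the pad and EOS tokens share an id, and `pad_token_id=eos_token_id` covers tokenizers that define no pad token. A self-contained sketch of the same call pattern, using a placeholder model name rather than this repository's actual LLM:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "distilgpt2"  # placeholder model, not the repo's LLM
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

inputs = tokenizer("User: Hello!\nAssistant:", return_tensors="pt")
with torch.no_grad():
    output = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,   # be explicit instead of letting generate() infer it
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,    # GPT-style tokenizers define no pad token by default
    )

# Decode only the newly generated tokens, as the diff above does
reply = tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(reply)
```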
@@ -587,7 +589,7 @@ def stream_ai_response(text, session_id):
     try:
         # Signal start of AI speech
-        emit('ai_speech_start', room=session_id)
+        socketio.emit('ai_speech_start', room=session_id)  # CHANGED: emit → socketio.emit
         # Use the last few conversation segments as context (up to 4)
         context_segments = session['segments'][-4:] if len(session['segments']) > 4 else session['segments']
@@ -643,15 +645,15 @@
         if session_id in user_sessions:
             session['is_ai_speaking'] = False
             session['is_turn_active'] = False  # End conversation turn
-            socketio.emit('ai_speech_end', room=session_id)
+            socketio.emit('ai_speech_end', room=session_id)  # CHANGED: emit → socketio.emit
     except Exception as e:
         print(f"Error streaming AI response: {e}")
         if session_id in user_sessions:
             session['is_ai_speaking'] = False
             session['is_turn_active'] = False
-            socketio.emit('error', {'message': f'Error generating audio: {str(e)}'}, room=session_id)
-            socketio.emit('ai_speech_end', room=session_id)
+            socketio.emit('error', {'message': f'Error generating audio: {str(e)}'}, room=session_id)  # CHANGED: emit → socketio.emit
+            socketio.emit('ai_speech_end', room=session_id)  # CHANGED: emit → socketio.emit
 @socketio.on('interrupt_ai')
 def handle_interrupt():

View File

@@ -1 +1,3 @@
 # HooHacks-12
+
+Link to graph: https://docs.google.com/drawings/d/1kRQvTaMHf-dSycMcfUhGtug4g9vPiEZEIeLcZqWd6Nc/edit