Merge branch 'main' of https://github.com/GamerBoss101/HooHacks-12
@@ -274,7 +274,6 @@
 const whisperStatus = document.getElementById('whisper-status');
 const csmStatus = document.getElementById('csm-status');
 const llmStatus = document.getElementById('llm-status');
-const webrtcStatus = document.getElementById('webrtc-status');
 const micAnimation = document.getElementById('mic-animation');
 const loadingDiv = document.getElementById('loading');
 const loadingText = document.getElementById('loading-text');
@@ -286,14 +285,7 @@
 let isAiSpeaking = false;
 let audioContext;
 let mediaStream;
-let audioRecorder;
 let audioProcessor;
-const audioChunks = [];
-
-// WebRTC variables
-let peerConnection;
-let dataChannel;
-let hasActiveConnection = false;
 
 // Audio playback
 let audioQueue = [];
@@ -302,7 +294,6 @@
 // Configuration variables
 let serverSampleRate = 24000;
 let clientSampleRate = 44100;
-let iceServers = [];
 
 // Initialize the application
 initApp();
@@ -329,7 +320,6 @@
     updateConnectionStatus('disconnected');
     isConnected = false;
     cleanupAudio();
-    cleanupWebRTC();
 });
 
 socket.on('session_ready', (data) => {
@@ -337,11 +327,13 @@
     updateModelStatus(data);
     clientSampleRate = data.client_sample_rate;
     serverSampleRate = data.server_sample_rate;
-    iceServers = data.ice_servers;
 
-    // Initialize WebRTC if models are available
-    if (data.whisper_available && data.llm_available) {
-        initializeWebRTC();
+    // Enable start button if models are available
+    if (data.whisper_available && data.csm_available) {
+        startButton.disabled = false;
+        addInfoMessage('Ready for conversation. Click "Start Listening" to begin.');
+    } else {
+        addInfoMessage('Some models are not available. Voice chat might not work properly.');
     }
 });
 
@@ -351,10 +343,6 @@
     addInfoMessage('Ready for conversation. Click "Start Listening" to begin.');
 });
 
-socket.on('webrtc_signal', (data) => {
-    handleWebRTCSignal(data);
-});
-
 socket.on('transcription', (data) => {
     console.log('Transcription:', data);
     addUserMessage(data.text);
@@ -460,98 +448,6 @@
     llmStatus.style.color = data.llm_available ? 'green' : 'red';
 }
 
-// Initialize WebRTC connection
-function initializeWebRTC() {
-    if (!isConnected) return;
-
-    const configuration = {
-        iceServers: iceServers
-    };
-
-    peerConnection = new RTCPeerConnection(configuration);
-
-    // Create data channel for WebRTC communication
-    dataChannel = peerConnection.createDataChannel('audioData', {
-        ordered: true
-    });
-
-    dataChannel.onopen = () => {
-        console.log('WebRTC data channel open');
-        hasActiveConnection = true;
-        webrtcStatus.textContent = 'Connected';
-        webrtcStatus.style.color = 'green';
-        socket.emit('webrtc_connected', { status: 'connected' });
-    };
-
-    dataChannel.onclose = () => {
-        console.log('WebRTC data channel closed');
-        hasActiveConnection = false;
-        webrtcStatus.textContent = 'Disconnected';
-        webrtcStatus.style.color = 'red';
-    };
-
-    // Handle ICE candidates
-    peerConnection.onicecandidate = (event) => {
-        if (event.candidate) {
-            socket.emit('webrtc_signal', {
-                type: 'ice_candidate',
-                candidate: event.candidate
-            });
-        }
-    };
-
-    // Log ICE connection state changes
-    peerConnection.oniceconnectionstatechange = () => {
-        console.log('ICE connection state:', peerConnection.iceConnectionState);
-    };
-
-    // Create offer
-    peerConnection.createOffer()
-        .then(offer => peerConnection.setLocalDescription(offer))
-        .then(() => {
-            socket.emit('webrtc_signal', {
-                type: 'offer',
-                sdp: peerConnection.localDescription
-            });
-        })
-        .catch(error => {
-            console.error('Error creating WebRTC offer:', error);
-            webrtcStatus.textContent = 'Failed to Connect';
-            webrtcStatus.style.color = 'red';
-        });
-}
-
-// Handle WebRTC signals from the server
-function handleWebRTCSignal(data) {
-    if (!peerConnection) return;
-
-    if (data.type === 'answer') {
-        peerConnection.setRemoteDescription(new RTCSessionDescription(data.sdp))
-            .catch(error => console.error('Error setting remote description:', error));
-    }
-    else if (data.type === 'ice_candidate') {
-        peerConnection.addIceCandidate(new RTCIceCandidate(data.candidate))
-            .catch(error => console.error('Error adding ICE candidate:', error));
-    }
-}
-
-// Clean up WebRTC connection
-function cleanupWebRTC() {
-    if (dataChannel) {
-        dataChannel.close();
-    }
-
-    if (peerConnection) {
-        peerConnection.close();
-    }
-
-    dataChannel = null;
-    peerConnection = null;
-    hasActiveConnection = false;
-    webrtcStatus.textContent = 'Not Connected';
-    webrtcStatus.style.color = 'red';
-}
-
 // Toggle audio listening
 function toggleListening() {
     if (isListening) {
@@ -648,8 +544,6 @@
     if (audioContext && audioContext.state !== 'closed') {
         audioContext.close().catch(error => console.error('Error closing AudioContext:', error));
     }
-
-    audioChunks.length = 0;
 }
 
 // Convert Float32Array to Int16Array for sending to server
@@ -669,7 +563,7 @@
     // Convert to base64 for transmission
     const base64Audio = arrayBufferToBase64(audioData.buffer);
 
-    // Send via Socket.IO (could use WebRTC's DataChannel for lower latency in production)
+    // Send via Socket.IO
     socket.emit('audio_stream', { audio: base64Audio });
 }
 
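
With the WebRTC data channel gone, microphone audio travels only over Socket.IO: the client base64-encodes Int16 PCM and emits it as an 'audio_stream' event. A minimal sketch of the matching server-side receive path (the event name and payload shape come from the diff above; the handler body is illustrative, not the repo's actual code):

    import base64
    import numpy as np

    @socketio.on('audio_stream')
    def handle_audio_stream(data):
        raw = base64.b64decode(data['audio'])         # base64 -> raw PCM bytes
        samples = np.frombuffer(raw, dtype=np.int16)  # client sends Int16 samples
        # ... append samples to the session buffer / VAD pipeline (illustrative) ...
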
@@ -152,7 +152,10 @@ def index():
     """Serve the main interface"""
     return render_template('index.html')
 
+@app.route('/static/js/voice-chat.js')
+def serve_voice_chat_js():
+    """Serve the JavaScript file"""
+    return app.send_static_file('js/voice-chat.js')
 
 @socketio.on('connect')
 def handle_connect():
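
Flask already serves anything under ./static at /static/... out of the box, and app.send_static_file is the same helper the built-in static endpoint uses, so this route largely restates default behavior. An equivalent explicit formulation, shown here only for comparison (the directory layout in this sketch is hypothetical):

    from flask import send_from_directory

    @app.route('/static/js/voice-chat.js')
    def serve_voice_chat_js():
        # Serve the client script from the static folder explicitly
        return send_from_directory('static/js', 'voice-chat.js')
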
@@ -180,10 +183,6 @@ def handle_connect():
         'should_interrupt_ai': False,
         'ai_stream_queue': queue.Queue(),
 
-        # WebRTC status
-        'webrtc_connected': False,
-        'webrtc_peer_id': None,
-
         # Processing flags
         'is_processing': False,
         'pending_user_audio': None
@@ -195,10 +194,11 @@ def handle_connect():
         'csm_available': csm_generator is not None,
         'llm_available': llm_model is not None,
         'client_sample_rate': CLIENT_SAMPLE_RATE,
-        'server_sample_rate': getattr(csm_generator, 'sample_rate', 24000) if csm_generator else 24000,
-        'ice_servers': ICE_SERVERS
+        'server_sample_rate': getattr(csm_generator, 'sample_rate', 24000) if csm_generator else 24000
     })
 
+    emit('ready_for_speech', {'message': 'Ready to start conversation'})
+
 @socketio.on('disconnect')
 def handle_disconnect():
     """Handle client disconnection"""
@@ -341,10 +341,10 @@ def on_speech_started(session_id):
     # If AI is speaking, we need to interrupt it
     if session['is_ai_speaking']:
         session['should_interrupt_ai'] = True
-        emit('ai_interrupted_by_user', room=session_id)
+        socketio.emit('ai_interrupted_by_user', room=session_id)
 
     # Notify client that we detected speech
-    emit('user_speech_start', room=session_id)
+    socketio.emit('user_speech_start', room=session_id)
 
 def on_speech_ended(session_id):
     """Handle end of user speech segment"""
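
The emit -> socketio.emit switch in this and the following hunks is the substantive fix: Flask-SocketIO's context-bound emit() only works inside a Socket.IO event handler, while on_speech_started and its siblings run from background threads, where it fails with a missing-request-context error. The module-level socketio.emit(..., room=session_id) works from any thread; that is also why the emit('ready_for_speech', ...) added inside handle_connect above is fine unchanged. A minimal sketch of the pattern, assuming a standard Flask-SocketIO setup (handler and event names here are illustrative):

    from flask import Flask, request
    from flask_socketio import SocketIO, emit

    app = Flask(__name__)
    socketio = SocketIO(app)

    def worker(session_id):
        # No request context in a background thread: context-bound emit()
        # would fail, so address the client through the server object.
        socketio.emit('status', {'ok': True}, room=session_id)

    @socketio.on('start')
    def handle_start():
        emit('ack')  # fine: we are inside an event handler
        socketio.start_background_task(worker, request.sid)
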
@@ -399,12 +399,12 @@ def on_speech_ended(session_id):
         ).start()
 
         # Notify client that processing has started
-        emit('processing_speech', room=session_id)
+        socketio.emit('processing_speech', room=session_id)
 
     except Exception as e:
         print(f"Error preparing audio: {e}")
         session['is_processing'] = False
-        emit('error', {'message': f'Error processing audio: {str(e)}'}, room=session_id)
+        socketio.emit('error', {'message': f'Error processing audio: {str(e)}'}, room=session_id)
 
 def process_user_utterance(session_id, audio_path, audio_tensor):
     """Process user utterance, transcribe and generate response"""
@@ -427,7 +427,7 @@ def process_user_utterance(session_id, audio_path, audio_tensor):
 
         # Check if we got meaningful text
         if not user_text or len(user_text.strip()) < 2:
-            emit('no_speech_detected', room=session_id)
+            socketio.emit('no_speech_detected', room=session_id)  # CHANGED: emit → socketio.emit
             session['is_processing'] = False
             return
 
@@ -448,13 +448,13 @@ def process_user_utterance(session_id, audio_path, audio_tensor):
         })
 
         # Send transcription to client
-        emit('transcription', {'text': user_text}, room=session_id)
+        socketio.emit('transcription', {'text': user_text}, room=session_id)  # CHANGED: emit → socketio.emit
 
         # Generate AI response
         ai_response = generate_ai_response(user_text, session_id)
 
         # Send text response to client
-        emit('ai_response_text', {'text': ai_response}, room=session_id)
+        socketio.emit('ai_response_text', {'text': ai_response}, room=session_id)  # CHANGED: emit → socketio.emit
 
         # Update conversation history
         session['conversation_history'].append({
@@ -476,7 +476,7 @@ def process_user_utterance(session_id, audio_path, audio_tensor):
 
     except Exception as e:
         print(f"Error processing utterance: {e}")
-        emit('error', {'message': f'Error: {str(e)}'}, room=session_id)
+        socketio.emit('error', {'message': f'Error: {str(e)}'}, room=session_id)  # CHANGED: emit → socketio.emit
 
     finally:
         # Clear processing flag
@@ -541,10 +541,12 @@ def generate_ai_response(user_text, session_id):
         inputs = llm_tokenizer(prompt, return_tensors="pt").to(device)
         output = llm_model.generate(
             inputs.input_ids,
+            attention_mask=inputs.attention_mask,  # Add attention mask
             max_new_tokens=100,  # Keep responses shorter for voice
             temperature=0.7,
             top_p=0.9,
-            do_sample=True
+            do_sample=True,
+            pad_token_id=llm_tokenizer.eos_token_id  # Explicitly set pad_token_id
         )
 
         response = llm_tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
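
The two added kwargs address the standard Hugging Face warnings for GPT-style checkpoints: such tokenizers usually define no pad token, so generate() warns that pad_token_id is falling back to eos_token_id, and without an explicit attention_mask it cannot reliably tell padding from content. Passing both makes the behavior explicit. A self-contained sketch with a generic causal LM (gpt2 stands in for the repo's model, which this hunk doesn't name):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    inputs = tok("Hello, how are you?", return_tensors="pt")
    output = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # mark real tokens vs. padding
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tok.eos_token_id,         # explicit pad token for GPT-style models
    )
    # Decode only the newly generated tokens, as the diff does
    print(tok.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
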
@@ -587,7 +589,7 @@ def stream_ai_response(text, session_id):
 
     try:
         # Signal start of AI speech
-        emit('ai_speech_start', room=session_id)
+        socketio.emit('ai_speech_start', room=session_id)  # CHANGED: emit → socketio.emit
 
         # Use the last few conversation segments as context (up to 4)
         context_segments = session['segments'][-4:] if len(session['segments']) > 4 else session['segments']
@@ -643,15 +645,15 @@ def stream_ai_response(text, session_id):
         if session_id in user_sessions:
             session['is_ai_speaking'] = False
             session['is_turn_active'] = False  # End conversation turn
-            socketio.emit('ai_speech_end', room=session_id)
+            socketio.emit('ai_speech_end', room=session_id)  # CHANGED: emit → socketio.emit
 
     except Exception as e:
         print(f"Error streaming AI response: {e}")
         if session_id in user_sessions:
             session['is_ai_speaking'] = False
             session['is_turn_active'] = False
-            socketio.emit('error', {'message': f'Error generating audio: {str(e)}'}, room=session_id)
-            socketio.emit('ai_speech_end', room=session_id)
+            socketio.emit('error', {'message': f'Error generating audio: {str(e)}'}, room=session_id)  # CHANGED: emit → socketio.emit
+            socketio.emit('ai_speech_end', room=session_id)  # CHANGED: emit → socketio.emit
 
 @socketio.on('interrupt_ai')
 def handle_interrupt():
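
Note that both the success and error paths above end by emitting 'ai_speech_end', so the client's speaking state cannot get stuck after a failure. A finally: block would express the same guarantee more directly; a hedged sketch of that alternative shape (not the repo's code, body elided):

    try:
        socketio.emit('ai_speech_start', room=session_id)
        # ... generate and stream audio chunks ...
    except Exception as e:
        socketio.emit('error', {'message': str(e)}, room=session_id)
    finally:
        # Runs on success and failure alike, so the client always unblocks
        socketio.emit('ai_speech_end', room=session_id)
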