Server and Client Side update
@@ -1,9 +1,13 @@
<!-- /Backend/index.html -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Sesame AI Voice Chat</title>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
<!-- Socket.IO client library -->
<script src="https://cdn.socket.io/4.6.0/socket.io.min.js"></script>
<style>
body {
font-family: 'Arial', sans-serif;
@@ -11,6 +15,12 @@
margin: 0 auto;
padding: 20px;
background-color: #f9f9f9;
color: #333;
}
h1 {
text-align: center;
margin-bottom: 20px;
color: #1a73e8;
}
.conversation {
border: 1px solid #ddd;
@@ -21,6 +31,7 @@
margin-bottom: 20px;
background-color: white;
box-shadow: 0 2px 10px rgba(0,0,0,0.05);
scroll-behavior: smooth;
}
.message {
margin-bottom: 15px;
@@ -28,6 +39,7 @@
border-radius: 12px;
max-width: 80%;
line-height: 1.4;
animation: message-appear 0.3s ease-out;
}
.user {
background-color: #e3f2fd;
@@ -55,6 +67,7 @@
gap: 15px;
justify-content: center;
align-items: center;
margin-bottom: 15px;
}
button {
padding: 12px 24px;
@@ -66,11 +79,20 @@
font-weight: bold;
transition: all 0.2s ease;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
}
button:hover {
background-color: #45a049;
box-shadow: 0 4px 8px rgba(0,0,0,0.15);
}
button:disabled {
background-color: #cccccc;
cursor: not-allowed;
opacity: 0.7;
}
.recording {
background-color: #f44336;
animation: pulse 1.5s infinite;
@@ -94,6 +116,10 @@
50% { opacity: 0.7; }
100% { opacity: 1; }
}
@keyframes message-appear {
from { opacity: 0; transform: translateY(10px); }
to { opacity: 1; transform: translateY(0); }
}
.status-indicator {
display: flex;
align-items: center;
@@ -106,6 +132,7 @@
height: 10px;
border-radius: 50%;
background-color: #ccc;
transition: background-color 0.3s ease;
}
.status-dot.active {
background-color: #4CAF50;
@@ -117,6 +144,7 @@
audio {
width: 100%;
margin-top: 5px;
border-radius: 8px;
}
.visualizer-container {
width: 100%;
@@ -126,14 +154,13 @@
margin-bottom: 15px;
overflow: hidden;
position: relative;
box-shadow: inset 0 1px 3px rgba(0,0,0,0.1);
}

.audio-visualizer {
width: 100%;
height: 100%;
display: block;
}

.visualizer-label {
position: absolute;
top: 50%;
@@ -145,6 +172,21 @@
opacity: 0.7;
text-align: center;
width: 100%;
transition: opacity 0.3s ease;
}
.conversation::-webkit-scrollbar {
width: 8px;
}
.conversation::-webkit-scrollbar-track {
background: #f1f1f1;
border-radius: 10px;
}
.conversation::-webkit-scrollbar-thumb {
background: #ccc;
border-radius: 10px;
}
.conversation::-webkit-scrollbar-thumb:hover {
background: #aaa;
}
</style>
</head>
@@ -162,8 +204,8 @@
<option value="0">Speaker 0</option>
<option value="1">Speaker 1</option>
</select>
<button id="streamButton">Start Conversation</button>
<button id="clearButton">Clear Chat</button>
<button id="streamButton"><i class="fas fa-microphone"></i> Start Conversation</button>
<button id="clearButton"><i class="fas fa-trash"></i> Clear Chat</button>
</div>

<div class="status-indicator">
@@ -173,7 +215,7 @@

<script>
// Variables
let ws;
let socket;
let audioContext;
let streamProcessor;
let isStreaming = false;
@@ -184,14 +226,13 @@
const CLIENT_SILENCE_THRESHOLD = 0.01;
const CLIENT_SILENCE_DURATION_MS = 1000; // 1 second

// Add these variables with your existing ones
// Visualizer variables
let analyser;
let visualizerCanvas;
let canvasContext;
let visualizerBufferLength;
let visualizerDataArray;
let visualizerAnimationFrame;
const visualizerLabel = document.getElementById('visualizerLabel');

// DOM elements
const conversationEl = document.getElementById('conversation');
@@ -200,93 +241,150 @@
const clearButton = document.getElementById('clearButton');
const statusDot = document.getElementById('statusDot');
const statusText = document.getElementById('statusText');
const visualizerLabel = document.getElementById('visualizerLabel');

// Initialize on page load
window.addEventListener('load', () => {
connectWebSocket();
// Initialize audio context
setupAudioContext();

// Setup visualization
setupVisualizer();

// Event listeners
// Connect to Socket.IO server
connectSocketIO();

// Add event listeners
streamButton.addEventListener('click', toggleStreaming);
clearButton.addEventListener('click', clearConversation);
});

// Setup audio context for streaming
// Setup audio context
function setupAudioContext() {
try {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
console.log('Audio context setup completed');
console.log('Audio context initialized');
} catch (err) {
console.error('Error setting up audio context:', err);
addSystemMessage(`Audio context error: ${err.message}`);
streamButton.disabled = true;
}
}

// Connect to WebSocket server
function connectWebSocket() {
const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
const wsUrl = `${wsProtocol}//${window.location.hostname}:8000/ws`;
// Setup the audio visualizer
function setupVisualizer() {
visualizerCanvas = document.getElementById('audioVisualizer');
canvasContext = visualizerCanvas.getContext('2d');

ws = new WebSocket(wsUrl);
// Set canvas size to match container
function resizeCanvas() {
const container = visualizerCanvas.parentElement;
visualizerCanvas.width = container.clientWidth;
visualizerCanvas.height = container.clientHeight;
}

ws.onopen = () => {
console.log('WebSocket connected');
// Call initially and on window resize
resizeCanvas();
window.addEventListener('resize', resizeCanvas);

// Create placeholder data array
visualizerBufferLength = 128;
visualizerDataArray = new Uint8Array(visualizerBufferLength);
}

// Connect to Socket.IO server
function connectSocketIO() {
// Use the server URL with or without a specific port
const serverUrl = window.location.origin;

console.log(`Connecting to Socket.IO server at ${serverUrl}`);
socket = io(serverUrl, {
reconnectionDelay: 1000,
reconnectionDelayMax: 5000,
reconnectionAttempts: Infinity
});

// Socket.IO event handlers
socket.on('connect', () => {
console.log('Connected to Socket.IO server');
statusDot.classList.add('active');
statusText.textContent = 'Connected';
addSystemMessage('Connected to server');
};
streamButton.disabled = false;
});

ws.onmessage = (event) => {
const response = JSON.parse(event.data);
console.log('Received:', response);

if (response.type === 'audio_response') {
// Play audio response
const audio = new Audio(response.audio);
audio.play();

// Add message to conversation
addAIMessage(response.text || 'AI response', response.audio);

// Reset to speaking state after AI response
if (isStreaming) {
streamButton.textContent = 'Listening...';
streamButton.style.backgroundColor = '#f44336'; // Back to red
streamButton.classList.add('recording');
isSpeaking = false; // Reset speaking state
}
} else if (response.type === 'error') {
addSystemMessage(`Error: ${response.message}`);
} else if (response.type === 'context_updated') {
addSystemMessage(response.message);
} else if (response.type === 'streaming_status') {
addSystemMessage(`Streaming ${response.status}`);
} else if (response.type === 'transcription') {
addUserTranscription(response.text);
}
};

ws.onclose = () => {
console.log('WebSocket disconnected');
socket.on('disconnect', () => {
console.log('Disconnected from Socket.IO server');
statusDot.classList.remove('active');
statusText.textContent = 'Disconnected';
addSystemMessage('Disconnected from server. Reconnecting...');
setTimeout(connectWebSocket, 3000);
};
addSystemMessage('Disconnected from server');
streamButton.disabled = true;

ws.onerror = (error) => {
console.error('WebSocket error:', error);
// Stop streaming if active
if (isStreaming) {
stopStreaming(false); // false = don't send to server
}
});

socket.on('status', (data) => {
console.log('Status update:', data);
addSystemMessage(data.message);
});

socket.on('error', (data) => {
console.error('Server error:', data);
addSystemMessage(`Error: ${data.message}`);
});

socket.on('audio_response', (data) => {
console.log('Received audio response');

// Play audio response
const audio = new Audio(data.audio);
audio.play();

// Add message to conversation
addAIMessage(data.text || 'AI response', data.audio);

// Reset UI state after AI response
if (isStreaming) {
streamButton.textContent = 'Listening...';
streamButton.innerHTML = '<i class="fas fa-microphone"></i> Listening...';
streamButton.style.backgroundColor = '#f44336';
streamButton.classList.add('recording');
streamButton.classList.remove('processing');
isSpeaking = false; // Reset speaking state
}
});

socket.on('transcription', (data) => {
console.log('Received transcription:', data);
addUserTranscription(data.text);
});

socket.on('context_updated', (data) => {
console.log('Context updated:', data);
addSystemMessage(data.message);
});

socket.on('streaming_status', (data) => {
console.log('Streaming status:', data);
addSystemMessage(`Streaming ${data.status}`);
});

socket.on('connect_error', (error) => {
console.error('Connection error:', error);
statusDot.classList.remove('active');
statusText.textContent = 'Error';
addSystemMessage('Connection error');
};
statusText.textContent = 'Connection Error';
addSystemMessage('Failed to connect to server');
streamButton.disabled = true;
});
}

// Toggle streaming
function toggleStreaming() {
if (isStreaming) {
stopStreaming();
stopStreaming(true); // true = send to server
} else {
startStreaming();
}
@@ -295,49 +393,52 @@
// Start streaming
async function startStreaming() {
try {
// Request microphone access
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const speaker = parseInt(speakerSelectEl.value);

// Update state
isStreaming = true;
isSpeaking = false;
energyWindow = [];

streamButton.textContent = 'Listening...';
// Update UI
streamButton.innerHTML = '<i class="fas fa-microphone"></i> Listening...';
streamButton.classList.add('recording');

// Create audio processor node
// Setup audio analysis
const source = audioContext.createMediaStreamSource(stream);

// Set up analyser for visualization with better settings
// Setup analyzer for visualization
analyser = audioContext.createAnalyser();
analyser.fftSize = 256;
analyser.smoothingTimeConstant = 0.8; // Add smoothing for nicer visualization
analyser.smoothingTimeConstant = 0.8;
analyser.minDecibels = -90;
analyser.maxDecibels = -10;

visualizerBufferLength = analyser.frequencyBinCount;
visualizerDataArray = new Uint8Array(visualizerBufferLength);

// Connect source to analyzer first
// Connect source to analyzer
source.connect(analyser);

// Hide the label when visualization is active
// Hide visualizer label
visualizerLabel.style.opacity = '0';

// Start drawing the visualization
// Start visualization
if (visualizerAnimationFrame) {
cancelAnimationFrame(visualizerAnimationFrame);
}
drawVisualizer();

// Set up processor for audio processing
// Setup audio processor
streamProcessor = audioContext.createScriptProcessor(4096, 1, 1);

// Connect nodes
// Connect audio nodes
source.connect(streamProcessor);
streamProcessor.connect(audioContext.destination);

// Process and send audio data
// Process audio
streamProcessor.onaudioprocess = function(e) {
const audioData = e.inputBuffer.getChannelData(0);

@@ -349,10 +450,10 @@
const avgEnergy = calculateAverageEnergy();
const isSilent = avgEnergy < CLIENT_SILENCE_THRESHOLD;

// Handle silence/speech transitions for visual feedback
// Handle silence/speech transitions
handleSpeechState(isSilent);

// Continue processing audio regardless of silence state
// Process and send audio
const downsampled = downsampleBuffer(audioData, audioContext.sampleRate, 24000);
sendAudioChunk(downsampled, speaker);
};
@@ -363,8 +464,71 @@
console.error('Error starting audio stream:', err);
addSystemMessage(`Microphone error: ${err.message}`);
isStreaming = false;
streamButton.textContent = 'Start Conversation';
streamButton.classList.remove('recording');
streamButton.innerHTML = '<i class="fas fa-microphone"></i> Start Conversation';
streamButton.classList.remove('recording', 'processing');
}
}

// Stop streaming
function stopStreaming(sendToServer = true) {
// Disconnect audio nodes
if (streamProcessor) {
streamProcessor.disconnect();
streamProcessor = null;
}

if (analyser) {
analyser.disconnect();
analyser = null;
}

// Stop visualization
if (visualizerAnimationFrame) {
cancelAnimationFrame(visualizerAnimationFrame);
visualizerAnimationFrame = null;
}

// Clear canvas
if (canvasContext) {
canvasContext.clearRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);
visualizerLabel.style.opacity = '0.7';
}

// Clear silence timer
if (silenceTimer) {
clearTimeout(silenceTimer);
silenceTimer = null;
}

// Reset state
isStreaming = false;
isSpeaking = false;
energyWindow = [];

// Update UI
streamButton.innerHTML = '<i class="fas fa-microphone"></i> Start Conversation';
streamButton.classList.remove('recording', 'processing');
streamButton.style.backgroundColor = '';

addSystemMessage('Conversation paused');

// Notify server
if (sendToServer && socket.connected) {
socket.emit('stop_streaming', {
speaker: parseInt(speakerSelectEl.value)
});
}
}

// Clear conversation
function clearConversation() {
// Clear UI
conversationEl.innerHTML = '';
addSystemMessage('Conversation cleared');

// Notify server
if (socket.connected) {
socket.emit('clear_context');
}
}

@@ -377,7 +541,7 @@
return sum / buffer.length;
}

// Update the sliding energy window
// Update energy window
function updateEnergyWindow(energy) {
energyWindow.push(energy);
if (energyWindow.length > ENERGY_WINDOW_SIZE) {
@@ -385,20 +549,20 @@
}
}

// Calculate average energy from the window
// Calculate average energy
function calculateAverageEnergy() {
if (energyWindow.length === 0) return 0;
return energyWindow.reduce((sum, val) => sum + val, 0) / energyWindow.length;
}

// Handle speech state changes and visual feedback
// Handle speech state changes
function handleSpeechState(isSilent) {
if (isSpeaking && isSilent) {
// Transition from speaking to silence
if (!silenceTimer) {
silenceTimer = setTimeout(() => {
// Silence persisted long enough
streamButton.textContent = 'Processing...';
streamButton.innerHTML = '<i class="fas fa-cog fa-spin"></i> Processing...';
streamButton.classList.remove('recording');
streamButton.classList.add('processing');
addSystemMessage('Detected pause in speech, processing response...');
@@ -407,24 +571,24 @@
} else if (!isSpeaking && !isSilent) {
// Transition from silence to speaking
isSpeaking = true;
streamButton.textContent = 'Listening...';
streamButton.innerHTML = '<i class="fas fa-microphone"></i> Listening...';
streamButton.classList.add('recording');
streamButton.classList.remove('processing');

// Clear any pending silence timer
// Clear silence timer
if (silenceTimer) {
clearTimeout(silenceTimer);
silenceTimer = null;
}
} else if (isSpeaking && !isSilent) {
// Still speaking, reset any silence timer
// Still speaking, reset silence timer
if (silenceTimer) {
clearTimeout(silenceTimer);
silenceTimer = null;
}
}

// Update speaking state
// Update speaking state for non-silent audio
if (!isSilent) {
isSpeaking = true;
}
@@ -432,83 +596,93 @@

// Send audio chunk to server
function sendAudioChunk(audioData, speaker) {
if (!socket || !socket.connected) {
console.warn('Cannot send audio: socket not connected');
return;
}

const wavData = createWavBlob(audioData, 24000);
const reader = new FileReader();

reader.onloadend = function() {
const base64data = reader.result;

// Send to server
ws.send(JSON.stringify({
action: 'stream_audio',
// Send to server using Socket.IO
socket.emit('stream_audio', {
speaker: speaker,
audio: base64data
}));
});
};

reader.readAsDataURL(wavData);
}

// Stop streaming
function stopStreaming() {
if (streamProcessor) {
streamProcessor.disconnect();
streamProcessor = null;
// Visualization function
function drawVisualizer() {
if (!canvasContext) {
console.error("Canvas context not available");
return;
}

if (analyser) {
analyser.disconnect();
analyser = null;
visualizerAnimationFrame = requestAnimationFrame(drawVisualizer);

// Get frequency data if available
if (isStreaming && analyser) {
try {
analyser.getByteFrequencyData(visualizerDataArray);
} catch (e) {
console.error("Error getting frequency data:", e);
}
} else {
// Fade out when not streaming
for (let i = 0; i < visualizerDataArray.length; i++) {
visualizerDataArray[i] = Math.max(0, visualizerDataArray[i] - 5);
}
}

// Stop the visualization
if (visualizerAnimationFrame) {
cancelAnimationFrame(visualizerAnimationFrame);
visualizerAnimationFrame = null;
// Clear canvas
canvasContext.fillStyle = 'rgba(245, 245, 245, 0.2)';
canvasContext.fillRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);

// Draw bars
const width = visualizerCanvas.width;
const height = visualizerCanvas.height;
const barCount = Math.min(visualizerBufferLength, 64);
const barWidth = width / barCount - 1;

for (let i = 0; i < barCount; i++) {
const index = Math.floor(i * visualizerBufferLength / barCount);
const value = visualizerDataArray[index];

const barHeight = (value / 255) * height;
const x = i * (barWidth + 1);

// Color based on frequency
const hue = 200 + (i / barCount * 60);
const saturation = 90 - (value / 255 * 30);
const lightness = 40 + (value / 255 * 30);

// Draw bar
canvasContext.fillStyle = `hsl(${hue}, ${saturation}%, ${lightness}%)`;
canvasContext.fillRect(x, height - barHeight, barWidth, barHeight);

// Add reflection effect
const gradientHeight = Math.min(10, barHeight / 3);
const gradient = canvasContext.createLinearGradient(
0, height - barHeight,
0, height - barHeight + gradientHeight
);
gradient.addColorStop(0, 'rgba(255, 255, 255, 0.3)');
gradient.addColorStop(1, 'rgba(255, 255, 255, 0)');
canvasContext.fillStyle = gradient;
canvasContext.fillRect(x, height - barHeight, barWidth, gradientHeight);
}

// Clear the canvas
if (canvasContext) {
canvasContext.clearRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);
visualizerLabel.style.opacity = '0.7';
}

// Clear any pending silence timer
if (silenceTimer) {
clearTimeout(silenceTimer);
silenceTimer = null;
}

isStreaming = false;
isSpeaking = false;
energyWindow = [];

streamButton.textContent = 'Start Conversation';
streamButton.classList.remove('recording', 'processing');
streamButton.style.backgroundColor = ''; // Reset to default

addSystemMessage('Conversation paused');

// Send stop streaming signal to server
ws.send(JSON.stringify({
action: 'stop_streaming',
speaker: parseInt(speakerSelectEl.value)
}));
// Show/hide the label
visualizerLabel.style.opacity = isStreaming ? '0' : '0.7';
}

// Clear conversation
function clearConversation() {
// Clear conversation history
ws.send(JSON.stringify({
action: 'clear_context'
}));

// Clear the UI
conversationEl.innerHTML = '';
addSystemMessage('Conversation cleared');
}

// Downsample audio buffer to target sample rate
// Downsample audio buffer
function downsampleBuffer(buffer, sampleRate, targetSampleRate) {
if (targetSampleRate === sampleRate) {
return buffer;
@@ -538,7 +712,7 @@
return result;
}

// Create WAV blob from Float32Array
// Create WAV blob
function createWavBlob(samples, sampleRate) {
const buffer = new ArrayBuffer(44 + samples.length * 2);
const view = new DataView(buffer);
@@ -562,8 +736,7 @@
writeString(view, 36, 'data');
view.setUint32(40, samples.length * 2, true);

// Write the PCM samples
const volume = 0.5;
// Write PCM samples
for (let i = 0; i < samples.length; i++) {
const sample = Math.max(-1, Math.min(1, samples[i]));
view.setInt16(44 + i * 2, sample < 0 ? sample * 0x8000 : sample * 0x7FFF, true);
@@ -572,19 +745,19 @@
return new Blob([buffer], { type: 'audio/wav' });
}

// Write string to DataView
function writeString(view, offset, string) {
for (let i = 0; i < string.length; i++) {
view.setUint8(offset + i, string.charCodeAt(i));
}
}

// Message display functions
// Add user transcription
function addUserTranscription(text) {
// Find if there's already a pending user message
// Find or create user message
let pendingMessage = document.querySelector('.message.user.pending');

if (!pendingMessage) {
// Create a new message
pendingMessage = document.createElement('div');
pendingMessage.classList.add('message', 'user', 'pending');
conversationEl.appendChild(pendingMessage);
@@ -595,6 +768,7 @@
conversationEl.scrollTop = conversationEl.scrollHeight;
}

// Add AI message
function addAIMessage(text, audioSrc) {
const messageEl = document.createElement('div');
messageEl.classList.add('message', 'ai');
@@ -614,6 +788,7 @@
conversationEl.scrollTop = conversationEl.scrollHeight;
}

// Add system message
function addSystemMessage(text) {
const messageEl = document.createElement('div');
messageEl.classList.add('message', 'system');
@@ -621,98 +796,6 @@
conversationEl.appendChild(messageEl);
conversationEl.scrollTop = conversationEl.scrollHeight;
}

// Setup the audio visualizer
function setupVisualizer() {
visualizerCanvas = document.getElementById('audioVisualizer');
canvasContext = visualizerCanvas.getContext('2d');

// Set canvas size to match container
function resizeCanvas() {
const container = visualizerCanvas.parentElement;
visualizerCanvas.width = container.clientWidth;
visualizerCanvas.height = container.clientHeight;
}

// Call initially and on window resize
resizeCanvas();
window.addEventListener('resize', resizeCanvas);

// Create placeholder data array (will be used before streaming starts)
visualizerBufferLength = 128; // Default size
visualizerDataArray = new Uint8Array(visualizerBufferLength);
}

// Add the visualization drawing function
function drawVisualizer() {
// Ensure we have the canvas context
if (!canvasContext) {
console.error("Canvas context not available");
return;
}

visualizerAnimationFrame = requestAnimationFrame(drawVisualizer);

// If we're streaming and have an analyzer, get the frequency data
if (isStreaming && analyser) {
try {
analyser.getByteFrequencyData(visualizerDataArray);
} catch (e) {
console.error("Error getting frequency data:", e);
}
} else {
// If not streaming, gradually reduce all values to create a fade-out effect
for (let i = 0; i < visualizerDataArray.length; i++) {
visualizerDataArray[i] = Math.max(0, visualizerDataArray[i] - 5);
}
}

// Clear the canvas with a very slight background
canvasContext.fillStyle = 'rgba(245, 245, 245, 0.2)';
canvasContext.fillRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);

// Calculate bar width based on canvas size and buffer length
const width = visualizerCanvas.width;
const height = visualizerCanvas.height;
const barCount = Math.min(visualizerBufferLength, 64); // Limit bars for performance
const barWidth = width / barCount - 1; // Leave 1px gap

// Draw bars
for (let i = 0; i < barCount; i++) {
// Use a logarithmic scale for better visualization of lower frequencies
const index = Math.floor(i * visualizerBufferLength / barCount);
const value = visualizerDataArray[index];

// Scale height (values typically range from 0-255)
const barHeight = (value / 255) * height;

// Position x coordinate
const x = i * (barWidth + 1);

// Calculate gradient color based on frequency
const hue = 200 + (i / barCount * 60); // Blue to light-blue/cyan spectrum
const saturation = 90 - (value / 255 * 30); // More saturated for louder sounds
const lightness = 40 + (value / 255 * 30); // Brighter for louder sounds

// Draw the bar
canvasContext.fillStyle = `hsl(${hue}, ${saturation}%, ${lightness}%)`;
canvasContext.fillRect(x, height - barHeight, barWidth, barHeight);

// Add a subtle reflection
const gradientHeight = Math.min(10, barHeight / 3);
const gradient = canvasContext.createLinearGradient(
0, height - barHeight,
0, height - barHeight + gradientHeight
);
gradient.addColorStop(0, 'rgba(255, 255, 255, 0.3)');
gradient.addColorStop(1, 'rgba(255, 255, 255, 0)');
canvasContext.fillStyle = gradient;
canvasContext.fillRect(x, height - barHeight, barWidth, gradientHeight);
}

// Only show the label when not streaming
visualizerLabel.style.opacity = isStreaming ? '0' : '0.7';
}
</script>
</body>
</html>
@@ -1,24 +1,20 @@
import os
import base64
import json
import asyncio
import torch
import torchaudio
import numpy as np
import io
import whisperx
from io import BytesIO
from typing import List, Dict, Any, Optional
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
from fastapi.responses import HTMLResponse, FileResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from flask import Flask, request, send_from_directory, Response
from flask_cors import CORS
from flask_socketio import SocketIO, emit, disconnect
from generator import load_csm_1b, Segment
import uvicorn
import time
import gc
from collections import deque
from threading import Lock

# Select device
if torch.cuda.is_available():
@@ -36,73 +32,39 @@ print("Loading WhisperX model...")
asr_model = whisperx.load_model("medium", device, compute_type="float16")
print("WhisperX model loaded!")

app = FastAPI()

# Add CORS middleware to allow cross-origin requests
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],  # Allow all origins in development
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Silence detection parameters
SILENCE_THRESHOLD = 0.01  # Adjust based on your audio normalization
SILENCE_DURATION_SEC = 1.0  # How long silence must persist

# Define the base directory
base_dir = os.path.dirname(os.path.abspath(__file__))

# Mount a static files directory if you have any static assets like CSS or JS
static_dir = os.path.join(base_dir, "static")
os.makedirs(static_dir, exist_ok=True)  # Create the directory if it doesn't exist
app.mount("/static", StaticFiles(directory=static_dir), name="static")
os.makedirs(static_dir, exist_ok=True)

# Define route to serve index.html as the main page
@app.get("/", response_class=HTMLResponse)
async def get_index():
try:
with open(os.path.join(base_dir, "index.html"), "r") as f:
return HTMLResponse(content=f.read())
except FileNotFoundError:
return HTMLResponse(content="<html><body><h1>Error: index.html not found</h1></body></html>")
# Setup Flask
app = Flask(__name__)
CORS(app)
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')

# Add a favicon endpoint (optional, but good to have)
@app.get("/favicon.ico")
async def get_favicon():
favicon_path = os.path.join(static_dir, "favicon.ico")
if os.path.exists(favicon_path):
return FileResponse(favicon_path)
else:
return HTMLResponse(status_code=204)  # No content

# Connection manager to handle multiple clients
class ConnectionManager:
def __init__(self):
self.active_connections: List[WebSocket] = []

async def connect(self, websocket: WebSocket):
await websocket.accept()
self.active_connections.append(websocket)

def disconnect(self, websocket: WebSocket):
self.active_connections.remove(websocket)

manager = ConnectionManager()

# Silence detection parameters
SILENCE_THRESHOLD = 0.01  # Adjust based on your audio normalization
SILENCE_DURATION_SEC = 1.0  # How long silence must persist to be considered "stopped talking"
# Socket connection management
thread = None
thread_lock = Lock()
active_clients = {}  # Map client_id to client context

# Helper function to convert audio data
async def decode_audio_data(audio_data: str) -> torch.Tensor:
def decode_audio_data(audio_data: str) -> torch.Tensor:
"""Decode base64 audio data to a torch tensor"""
try:
# Extract the actual base64 content
if ',' in audio_data:
audio_data = audio_data.split(',')[1]

# Decode base64 audio data
binary_data = base64.b64decode(audio_data.split(',')[1] if ',' in audio_data else audio_data)
binary_data = base64.b64decode(audio_data)

# Save to a temporary WAV file first
temp_file = BytesIO(binary_data)

# Load audio from binary data, explicitly specifying the format
audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")
# Load audio from binary data
with BytesIO(binary_data) as temp_file:
audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")

# Resample if needed
if sample_rate != generator.sample_rate:
@@ -121,7 +83,7 @@ async def decode_audio_data(audio_data: str) -> torch.Tensor:
return torch.zeros(generator.sample_rate // 2)  # 0.5 seconds of silence


async def encode_audio_data(audio_tensor: torch.Tensor) -> str:
def encode_audio_data(audio_tensor: torch.Tensor) -> str:
"""Encode torch tensor audio to base64 string"""
buf = BytesIO()
torchaudio.save(buf, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav")
@@ -130,40 +92,36 @@ async def encode_audio_data(audio_tensor: torch.Tensor) -> str:
return f"data:audio/wav;base64,{audio_base64}"


async def transcribe_audio(audio_tensor: torch.Tensor) -> str:
def transcribe_audio(audio_tensor: torch.Tensor) -> str:
"""Transcribe audio using WhisperX"""
try:
# Save the tensor to a temporary file
temp_file = BytesIO()
torchaudio.save(temp_file, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav")
temp_file.seek(0)

# Create a temporary file on disk (WhisperX requires a file path)
temp_path = "temp_audio.wav"
with open(temp_path, "wb") as f:
f.write(temp_file.read())
temp_path = os.path.join(base_dir, "temp_audio.wav")
torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate)

# Load and transcribe the audio
audio = whisperx.load_audio(temp_path)
result = asr_model.transcribe(audio, batch_size=16)

# Clean up
os.remove(temp_path)
if os.path.exists(temp_path):
os.remove(temp_path)

# Get the transcription text
if result["segments"] and len(result["segments"]) > 0:
# Combine all segments
transcription = " ".join([segment["text"] for segment in result["segments"]])
print(f"Transcription: {transcription}")
return transcription.strip()
else:
return ""
except Exception as e:
print(f"Error in transcription: {str(e)}")
if os.path.exists("temp_audio.wav"):
os.remove("temp_audio.wav")
return ""


async def generate_response(text: str, conversation_history: List[Segment]) -> str:
def generate_response(text: str, conversation_history: List[Segment]) -> str:
"""Generate a contextual response based on the transcribed text"""
# Simple response logic - can be replaced with a more sophisticated LLM in the future
responses = {
@@ -191,311 +149,319 @@ async def generate_response(text: str, conversation_history: List[Segment]) -> s
else:
return f"I understand you said '{text}'. That's interesting! Can you tell me more about that?"

# Flask routes for serving static content
@app.route('/')
def index():
return send_from_directory(base_dir, 'index.html')

@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
await manager.connect(websocket)
context_segments = []  # Store conversation context
streaming_buffer = []  # Buffer for streaming audio chunks
is_streaming = False
@app.route('/favicon.ico')
def favicon():
if os.path.exists(os.path.join(static_dir, 'favicon.ico')):
return send_from_directory(static_dir, 'favicon.ico')
return Response(status=204)

# Variables for silence detection
last_active_time = time.time()
is_silence = False
energy_window = deque(maxlen=10)  # For tracking recent audio energy
@app.route('/static/<path:path>')
def serve_static(path):
return send_from_directory(static_dir, path)

# Socket.IO event handlers
@socketio.on('connect')
def handle_connect():
client_id = request.sid
print(f"Client connected: {client_id}")

# Initialize client context
active_clients[client_id] = {
'context_segments': [],
'streaming_buffer': [],
'is_streaming': False,
'is_silence': False,
'last_active_time': time.time(),
'energy_window': deque(maxlen=10)
}

emit('status', {'type': 'connected', 'message': 'Connected to server'})

@socketio.on('disconnect')
def handle_disconnect():
client_id = request.sid
if client_id in active_clients:
del active_clients[client_id]
print(f"Client disconnected: {client_id}")

@socketio.on('generate')
def handle_generate(data):
client_id = request.sid
if client_id not in active_clients:
emit('error', {'message': 'Client not registered'})
return

try:
while True:
# Receive JSON data from client
data = await websocket.receive_text()
request = json.loads(data)
text = data.get('text', '')
speaker_id = data.get('speaker', 0)

action = request.get("action")
print(f"Generating audio for: '{text}' with speaker {speaker_id}")

if action == "generate":
try:
text = request.get("text", "")
speaker_id = request.get("speaker", 0)
# Generate audio response
audio_tensor = generator.generate(
text=text,
speaker=speaker_id,
context=active_clients[client_id]['context_segments'],
max_audio_length_ms=10_000,
)

# Generate audio response
print(f"Generating audio for: '{text}' with speaker {speaker_id}")
audio_tensor = generator.generate(
text=text,
speaker=speaker_id,
context=context_segments,
max_audio_length_ms=10_000,
)
# Add to conversation context
active_clients[client_id]['context_segments'].append(
Segment(text=text, speaker=speaker_id, audio=audio_tensor)
)

# Add to conversation context
context_segments.append(Segment(text=text, speaker=speaker_id, audio=audio_tensor))
# Convert audio to base64 and send back to client
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'audio': audio_base64
})

# Convert audio to base64 and send back to client
audio_base64 = await encode_audio_data(audio_tensor)
await websocket.send_json({
"type": "audio_response",
"audio": audio_base64
})
except Exception as e:
print(f"Error generating audio: {str(e)}")
await websocket.send_json({
"type": "error",
"message": f"Error generating audio: {str(e)}"
})

elif action == "add_to_context":
try:
text = request.get("text", "")
speaker_id = request.get("speaker", 0)
audio_data = request.get("audio", "")

# Convert received audio to tensor
audio_tensor = await decode_audio_data(audio_data)

# Add to conversation context
context_segments.append(Segment(text=text, speaker=speaker_id, audio=audio_tensor))

await websocket.send_json({
"type": "context_updated",
"message": "Audio added to context"
})
except Exception as e:
print(f"Error adding to context: {str(e)}")
await websocket.send_json({
"type": "error",
"message": f"Error processing audio: {str(e)}"
})

elif action == "clear_context":
context_segments = []
await websocket.send_json({
"type": "context_updated",
"message": "Context cleared"
})

elif action == "stream_audio":
try:
speaker_id = request.get("speaker", 0)
audio_data = request.get("audio", "")

# Convert received audio to tensor
audio_chunk = await decode_audio_data(audio_data)

# Start streaming mode if not already started
if not is_streaming:
is_streaming = True
streaming_buffer = []
energy_window.clear()
is_silence = False
last_active_time = time.time()
print(f"Streaming started with speaker ID: {speaker_id}")
await websocket.send_json({
"type": "streaming_status",
"status": "started"
})

# Calculate audio energy for silence detection
chunk_energy = torch.mean(torch.abs(audio_chunk)).item()
energy_window.append(chunk_energy)
avg_energy = sum(energy_window) / len(energy_window)

# Debug audio levels
if len(energy_window) >= 5:  # Only start printing after we have enough samples
if avg_energy > SILENCE_THRESHOLD:
print(f"[AUDIO] Active sound detected - Energy: {avg_energy:.6f} (threshold: {SILENCE_THRESHOLD})")
else:
print(f"[AUDIO] Silence detected - Energy: {avg_energy:.6f} (threshold: {SILENCE_THRESHOLD})")

# Check if audio is silent
current_silence = avg_energy < SILENCE_THRESHOLD

# Track silence transition
if not is_silence and current_silence:
# Transition to silence
is_silence = True
last_active_time = time.time()
print("[STREAM] Transition to silence detected")
elif is_silence and not current_silence:
# User started talking again
is_silence = False
print("[STREAM] User resumed speaking")

# Add chunk to buffer regardless of silence state
streaming_buffer.append(audio_chunk)

# Debug buffer size periodically
if len(streaming_buffer) % 10 == 0:
print(f"[BUFFER] Current size: {len(streaming_buffer)} chunks, ~{len(streaming_buffer)/5:.1f} seconds")

# Check if silence has persisted long enough to consider "stopped talking"
silence_elapsed = time.time() - last_active_time

if is_silence and silence_elapsed >= SILENCE_DURATION_SEC and len(streaming_buffer) > 0:
# User has stopped talking - process the collected audio
print(f"[STREAM] Processing audio after {silence_elapsed:.2f}s of silence")
print(f"[STREAM] Processing {len(streaming_buffer)} audio chunks (~{len(streaming_buffer)/5:.1f} seconds)")

full_audio = torch.cat(streaming_buffer, dim=0)

# Log audio statistics
audio_duration = len(full_audio) / generator.sample_rate
audio_min = torch.min(full_audio).item()
audio_max = torch.max(full_audio).item()
audio_mean = torch.mean(full_audio).item()
print(f"[AUDIO] Processed audio - Duration: {audio_duration:.2f}s, Min: {audio_min:.4f}, Max: {audio_max:.4f}, Mean: {audio_mean:.4f}")

# Process with WhisperX speech-to-text
print("[ASR] Starting transcription with WhisperX...")
transcribed_text = await transcribe_audio(full_audio)

# Log the transcription
print(f"[ASR] Transcribed text: '{transcribed_text}'")

# Add to conversation context
if transcribed_text:
print(f"[DIALOG] Adding user utterance to context: '{transcribed_text}'")
user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
context_segments.append(user_segment)

# Generate a contextual response
print("[DIALOG] Generating response...")
response_text = await generate_response(transcribed_text, context_segments)
print(f"[DIALOG] Response text: '{response_text}'")

# Send the transcribed text to client
await websocket.send_json({
"type": "transcription",
"text": transcribed_text
})

# Generate audio for the response
print("[TTS] Generating speech for response...")
audio_tensor = generator.generate(
text=response_text,
speaker=1 if speaker_id == 0 else 0,  # Use opposite speaker
context=context_segments,
max_audio_length_ms=10_000,
)
print(f"[TTS] Generated audio length: {len(audio_tensor)/generator.sample_rate:.2f}s")

# Add response to context
ai_segment = Segment(
text=response_text,
speaker=1 if speaker_id == 0 else 0,
audio=audio_tensor
)
context_segments.append(ai_segment)
print(f"[DIALOG] Context now has {len(context_segments)} segments")

# Convert audio to base64 and send back to client
audio_base64 = await encode_audio_data(audio_tensor)
print("[STREAM] Sending audio response to client")
await websocket.send_json({
"type": "audio_response",
"text": response_text,
"audio": audio_base64
})
else:
print("[ASR] Transcription failed or returned empty text")
# If transcription failed, send a generic response
await websocket.send_json({
"type": "error",
"message": "Sorry, I couldn't understand what you said. Could you try again?"
})

# Clear buffer and reset silence detection
streaming_buffer = []
energy_window.clear()
is_silence = False
last_active_time = time.time()
print("[STREAM] Buffer cleared, ready for next utterance")

# If buffer gets too large without silence, process it anyway
# This prevents memory issues with very long streams
elif len(streaming_buffer) >= 30:  # ~6 seconds of audio at 5 chunks/sec
print("[BUFFER] Maximum buffer size reached, processing audio")
full_audio = torch.cat(streaming_buffer, dim=0)

# Process with WhisperX speech-to-text
print("[ASR] Starting forced transcription of long audio...")
transcribed_text = await transcribe_audio(full_audio)

if transcribed_text:
print(f"[ASR] Transcribed long audio: '{transcribed_text}'")
context_segments.append(Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio))

# Send the transcribed text to client
await websocket.send_json({
"type": "transcription",
"text": transcribed_text + " (processing continued speech...)"
})
else:
print("[ASR] No transcription from long audio")

streaming_buffer = []
print("[BUFFER] Buffer cleared due to size limit")

except Exception as e:
print(f"[ERROR] Processing streaming audio: {str(e)}")
# Print traceback for more detailed error information
import traceback
traceback.print_exc()
await websocket.send_json({
"type": "error",
"message": f"Error processing streaming audio: {str(e)}"
})

elif action == "stop_streaming":
is_streaming = False
if streaming_buffer and len(streaming_buffer) > 5:  # Only process if there's meaningful audio
# Process any remaining audio in the buffer
full_audio = torch.cat(streaming_buffer, dim=0)

# Process with WhisperX speech-to-text
transcribed_text = await transcribe_audio(full_audio)

if transcribed_text:
context_segments.append(Segment(text=transcribed_text, speaker=request.get("speaker", 0), audio=full_audio))

# Send the transcribed text to client
await websocket.send_json({
"type": "transcription",
"text": transcribed_text
})

streaming_buffer = []
await websocket.send_json({
"type": "streaming_status",
"status": "stopped"
})

except WebSocketDisconnect:
manager.disconnect(websocket)
print("Client disconnected")
except Exception as e:
print(f"Error: {str(e)}")
try:
await websocket.send_json({
"type": "error",
"message": str(e)
})
except:
pass
manager.disconnect(websocket)
print(f"Error generating audio: {str(e)}")
emit('error', {
'type': 'error',
'message': f"Error generating audio: {str(e)}"
})

@socketio.on('add_to_context')
def handle_add_to_context(data):
client_id = request.sid
if client_id not in active_clients:
emit('error', {'message': 'Client not registered'})
return

try:
text = data.get('text', '')
speaker_id = data.get('speaker', 0)
audio_data = data.get('audio', '')

# Convert received audio to tensor
audio_tensor = decode_audio_data(audio_data)

# Add to conversation context
active_clients[client_id]['context_segments'].append(
Segment(text=text, speaker=speaker_id, audio=audio_tensor)
)

emit('context_updated', {
'type': 'context_updated',
'message': 'Audio added to context'
})

except Exception as e:
print(f"Error adding to context: {str(e)}")
emit('error', {
'type': 'error',
'message': f"Error processing audio: {str(e)}"
})

@socketio.on('clear_context')
def handle_clear_context():
client_id = request.sid
if client_id in active_clients:
active_clients[client_id]['context_segments'] = []

emit('context_updated', {
'type': 'context_updated',
'message': 'Context cleared'
})

@socketio.on('stream_audio')
def handle_stream_audio(data):
client_id = request.sid
if client_id not in active_clients:
emit('error', {'message': 'Client not registered'})
return

client = active_clients[client_id]

try:
speaker_id = data.get('speaker', 0)
audio_data = data.get('audio', '')

# Convert received audio to tensor
audio_chunk = decode_audio_data(audio_data)

# Start streaming mode if not already started
if not client['is_streaming']:
client['is_streaming'] = True
client['streaming_buffer'] = []
client['energy_window'].clear()
client['is_silence'] = False
client['last_active_time'] = time.time()
print(f"[{client_id}] Streaming started with speaker ID: {speaker_id}")
emit('streaming_status', {
'type': 'streaming_status',
'status': 'started'
})

# Calculate audio energy for silence detection
chunk_energy = torch.mean(torch.abs(audio_chunk)).item()
client['energy_window'].append(chunk_energy)
avg_energy = sum(client['energy_window']) / len(client['energy_window'])

# Check if audio is silent
current_silence = avg_energy < SILENCE_THRESHOLD

# Track silence transition
if not client['is_silence'] and current_silence:
# Transition to silence
client['is_silence'] = True
client['last_active_time'] = time.time()
elif client['is_silence'] and not current_silence:
# User started talking again
client['is_silence'] = False

# Add chunk to buffer regardless of silence state
client['streaming_buffer'].append(audio_chunk)

# Check if silence has persisted long enough to consider "stopped talking"
silence_elapsed = time.time() - client['last_active_time']

if client['is_silence'] and silence_elapsed >= SILENCE_DURATION_SEC and len(client['streaming_buffer']) > 0:
# User has stopped talking - process the collected audio
print(f"[{client_id}] Processing audio after {silence_elapsed:.2f}s of silence")

full_audio = torch.cat(client['streaming_buffer'], dim=0)

# Process with WhisperX speech-to-text
print(f"[{client_id}] Starting transcription with WhisperX...")
transcribed_text = transcribe_audio(full_audio)

# Log the transcription
print(f"[{client_id}] Transcribed text: '{transcribed_text}'")

# Add to conversation context
if transcribed_text:
user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
client['context_segments'].append(user_segment)

# Generate a contextual response
response_text = generate_response(transcribed_text, client['context_segments'])

# Send the transcribed text to client
emit('transcription', {
'type': 'transcription',
'text': transcribed_text
})

# Generate audio for the response
audio_tensor = generator.generate(
text=response_text,
speaker=1 if speaker_id == 0 else 0,  # Use opposite speaker
context=client['context_segments'],
max_audio_length_ms=10_000,
)

# Add response to context
ai_segment = Segment(
text=response_text,
speaker=1 if speaker_id == 0 else 0,
audio=audio_tensor
)
client['context_segments'].append(ai_segment)

# Convert audio to base64 and send back to client
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'text': response_text,
'audio': audio_base64
})
else:
# If transcription failed, send a generic response
emit('error', {
'type': 'error',
'message': "Sorry, I couldn't understand what you said. Could you try again?"
})

# Clear buffer and reset silence detection
client['streaming_buffer'] = []
client['energy_window'].clear()
client['is_silence'] = False
client['last_active_time'] = time.time()

# If buffer gets too large without silence, process it anyway
elif len(client['streaming_buffer']) >= 30:  # ~6 seconds of audio at 5 chunks/sec
full_audio = torch.cat(client['streaming_buffer'], dim=0)

# Process with WhisperX speech-to-text
transcribed_text = transcribe_audio(full_audio)

if transcribed_text:
client['context_segments'].append(
Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
)

# Send the transcribed text to client
emit('transcription', {
'type': 'transcription',
'text': transcribed_text + " (processing continued speech...)"
})

client['streaming_buffer'] = []

except Exception as e:
import traceback
traceback.print_exc()
print(f"Error processing streaming audio: {str(e)}")
emit('error', {
'type': 'error',
'message': f"Error processing streaming audio: {str(e)}"
})

@socketio.on('stop_streaming')
def handle_stop_streaming(data):
client_id = request.sid
if client_id not in active_clients:
return

client = active_clients[client_id]
client['is_streaming'] = False

if client['streaming_buffer'] and len(client['streaming_buffer']) > 5:
# Process any remaining audio in the buffer
full_audio = torch.cat(client['streaming_buffer'], dim=0)

# Process with WhisperX speech-to-text
transcribed_text = transcribe_audio(full_audio)

if transcribed_text:
client['context_segments'].append(
Segment(text=transcribed_text, speaker=data.get("speaker", 0), audio=full_audio)
)

# Send the transcribed text to client
emit('transcription', {
'type': 'transcription',
'text': transcribed_text
})

client['streaming_buffer'] = []
emit('streaming_status', {
'type': 'streaming_status',
'status': 'stopped'
})

# Update the __main__ block with a comprehensive server startup message
if __name__ == "__main__":
print(f"\n{'='*60}")
print(f"🔊 Sesame AI Voice Chat Server")
print(f"🔊 Sesame AI Voice Chat Server (Flask Implementation)")
print(f"{'='*60}")
print(f"📡 Server Information:")
print(f"   - Local URL: http://localhost:8000")
print(f"   - Network URL: http://<your-ip-address>:8000")
print(f"   - WebSocket: ws://<your-ip-address>:8000/ws")
print(f"   - Local URL: http://localhost:5000")
print(f"   - Network URL: http://<your-ip-address>:5000")
print(f"   - WebSocket: ws://<your-ip-address>:5000/socket.io")
print(f"{'='*60}")
print(f"💡 To make this server public:")
print(f"   1. Ensure port 8000 is open in your firewall")
print(f"   2. Set up port forwarding on your router to port 8000")
print(f"   3. Or use a service like ngrok with: ngrok http 8000")
print(f"   1. Ensure port 5000 is open in your firewall")
print(f"   2. Set up port forwarding on your router to port 5000")
print(f"   3. Or use a service like ngrok with: ngrok http 5000")
print(f"{'='*60}")
print(f"🌐 Device: {device.upper()}")
print(f"🧠 Models loaded: Sesame CSM + WhisperX ({asr_model.device})")
@@ -503,5 +469,4 @@ if __name__ == "__main__":
print(f"{'='*60}")
print(f"Ready to receive connections! Press Ctrl+C to stop the server.\n")

# Start the server
uvicorn.run(app, host="0.0.0.0", port=8000)
socketio.run(app, host="0.0.0.0", port=5000, debug=False)