Server and client-side update: migrate realtime transport from FastAPI WebSockets to Flask-SocketIO

2025-03-29 22:48:24 -04:00
parent e1f976eaca
commit 08fec9c403
2 changed files with 655 additions and 607 deletions

Backend/index.html

@@ -1,9 +1,13 @@
<!-- /Backend/index.html -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Sesame AI Voice Chat</title>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
<!-- Socket.IO client library -->
<script src="https://cdn.socket.io/4.6.0/socket.io.min.js"></script>
<style>
body {
font-family: 'Arial', sans-serif;
@@ -11,6 +15,12 @@
margin: 0 auto;
padding: 20px;
background-color: #f9f9f9;
color: #333;
}
h1 {
text-align: center;
margin-bottom: 20px;
color: #1a73e8;
}
.conversation {
border: 1px solid #ddd;
@@ -21,6 +31,7 @@
margin-bottom: 20px;
background-color: white;
box-shadow: 0 2px 10px rgba(0,0,0,0.05);
scroll-behavior: smooth;
}
.message {
margin-bottom: 15px;
@@ -28,6 +39,7 @@
border-radius: 12px;
max-width: 80%;
line-height: 1.4;
animation: message-appear 0.3s ease-out;
}
.user {
background-color: #e3f2fd;
@@ -55,6 +67,7 @@
gap: 15px;
justify-content: center;
align-items: center;
margin-bottom: 15px;
}
button {
padding: 12px 24px;
@@ -66,11 +79,20 @@
font-weight: bold;
transition: all 0.2s ease;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
}
button:hover {
background-color: #45a049;
box-shadow: 0 4px 8px rgba(0,0,0,0.15);
}
button:disabled {
background-color: #cccccc;
cursor: not-allowed;
opacity: 0.7;
}
.recording {
background-color: #f44336;
animation: pulse 1.5s infinite;
@@ -94,6 +116,10 @@
50% { opacity: 0.7; }
100% { opacity: 1; }
}
@keyframes message-appear {
from { opacity: 0; transform: translateY(10px); }
to { opacity: 1; transform: translateY(0); }
}
.status-indicator {
display: flex;
align-items: center;
@@ -106,6 +132,7 @@
height: 10px;
border-radius: 50%;
background-color: #ccc;
transition: background-color 0.3s ease;
}
.status-dot.active {
background-color: #4CAF50;
@@ -117,6 +144,7 @@
audio {
width: 100%;
margin-top: 5px;
border-radius: 8px;
}
.visualizer-container {
width: 100%;
@@ -126,14 +154,13 @@
margin-bottom: 15px;
overflow: hidden;
position: relative;
box-shadow: inset 0 1px 3px rgba(0,0,0,0.1);
}
.audio-visualizer {
width: 100%;
height: 100%;
display: block;
}
.visualizer-label {
position: absolute;
top: 50%;
@@ -145,6 +172,21 @@
opacity: 0.7;
text-align: center;
width: 100%;
transition: opacity 0.3s ease;
}
.conversation::-webkit-scrollbar {
width: 8px;
}
.conversation::-webkit-scrollbar-track {
background: #f1f1f1;
border-radius: 10px;
}
.conversation::-webkit-scrollbar-thumb {
background: #ccc;
border-radius: 10px;
}
.conversation::-webkit-scrollbar-thumb:hover {
background: #aaa;
}
</style>
</head>
@@ -162,8 +204,8 @@
<option value="0">Speaker 0</option>
<option value="1">Speaker 1</option>
</select>
<button id="streamButton">Start Conversation</button>
<button id="clearButton">Clear Chat</button>
<button id="streamButton"><i class="fas fa-microphone"></i> Start Conversation</button>
<button id="clearButton"><i class="fas fa-trash"></i> Clear Chat</button>
</div>
<div class="status-indicator">
@@ -173,7 +215,7 @@
<script>
// Variables
let ws;
let socket;
let audioContext;
let streamProcessor;
let isStreaming = false;
@@ -184,14 +226,13 @@
const CLIENT_SILENCE_THRESHOLD = 0.01;
const CLIENT_SILENCE_DURATION_MS = 1000; // 1 second
// Add these variables with your existing ones
// Visualizer variables
let analyser;
let visualizerCanvas;
let canvasContext;
let visualizerBufferLength;
let visualizerDataArray;
let visualizerAnimationFrame;
const visualizerLabel = document.getElementById('visualizerLabel');
// DOM elements
const conversationEl = document.getElementById('conversation');
@@ -200,93 +241,150 @@
const clearButton = document.getElementById('clearButton');
const statusDot = document.getElementById('statusDot');
const statusText = document.getElementById('statusText');
const visualizerLabel = document.getElementById('visualizerLabel');
// Initialize on page load
window.addEventListener('load', () => {
connectWebSocket();
// Initialize audio context
setupAudioContext();
// Setup visualization
setupVisualizer();
// Event listeners
// Connect to Socket.IO server
connectSocketIO();
// Add event listeners
streamButton.addEventListener('click', toggleStreaming);
clearButton.addEventListener('click', clearConversation);
});
// Setup audio context for streaming
// Setup audio context
function setupAudioContext() {
try {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
console.log('Audio context setup completed');
console.log('Audio context initialized');
} catch (err) {
console.error('Error setting up audio context:', err);
addSystemMessage(`Audio context error: ${err.message}`);
streamButton.disabled = true;
}
}
// Connect to WebSocket server
function connectWebSocket() {
const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
const wsUrl = `${wsProtocol}//${window.location.hostname}:8000/ws`;
// Setup the audio visualizer
function setupVisualizer() {
visualizerCanvas = document.getElementById('audioVisualizer');
canvasContext = visualizerCanvas.getContext('2d');
ws = new WebSocket(wsUrl);
// Set canvas size to match container
function resizeCanvas() {
const container = visualizerCanvas.parentElement;
visualizerCanvas.width = container.clientWidth;
visualizerCanvas.height = container.clientHeight;
}
ws.onopen = () => {
console.log('WebSocket connected');
// Call initially and on window resize
resizeCanvas();
window.addEventListener('resize', resizeCanvas);
// Create placeholder data array
visualizerBufferLength = 128;
visualizerDataArray = new Uint8Array(visualizerBufferLength);
}
// Connect to Socket.IO server
function connectSocketIO() {
// Use the server URL with or without a specific port
const serverUrl = window.location.origin;
console.log(`Connecting to Socket.IO server at ${serverUrl}`);
socket = io(serverUrl, {
reconnectionDelay: 1000,
reconnectionDelayMax: 5000,
reconnectionAttempts: Infinity
});
// Socket.IO event handlers
socket.on('connect', () => {
console.log('Connected to Socket.IO server');
statusDot.classList.add('active');
statusText.textContent = 'Connected';
addSystemMessage('Connected to server');
};
streamButton.disabled = false;
});
ws.onmessage = (event) => {
const response = JSON.parse(event.data);
console.log('Received:', response);
if (response.type === 'audio_response') {
// Play audio response
const audio = new Audio(response.audio);
audio.play();
// Add message to conversation
addAIMessage(response.text || 'AI response', response.audio);
// Reset to speaking state after AI response
if (isStreaming) {
streamButton.textContent = 'Listening...';
streamButton.style.backgroundColor = '#f44336'; // Back to red
streamButton.classList.add('recording');
isSpeaking = false; // Reset speaking state
}
} else if (response.type === 'error') {
addSystemMessage(`Error: ${response.message}`);
} else if (response.type === 'context_updated') {
addSystemMessage(response.message);
} else if (response.type === 'streaming_status') {
addSystemMessage(`Streaming ${response.status}`);
} else if (response.type === 'transcription') {
addUserTranscription(response.text);
}
};
ws.onclose = () => {
console.log('WebSocket disconnected');
socket.on('disconnect', () => {
console.log('Disconnected from Socket.IO server');
statusDot.classList.remove('active');
statusText.textContent = 'Disconnected';
addSystemMessage('Disconnected from server. Reconnecting...');
setTimeout(connectWebSocket, 3000);
};
addSystemMessage('Disconnected from server');
streamButton.disabled = true;
ws.onerror = (error) => {
console.error('WebSocket error:', error);
// Stop streaming if active
if (isStreaming) {
stopStreaming(false); // false = don't send to server
}
});
socket.on('status', (data) => {
console.log('Status update:', data);
addSystemMessage(data.message);
});
socket.on('error', (data) => {
console.error('Server error:', data);
addSystemMessage(`Error: ${data.message}`);
});
socket.on('audio_response', (data) => {
console.log('Received audio response');
// Play audio response
const audio = new Audio(data.audio);
audio.play();
// Add message to conversation
addAIMessage(data.text || 'AI response', data.audio);
// Reset UI state after AI response
if (isStreaming) {
streamButton.textContent = 'Listening...';
streamButton.innerHTML = '<i class="fas fa-microphone"></i> Listening...';
streamButton.style.backgroundColor = '#f44336';
streamButton.classList.add('recording');
streamButton.classList.remove('processing');
isSpeaking = false; // Reset speaking state
}
});
socket.on('transcription', (data) => {
console.log('Received transcription:', data);
addUserTranscription(data.text);
});
socket.on('context_updated', (data) => {
console.log('Context updated:', data);
addSystemMessage(data.message);
});
socket.on('streaming_status', (data) => {
console.log('Streaming status:', data);
addSystemMessage(`Streaming ${data.status}`);
});
socket.on('connect_error', (error) => {
console.error('Connection error:', error);
statusDot.classList.remove('active');
statusText.textContent = 'Error';
addSystemMessage('Connection error');
};
statusText.textContent = 'Connection Error';
addSystemMessage('Failed to connect to server');
streamButton.disabled = true;
});
}
// Toggle streaming
function toggleStreaming() {
if (isStreaming) {
stopStreaming();
stopStreaming(true); // true = send to server
} else {
startStreaming();
}
@@ -295,49 +393,52 @@
// Start streaming
async function startStreaming() {
try {
// Request microphone access
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const speaker = parseInt(speakerSelectEl.value);
// Update state
isStreaming = true;
isSpeaking = false;
energyWindow = [];
streamButton.textContent = 'Listening...';
// Update UI
streamButton.innerHTML = '<i class="fas fa-microphone"></i> Listening...';
streamButton.classList.add('recording');
// Create audio processor node
// Setup audio analysis
const source = audioContext.createMediaStreamSource(stream);
// Set up analyser for visualization with better settings
// Setup analyzer for visualization
analyser = audioContext.createAnalyser();
analyser.fftSize = 256;
analyser.smoothingTimeConstant = 0.8; // Add smoothing for nicer visualization
analyser.smoothingTimeConstant = 0.8;
analyser.minDecibels = -90;
analyser.maxDecibels = -10;
visualizerBufferLength = analyser.frequencyBinCount;
visualizerDataArray = new Uint8Array(visualizerBufferLength);
// Connect source to analyzer first
// Connect source to analyzer
source.connect(analyser);
// Hide the label when visualization is active
// Hide visualizer label
visualizerLabel.style.opacity = '0';
// Start drawing the visualization
// Start visualization
if (visualizerAnimationFrame) {
cancelAnimationFrame(visualizerAnimationFrame);
}
drawVisualizer();
// Set up processor for audio processing
// Setup audio processor
streamProcessor = audioContext.createScriptProcessor(4096, 1, 1);
// Connect nodes
// Connect audio nodes
source.connect(streamProcessor);
streamProcessor.connect(audioContext.destination);
// Process and send audio data
// Process audio
streamProcessor.onaudioprocess = function(e) {
const audioData = e.inputBuffer.getChannelData(0);
@@ -349,10 +450,10 @@
const avgEnergy = calculateAverageEnergy();
const isSilent = avgEnergy < CLIENT_SILENCE_THRESHOLD;
// Handle silence/speech transitions for visual feedback
// Handle silence/speech transitions
handleSpeechState(isSilent);
// Continue processing audio regardless of silence state
// Process and send audio
const downsampled = downsampleBuffer(audioData, audioContext.sampleRate, 24000);
sendAudioChunk(downsampled, speaker);
};
@@ -363,8 +464,71 @@
console.error('Error starting audio stream:', err);
addSystemMessage(`Microphone error: ${err.message}`);
isStreaming = false;
streamButton.textContent = 'Start Conversation';
streamButton.classList.remove('recording');
streamButton.innerHTML = '<i class="fas fa-microphone"></i> Start Conversation';
streamButton.classList.remove('recording', 'processing');
}
}
// Stop streaming
function stopStreaming(sendToServer = true) {
// Disconnect audio nodes
if (streamProcessor) {
streamProcessor.disconnect();
streamProcessor = null;
}
if (analyser) {
analyser.disconnect();
analyser = null;
}
// Stop visualization
if (visualizerAnimationFrame) {
cancelAnimationFrame(visualizerAnimationFrame);
visualizerAnimationFrame = null;
}
// Clear canvas
if (canvasContext) {
canvasContext.clearRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);
visualizerLabel.style.opacity = '0.7';
}
// Clear silence timer
if (silenceTimer) {
clearTimeout(silenceTimer);
silenceTimer = null;
}
// Reset state
isStreaming = false;
isSpeaking = false;
energyWindow = [];
// Update UI
streamButton.innerHTML = '<i class="fas fa-microphone"></i> Start Conversation';
streamButton.classList.remove('recording', 'processing');
streamButton.style.backgroundColor = '';
addSystemMessage('Conversation paused');
// Notify server
if (sendToServer && socket.connected) {
socket.emit('stop_streaming', {
speaker: parseInt(speakerSelectEl.value)
});
}
}
// Clear conversation
function clearConversation() {
// Clear UI
conversationEl.innerHTML = '';
addSystemMessage('Conversation cleared');
// Notify server
if (socket.connected) {
socket.emit('clear_context');
}
}
@@ -377,7 +541,7 @@
return sum / buffer.length;
}
// Update the sliding energy window
// Update energy window
function updateEnergyWindow(energy) {
energyWindow.push(energy);
if (energyWindow.length > ENERGY_WINDOW_SIZE) {
@@ -385,20 +549,20 @@
}
}
// Calculate average energy from the window
// Calculate average energy
function calculateAverageEnergy() {
if (energyWindow.length === 0) return 0;
return energyWindow.reduce((sum, val) => sum + val, 0) / energyWindow.length;
}
// Handle speech state changes and visual feedback
// Handle speech state changes
function handleSpeechState(isSilent) {
if (isSpeaking && isSilent) {
// Transition from speaking to silence
if (!silenceTimer) {
silenceTimer = setTimeout(() => {
// Silence persisted long enough
streamButton.textContent = 'Processing...';
streamButton.innerHTML = '<i class="fas fa-cog fa-spin"></i> Processing...';
streamButton.classList.remove('recording');
streamButton.classList.add('processing');
addSystemMessage('Detected pause in speech, processing response...');
@@ -407,24 +571,24 @@
} else if (!isSpeaking && !isSilent) {
// Transition from silence to speaking
isSpeaking = true;
streamButton.textContent = 'Listening...';
streamButton.innerHTML = '<i class="fas fa-microphone"></i> Listening...';
streamButton.classList.add('recording');
streamButton.classList.remove('processing');
// Clear any pending silence timer
// Clear silence timer
if (silenceTimer) {
clearTimeout(silenceTimer);
silenceTimer = null;
}
} else if (isSpeaking && !isSilent) {
// Still speaking, reset any silence timer
// Still speaking, reset silence timer
if (silenceTimer) {
clearTimeout(silenceTimer);
silenceTimer = null;
}
}
// Update speaking state
// Update speaking state for non-silent audio
if (!isSilent) {
isSpeaking = true;
}
@@ -432,83 +596,93 @@
// Send audio chunk to server
function sendAudioChunk(audioData, speaker) {
if (!socket || !socket.connected) {
console.warn('Cannot send audio: socket not connected');
return;
}
const wavData = createWavBlob(audioData, 24000);
const reader = new FileReader();
reader.onloadend = function() {
const base64data = reader.result;
// Send to server
ws.send(JSON.stringify({
action: 'stream_audio',
// Send to server using Socket.IO
socket.emit('stream_audio', {
speaker: speaker,
audio: base64data
}));
});
};
reader.readAsDataURL(wavData);
}
// Stop streaming
function stopStreaming() {
if (streamProcessor) {
streamProcessor.disconnect();
streamProcessor = null;
// Visualization function
function drawVisualizer() {
if (!canvasContext) {
console.error("Canvas context not available");
return;
}
if (analyser) {
analyser.disconnect();
analyser = null;
visualizerAnimationFrame = requestAnimationFrame(drawVisualizer);
// Get frequency data if available
if (isStreaming && analyser) {
try {
analyser.getByteFrequencyData(visualizerDataArray);
} catch (e) {
console.error("Error getting frequency data:", e);
}
} else {
// Fade out when not streaming
for (let i = 0; i < visualizerDataArray.length; i++) {
visualizerDataArray[i] = Math.max(0, visualizerDataArray[i] - 5);
}
}
// Stop the visualization
if (visualizerAnimationFrame) {
cancelAnimationFrame(visualizerAnimationFrame);
visualizerAnimationFrame = null;
// Clear canvas
canvasContext.fillStyle = 'rgba(245, 245, 245, 0.2)';
canvasContext.fillRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);
// Draw bars
const width = visualizerCanvas.width;
const height = visualizerCanvas.height;
const barCount = Math.min(visualizerBufferLength, 64);
const barWidth = width / barCount - 1;
for (let i = 0; i < barCount; i++) {
const index = Math.floor(i * visualizerBufferLength / barCount);
const value = visualizerDataArray[index];
const barHeight = (value / 255) * height;
const x = i * (barWidth + 1);
// Color based on frequency
const hue = 200 + (i / barCount * 60);
const saturation = 90 - (value / 255 * 30);
const lightness = 40 + (value / 255 * 30);
// Draw bar
canvasContext.fillStyle = `hsl(${hue}, ${saturation}%, ${lightness}%)`;
canvasContext.fillRect(x, height - barHeight, barWidth, barHeight);
// Add reflection effect
const gradientHeight = Math.min(10, barHeight / 3);
const gradient = canvasContext.createLinearGradient(
0, height - barHeight,
0, height - barHeight + gradientHeight
);
gradient.addColorStop(0, 'rgba(255, 255, 255, 0.3)');
gradient.addColorStop(1, 'rgba(255, 255, 255, 0)');
canvasContext.fillStyle = gradient;
canvasContext.fillRect(x, height - barHeight, barWidth, gradientHeight);
}
// Clear the canvas
if (canvasContext) {
canvasContext.clearRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);
visualizerLabel.style.opacity = '0.7';
}
// Clear any pending silence timer
if (silenceTimer) {
clearTimeout(silenceTimer);
silenceTimer = null;
}
isStreaming = false;
isSpeaking = false;
energyWindow = [];
streamButton.textContent = 'Start Conversation';
streamButton.classList.remove('recording', 'processing');
streamButton.style.backgroundColor = ''; // Reset to default
addSystemMessage('Conversation paused');
// Send stop streaming signal to server
ws.send(JSON.stringify({
action: 'stop_streaming',
speaker: parseInt(speakerSelectEl.value)
}));
// Show/hide the label
visualizerLabel.style.opacity = isStreaming ? '0' : '0.7';
}
// Clear conversation
function clearConversation() {
// Clear conversation history
ws.send(JSON.stringify({
action: 'clear_context'
}));
// Clear the UI
conversationEl.innerHTML = '';
addSystemMessage('Conversation cleared');
}
// Downsample audio buffer to target sample rate
// Downsample audio buffer
function downsampleBuffer(buffer, sampleRate, targetSampleRate) {
if (targetSampleRate === sampleRate) {
return buffer;
@@ -538,7 +712,7 @@
return result;
}
// Create WAV blob from Float32Array
// Create WAV blob
function createWavBlob(samples, sampleRate) {
const buffer = new ArrayBuffer(44 + samples.length * 2);
const view = new DataView(buffer);
@@ -562,8 +736,7 @@
writeString(view, 36, 'data');
view.setUint32(40, samples.length * 2, true);
// Write the PCM samples
const volume = 0.5;
// Write PCM samples
for (let i = 0; i < samples.length; i++) {
const sample = Math.max(-1, Math.min(1, samples[i]));
view.setInt16(44 + i * 2, sample < 0 ? sample * 0x8000 : sample * 0x7FFF, true);
@@ -572,19 +745,19 @@
return new Blob([buffer], { type: 'audio/wav' });
}
// Write string to DataView
function writeString(view, offset, string) {
for (let i = 0; i < string.length; i++) {
view.setUint8(offset + i, string.charCodeAt(i));
}
}
// Message display functions
// Add user transcription
function addUserTranscription(text) {
// Find if there's already a pending user message
// Find or create user message
let pendingMessage = document.querySelector('.message.user.pending');
if (!pendingMessage) {
// Create a new message
pendingMessage = document.createElement('div');
pendingMessage.classList.add('message', 'user', 'pending');
conversationEl.appendChild(pendingMessage);
@@ -595,6 +768,7 @@
conversationEl.scrollTop = conversationEl.scrollHeight;
}
// Add AI message
function addAIMessage(text, audioSrc) {
const messageEl = document.createElement('div');
messageEl.classList.add('message', 'ai');
@@ -614,6 +788,7 @@
conversationEl.scrollTop = conversationEl.scrollHeight;
}
// Add system message
function addSystemMessage(text) {
const messageEl = document.createElement('div');
messageEl.classList.add('message', 'system');
@@ -621,98 +796,6 @@
conversationEl.appendChild(messageEl);
conversationEl.scrollTop = conversationEl.scrollHeight;
}
// Setup the audio visualizer
function setupVisualizer() {
visualizerCanvas = document.getElementById('audioVisualizer');
canvasContext = visualizerCanvas.getContext('2d');
// Set canvas size to match container
function resizeCanvas() {
const container = visualizerCanvas.parentElement;
visualizerCanvas.width = container.clientWidth;
visualizerCanvas.height = container.clientHeight;
}
// Call initially and on window resize
resizeCanvas();
window.addEventListener('resize', resizeCanvas);
// Create placeholder data array (will be used before streaming starts)
visualizerBufferLength = 128; // Default size
visualizerDataArray = new Uint8Array(visualizerBufferLength);
}
// Add the visualization drawing function
function drawVisualizer() {
// Ensure we have the canvas context
if (!canvasContext) {
console.error("Canvas context not available");
return;
}
visualizerAnimationFrame = requestAnimationFrame(drawVisualizer);
// If we're streaming and have an analyzer, get the frequency data
if (isStreaming && analyser) {
try {
analyser.getByteFrequencyData(visualizerDataArray);
} catch (e) {
console.error("Error getting frequency data:", e);
}
} else {
// If not streaming, gradually reduce all values to create a fade-out effect
for (let i = 0; i < visualizerDataArray.length; i++) {
visualizerDataArray[i] = Math.max(0, visualizerDataArray[i] - 5);
}
}
// Clear the canvas with a very slight background
canvasContext.fillStyle = 'rgba(245, 245, 245, 0.2)';
canvasContext.fillRect(0, 0, visualizerCanvas.width, visualizerCanvas.height);
// Calculate bar width based on canvas size and buffer length
const width = visualizerCanvas.width;
const height = visualizerCanvas.height;
const barCount = Math.min(visualizerBufferLength, 64); // Limit bars for performance
const barWidth = width / barCount - 1; // Leave 1px gap
// Draw bars
for (let i = 0; i < barCount; i++) {
// Use a logarithmic scale for better visualization of lower frequencies
const index = Math.floor(i * visualizerBufferLength / barCount);
const value = visualizerDataArray[index];
// Scale height (values typically range from 0-255)
const barHeight = (value / 255) * height;
// Position x coordinate
const x = i * (barWidth + 1);
// Calculate gradient color based on frequency
const hue = 200 + (i / barCount * 60); // Blue to light-blue/cyan spectrum
const saturation = 90 - (value / 255 * 30); // More saturated for louder sounds
const lightness = 40 + (value / 255 * 30); // Brighter for louder sounds
// Draw the bar
canvasContext.fillStyle = `hsl(${hue}, ${saturation}%, ${lightness}%)`;
canvasContext.fillRect(x, height - barHeight, barWidth, barHeight);
// Add a subtle reflection
const gradientHeight = Math.min(10, barHeight / 3);
const gradient = canvasContext.createLinearGradient(
0, height - barHeight,
0, height - barHeight + gradientHeight
);
gradient.addColorStop(0, 'rgba(255, 255, 255, 0.3)');
gradient.addColorStop(1, 'rgba(255, 255, 255, 0)');
canvasContext.fillStyle = gradient;
canvasContext.fillRect(x, height - barHeight, barWidth, gradientHeight);
}
// Only show the label when not streaming
visualizerLabel.style.opacity = isStreaming ? '0' : '0.7';
}
</script>
</body>
</html>
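For quick end-to-end testing of the new Socket.IO protocol, a small Python client can drive the same events the browser code above emits. This is a minimal sketch and not part of the commit: it assumes python-socketio is installed (pip install "python-socketio[client]"), the server from the second file is running on localhost:5000, and a local sample.wav (24 kHz mono) standing in for microphone audio.

# test_client.py -- minimal protocol sketch, not part of this commit.
import base64
import socketio

sio = socketio.Client()

@sio.on('transcription')
def on_transcription(data):
    print('Transcribed:', data['text'])

@sio.on('audio_response')
def on_audio_response(data):
    # data['audio'] is a data URL ("data:audio/wav;base64,...") ready to decode or play
    print('AI said:', data.get('text'))

@sio.on('error')
def on_error(data):
    print('Server error:', data['message'])

def send_chunk(wav_bytes, speaker=0):
    payload = 'data:audio/wav;base64,' + base64.b64encode(wav_bytes).decode()
    sio.emit('stream_audio', {'speaker': speaker, 'audio': payload})

if __name__ == '__main__':
    sio.connect('http://localhost:5000')
    with open('sample.wav', 'rb') as f:   # hypothetical test file
        send_chunk(f.read())
    sio.emit('stop_streaming', {'speaker': 0})
    sio.sleep(10)                         # give the server time to transcribe and respond
    sio.disconnect()

Because the server buffers chunks and waits for roughly a second of trailing silence, a realistic test would stream several short chunks followed by near-silent ones before expecting a transcription or audio_response event.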


@@ -1,24 +1,20 @@
import os
import base64
import json
import asyncio
import torch
import torchaudio
import numpy as np
import io
import whisperx
from io import BytesIO
from typing import List, Dict, Any, Optional
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
from fastapi.responses import HTMLResponse, FileResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from flask import Flask, request, send_from_directory, Response
from flask_cors import CORS
from flask_socketio import SocketIO, emit, disconnect
from generator import load_csm_1b, Segment
import uvicorn
import time
import gc
from collections import deque
from threading import Lock
# Select device
if torch.cuda.is_available():
@@ -36,73 +32,39 @@ print("Loading WhisperX model...")
asr_model = whisperx.load_model("medium", device, compute_type="float16")
print("WhisperX model loaded!")
app = FastAPI()
# Add CORS middleware to allow cross-origin requests
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # Allow all origins in development
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Silence detection parameters
SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization
SILENCE_DURATION_SEC = 1.0 # How long silence must persist
# Define the base directory
base_dir = os.path.dirname(os.path.abspath(__file__))
# Mount a static files directory if you have any static assets like CSS or JS
static_dir = os.path.join(base_dir, "static")
os.makedirs(static_dir, exist_ok=True) # Create the directory if it doesn't exist
app.mount("/static", StaticFiles(directory=static_dir), name="static")
os.makedirs(static_dir, exist_ok=True)
# Define route to serve index.html as the main page
@app.get("/", response_class=HTMLResponse)
async def get_index():
try:
with open(os.path.join(base_dir, "index.html"), "r") as f:
return HTMLResponse(content=f.read())
except FileNotFoundError:
return HTMLResponse(content="<html><body><h1>Error: index.html not found</h1></body></html>")
# Setup Flask
app = Flask(__name__)
CORS(app)
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')
# Add a favicon endpoint (optional, but good to have)
@app.get("/favicon.ico")
async def get_favicon():
favicon_path = os.path.join(static_dir, "favicon.ico")
if os.path.exists(favicon_path):
return FileResponse(favicon_path)
else:
return HTMLResponse(status_code=204) # No content
# Connection manager to handle multiple clients
class ConnectionManager:
def __init__(self):
self.active_connections: List[WebSocket] = []
async def connect(self, websocket: WebSocket):
await websocket.accept()
self.active_connections.append(websocket)
def disconnect(self, websocket: WebSocket):
self.active_connections.remove(websocket)
manager = ConnectionManager()
# Silence detection parameters
SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization
SILENCE_DURATION_SEC = 1.0 # How long silence must persist to be considered "stopped talking"
# Socket connection management
thread = None
thread_lock = Lock()
active_clients = {} # Map client_id to client context
# Helper function to convert audio data
async def decode_audio_data(audio_data: str) -> torch.Tensor:
def decode_audio_data(audio_data: str) -> torch.Tensor:
"""Decode base64 audio data to a torch tensor"""
try:
# Extract the actual base64 content
if ',' in audio_data:
audio_data = audio_data.split(',')[1]
# Decode base64 audio data
binary_data = base64.b64decode(audio_data.split(',')[1] if ',' in audio_data else audio_data)
binary_data = base64.b64decode(audio_data)
# Save to a temporary WAV file first
temp_file = BytesIO(binary_data)
# Load audio from binary data, explicitly specifying the format
audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")
# Load audio from binary data
with BytesIO(binary_data) as temp_file:
audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")
# Resample if needed
if sample_rate != generator.sample_rate:
@@ -121,7 +83,7 @@ async def decode_audio_data(audio_data: str) -> torch.Tensor:
return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence
async def encode_audio_data(audio_tensor: torch.Tensor) -> str:
def encode_audio_data(audio_tensor: torch.Tensor) -> str:
"""Encode torch tensor audio to base64 string"""
buf = BytesIO()
torchaudio.save(buf, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav")
@@ -130,40 +92,36 @@ async def encode_audio_data(audio_tensor: torch.Tensor) -> str:
return f"data:audio/wav;base64,{audio_base64}"
async def transcribe_audio(audio_tensor: torch.Tensor) -> str:
def transcribe_audio(audio_tensor: torch.Tensor) -> str:
"""Transcribe audio using WhisperX"""
try:
# Save the tensor to a temporary file
temp_file = BytesIO()
torchaudio.save(temp_file, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav")
temp_file.seek(0)
# Create a temporary file on disk (WhisperX requires a file path)
temp_path = "temp_audio.wav"
with open(temp_path, "wb") as f:
f.write(temp_file.read())
temp_path = os.path.join(base_dir, "temp_audio.wav")
torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate)
# Load and transcribe the audio
audio = whisperx.load_audio(temp_path)
result = asr_model.transcribe(audio, batch_size=16)
# Clean up
os.remove(temp_path)
if os.path.exists(temp_path):
os.remove(temp_path)
# Get the transcription text
if result["segments"] and len(result["segments"]) > 0:
# Combine all segments
transcription = " ".join([segment["text"] for segment in result["segments"]])
print(f"Transcription: {transcription}")
return transcription.strip()
else:
return ""
except Exception as e:
print(f"Error in transcription: {str(e)}")
if os.path.exists("temp_audio.wav"):
os.remove("temp_audio.wav")
return ""
async def generate_response(text: str, conversation_history: List[Segment]) -> str:
def generate_response(text: str, conversation_history: List[Segment]) -> str:
"""Generate a contextual response based on the transcribed text"""
# Simple response logic - can be replaced with a more sophisticated LLM in the future
responses = {
@@ -191,311 +149,319 @@ async def generate_response(text: str, conversation_history: List[Segment]) -> str:
else:
return f"I understand you said '{text}'. That's interesting! Can you tell me more about that?"
# Flask routes for serving static content
@app.route('/')
def index():
return send_from_directory(base_dir, 'index.html')
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
await manager.connect(websocket)
context_segments = [] # Store conversation context
streaming_buffer = [] # Buffer for streaming audio chunks
is_streaming = False
@app.route('/favicon.ico')
def favicon():
if os.path.exists(os.path.join(static_dir, 'favicon.ico')):
return send_from_directory(static_dir, 'favicon.ico')
return Response(status=204)
# Variables for silence detection
last_active_time = time.time()
is_silence = False
energy_window = deque(maxlen=10) # For tracking recent audio energy
@app.route('/static/<path:path>')
def serve_static(path):
return send_from_directory(static_dir, path)
# Socket.IO event handlers
@socketio.on('connect')
def handle_connect():
client_id = request.sid
print(f"Client connected: {client_id}")
# Initialize client context
active_clients[client_id] = {
'context_segments': [],
'streaming_buffer': [],
'is_streaming': False,
'is_silence': False,
'last_active_time': time.time(),
'energy_window': deque(maxlen=10)
}
emit('status', {'type': 'connected', 'message': 'Connected to server'})
@socketio.on('disconnect')
def handle_disconnect():
client_id = request.sid
if client_id in active_clients:
del active_clients[client_id]
print(f"Client disconnected: {client_id}")
@socketio.on('generate')
def handle_generate(data):
client_id = request.sid
if client_id not in active_clients:
emit('error', {'message': 'Client not registered'})
return
try:
while True:
# Receive JSON data from client
data = await websocket.receive_text()
request = json.loads(data)
text = data.get('text', '')
speaker_id = data.get('speaker', 0)
action = request.get("action")
print(f"Generating audio for: '{text}' with speaker {speaker_id}")
if action == "generate":
try:
text = request.get("text", "")
speaker_id = request.get("speaker", 0)
# Generate audio response
audio_tensor = generator.generate(
text=text,
speaker=speaker_id,
context=active_clients[client_id]['context_segments'],
max_audio_length_ms=10_000,
)
# Generate audio response
print(f"Generating audio for: '{text}' with speaker {speaker_id}")
audio_tensor = generator.generate(
text=text,
speaker=speaker_id,
context=context_segments,
max_audio_length_ms=10_000,
)
# Add to conversation context
active_clients[client_id]['context_segments'].append(
Segment(text=text, speaker=speaker_id, audio=audio_tensor)
)
# Add to conversation context
context_segments.append(Segment(text=text, speaker=speaker_id, audio=audio_tensor))
# Convert audio to base64 and send back to client
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'audio': audio_base64
})
# Convert audio to base64 and send back to client
audio_base64 = await encode_audio_data(audio_tensor)
await websocket.send_json({
"type": "audio_response",
"audio": audio_base64
})
except Exception as e:
print(f"Error generating audio: {str(e)}")
await websocket.send_json({
"type": "error",
"message": f"Error generating audio: {str(e)}"
})
elif action == "add_to_context":
try:
text = request.get("text", "")
speaker_id = request.get("speaker", 0)
audio_data = request.get("audio", "")
# Convert received audio to tensor
audio_tensor = await decode_audio_data(audio_data)
# Add to conversation context
context_segments.append(Segment(text=text, speaker=speaker_id, audio=audio_tensor))
await websocket.send_json({
"type": "context_updated",
"message": "Audio added to context"
})
except Exception as e:
print(f"Error adding to context: {str(e)}")
await websocket.send_json({
"type": "error",
"message": f"Error processing audio: {str(e)}"
})
elif action == "clear_context":
context_segments = []
await websocket.send_json({
"type": "context_updated",
"message": "Context cleared"
})
elif action == "stream_audio":
try:
speaker_id = request.get("speaker", 0)
audio_data = request.get("audio", "")
# Convert received audio to tensor
audio_chunk = await decode_audio_data(audio_data)
# Start streaming mode if not already started
if not is_streaming:
is_streaming = True
streaming_buffer = []
energy_window.clear()
is_silence = False
last_active_time = time.time()
print(f"Streaming started with speaker ID: {speaker_id}")
await websocket.send_json({
"type": "streaming_status",
"status": "started"
})
# Calculate audio energy for silence detection
chunk_energy = torch.mean(torch.abs(audio_chunk)).item()
energy_window.append(chunk_energy)
avg_energy = sum(energy_window) / len(energy_window)
# Debug audio levels
if len(energy_window) >= 5: # Only start printing after we have enough samples
if avg_energy > SILENCE_THRESHOLD:
print(f"[AUDIO] Active sound detected - Energy: {avg_energy:.6f} (threshold: {SILENCE_THRESHOLD})")
else:
print(f"[AUDIO] Silence detected - Energy: {avg_energy:.6f} (threshold: {SILENCE_THRESHOLD})")
# Check if audio is silent
current_silence = avg_energy < SILENCE_THRESHOLD
# Track silence transition
if not is_silence and current_silence:
# Transition to silence
is_silence = True
last_active_time = time.time()
print("[STREAM] Transition to silence detected")
elif is_silence and not current_silence:
# User started talking again
is_silence = False
print("[STREAM] User resumed speaking")
# Add chunk to buffer regardless of silence state
streaming_buffer.append(audio_chunk)
# Debug buffer size periodically
if len(streaming_buffer) % 10 == 0:
print(f"[BUFFER] Current size: {len(streaming_buffer)} chunks, ~{len(streaming_buffer)/5:.1f} seconds")
# Check if silence has persisted long enough to consider "stopped talking"
silence_elapsed = time.time() - last_active_time
if is_silence and silence_elapsed >= SILENCE_DURATION_SEC and len(streaming_buffer) > 0:
# User has stopped talking - process the collected audio
print(f"[STREAM] Processing audio after {silence_elapsed:.2f}s of silence")
print(f"[STREAM] Processing {len(streaming_buffer)} audio chunks (~{len(streaming_buffer)/5:.1f} seconds)")
full_audio = torch.cat(streaming_buffer, dim=0)
# Log audio statistics
audio_duration = len(full_audio) / generator.sample_rate
audio_min = torch.min(full_audio).item()
audio_max = torch.max(full_audio).item()
audio_mean = torch.mean(full_audio).item()
print(f"[AUDIO] Processed audio - Duration: {audio_duration:.2f}s, Min: {audio_min:.4f}, Max: {audio_max:.4f}, Mean: {audio_mean:.4f}")
# Process with WhisperX speech-to-text
print("[ASR] Starting transcription with WhisperX...")
transcribed_text = await transcribe_audio(full_audio)
# Log the transcription
print(f"[ASR] Transcribed text: '{transcribed_text}'")
# Add to conversation context
if transcribed_text:
print(f"[DIALOG] Adding user utterance to context: '{transcribed_text}'")
user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
context_segments.append(user_segment)
# Generate a contextual response
print("[DIALOG] Generating response...")
response_text = await generate_response(transcribed_text, context_segments)
print(f"[DIALOG] Response text: '{response_text}'")
# Send the transcribed text to client
await websocket.send_json({
"type": "transcription",
"text": transcribed_text
})
# Generate audio for the response
print("[TTS] Generating speech for response...")
audio_tensor = generator.generate(
text=response_text,
speaker=1 if speaker_id == 0 else 0, # Use opposite speaker
context=context_segments,
max_audio_length_ms=10_000,
)
print(f"[TTS] Generated audio length: {len(audio_tensor)/generator.sample_rate:.2f}s")
# Add response to context
ai_segment = Segment(
text=response_text,
speaker=1 if speaker_id == 0 else 0,
audio=audio_tensor
)
context_segments.append(ai_segment)
print(f"[DIALOG] Context now has {len(context_segments)} segments")
# Convert audio to base64 and send back to client
audio_base64 = await encode_audio_data(audio_tensor)
print("[STREAM] Sending audio response to client")
await websocket.send_json({
"type": "audio_response",
"text": response_text,
"audio": audio_base64
})
else:
print("[ASR] Transcription failed or returned empty text")
# If transcription failed, send a generic response
await websocket.send_json({
"type": "error",
"message": "Sorry, I couldn't understand what you said. Could you try again?"
})
# Clear buffer and reset silence detection
streaming_buffer = []
energy_window.clear()
is_silence = False
last_active_time = time.time()
print("[STREAM] Buffer cleared, ready for next utterance")
# If buffer gets too large without silence, process it anyway
# This prevents memory issues with very long streams
elif len(streaming_buffer) >= 30: # ~6 seconds of audio at 5 chunks/sec
print("[BUFFER] Maximum buffer size reached, processing audio")
full_audio = torch.cat(streaming_buffer, dim=0)
# Process with WhisperX speech-to-text
print("[ASR] Starting forced transcription of long audio...")
transcribed_text = await transcribe_audio(full_audio)
if transcribed_text:
print(f"[ASR] Transcribed long audio: '{transcribed_text}'")
context_segments.append(Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio))
# Send the transcribed text to client
await websocket.send_json({
"type": "transcription",
"text": transcribed_text + " (processing continued speech...)"
})
else:
print("[ASR] No transcription from long audio")
streaming_buffer = []
print("[BUFFER] Buffer cleared due to size limit")
except Exception as e:
print(f"[ERROR] Processing streaming audio: {str(e)}")
# Print traceback for more detailed error information
import traceback
traceback.print_exc()
await websocket.send_json({
"type": "error",
"message": f"Error processing streaming audio: {str(e)}"
})
elif action == "stop_streaming":
is_streaming = False
if streaming_buffer and len(streaming_buffer) > 5: # Only process if there's meaningful audio
# Process any remaining audio in the buffer
full_audio = torch.cat(streaming_buffer, dim=0)
# Process with WhisperX speech-to-text
transcribed_text = await transcribe_audio(full_audio)
if transcribed_text:
context_segments.append(Segment(text=transcribed_text, speaker=request.get("speaker", 0), audio=full_audio))
# Send the transcribed text to client
await websocket.send_json({
"type": "transcription",
"text": transcribed_text
})
streaming_buffer = []
await websocket.send_json({
"type": "streaming_status",
"status": "stopped"
})
except WebSocketDisconnect:
manager.disconnect(websocket)
print("Client disconnected")
except Exception as e:
print(f"Error: {str(e)}")
try:
await websocket.send_json({
"type": "error",
"message": str(e)
})
except:
pass
manager.disconnect(websocket)
print(f"Error generating audio: {str(e)}")
emit('error', {
'type': 'error',
'message': f"Error generating audio: {str(e)}"
})
@socketio.on('add_to_context')
def handle_add_to_context(data):
client_id = request.sid
if client_id not in active_clients:
emit('error', {'message': 'Client not registered'})
return
try:
text = data.get('text', '')
speaker_id = data.get('speaker', 0)
audio_data = data.get('audio', '')
# Convert received audio to tensor
audio_tensor = decode_audio_data(audio_data)
# Add to conversation context
active_clients[client_id]['context_segments'].append(
Segment(text=text, speaker=speaker_id, audio=audio_tensor)
)
emit('context_updated', {
'type': 'context_updated',
'message': 'Audio added to context'
})
except Exception as e:
print(f"Error adding to context: {str(e)}")
emit('error', {
'type': 'error',
'message': f"Error processing audio: {str(e)}"
})
@socketio.on('clear_context')
def handle_clear_context():
client_id = request.sid
if client_id in active_clients:
active_clients[client_id]['context_segments'] = []
emit('context_updated', {
'type': 'context_updated',
'message': 'Context cleared'
})
@socketio.on('stream_audio')
def handle_stream_audio(data):
client_id = request.sid
if client_id not in active_clients:
emit('error', {'message': 'Client not registered'})
return
client = active_clients[client_id]
try:
speaker_id = data.get('speaker', 0)
audio_data = data.get('audio', '')
# Convert received audio to tensor
audio_chunk = decode_audio_data(audio_data)
# Start streaming mode if not already started
if not client['is_streaming']:
client['is_streaming'] = True
client['streaming_buffer'] = []
client['energy_window'].clear()
client['is_silence'] = False
client['last_active_time'] = time.time()
print(f"[{client_id}] Streaming started with speaker ID: {speaker_id}")
emit('streaming_status', {
'type': 'streaming_status',
'status': 'started'
})
# Calculate audio energy for silence detection
chunk_energy = torch.mean(torch.abs(audio_chunk)).item()
client['energy_window'].append(chunk_energy)
avg_energy = sum(client['energy_window']) / len(client['energy_window'])
# Check if audio is silent
current_silence = avg_energy < SILENCE_THRESHOLD
# Track silence transition
if not client['is_silence'] and current_silence:
# Transition to silence
client['is_silence'] = True
client['last_active_time'] = time.time()
elif client['is_silence'] and not current_silence:
# User started talking again
client['is_silence'] = False
# Add chunk to buffer regardless of silence state
client['streaming_buffer'].append(audio_chunk)
# Check if silence has persisted long enough to consider "stopped talking"
silence_elapsed = time.time() - client['last_active_time']
if client['is_silence'] and silence_elapsed >= SILENCE_DURATION_SEC and len(client['streaming_buffer']) > 0:
# User has stopped talking - process the collected audio
print(f"[{client_id}] Processing audio after {silence_elapsed:.2f}s of silence")
full_audio = torch.cat(client['streaming_buffer'], dim=0)
# Process with WhisperX speech-to-text
print(f"[{client_id}] Starting transcription with WhisperX...")
transcribed_text = transcribe_audio(full_audio)
# Log the transcription
print(f"[{client_id}] Transcribed text: '{transcribed_text}'")
# Add to conversation context
if transcribed_text:
user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
client['context_segments'].append(user_segment)
# Generate a contextual response
response_text = generate_response(transcribed_text, client['context_segments'])
# Send the transcribed text to client
emit('transcription', {
'type': 'transcription',
'text': transcribed_text
})
# Generate audio for the response
audio_tensor = generator.generate(
text=response_text,
speaker=1 if speaker_id == 0 else 0, # Use opposite speaker
context=client['context_segments'],
max_audio_length_ms=10_000,
)
# Add response to context
ai_segment = Segment(
text=response_text,
speaker=1 if speaker_id == 0 else 0,
audio=audio_tensor
)
client['context_segments'].append(ai_segment)
# Convert audio to base64 and send back to client
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'text': response_text,
'audio': audio_base64
})
else:
# If transcription failed, send a generic response
emit('error', {
'type': 'error',
'message': "Sorry, I couldn't understand what you said. Could you try again?"
})
# Clear buffer and reset silence detection
client['streaming_buffer'] = []
client['energy_window'].clear()
client['is_silence'] = False
client['last_active_time'] = time.time()
# If buffer gets too large without silence, process it anyway
elif len(client['streaming_buffer']) >= 30: # ~6 seconds of audio at 5 chunks/sec
full_audio = torch.cat(client['streaming_buffer'], dim=0)
# Process with WhisperX speech-to-text
transcribed_text = transcribe_audio(full_audio)
if transcribed_text:
client['context_segments'].append(
Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
)
# Send the transcribed text to client
emit('transcription', {
'type': 'transcription',
'text': transcribed_text + " (processing continued speech...)"
})
client['streaming_buffer'] = []
except Exception as e:
import traceback
traceback.print_exc()
print(f"Error processing streaming audio: {str(e)}")
emit('error', {
'type': 'error',
'message': f"Error processing streaming audio: {str(e)}"
})
@socketio.on('stop_streaming')
def handle_stop_streaming(data):
client_id = request.sid
if client_id not in active_clients:
return
client = active_clients[client_id]
client['is_streaming'] = False
if client['streaming_buffer'] and len(client['streaming_buffer']) > 5:
# Process any remaining audio in the buffer
full_audio = torch.cat(client['streaming_buffer'], dim=0)
# Process with WhisperX speech-to-text
transcribed_text = transcribe_audio(full_audio)
if transcribed_text:
client['context_segments'].append(
Segment(text=transcribed_text, speaker=data.get("speaker", 0), audio=full_audio)
)
# Send the transcribed text to client
emit('transcription', {
'type': 'transcription',
'text': transcribed_text
})
client['streaming_buffer'] = []
emit('streaming_status', {
'type': 'streaming_status',
'status': 'stopped'
})
# Server startup banner
if __name__ == "__main__":
print(f"\n{'='*60}")
print(f"🔊 Sesame AI Voice Chat Server")
print(f"🔊 Sesame AI Voice Chat Server (Flask Implementation)")
print(f"{'='*60}")
print(f"📡 Server Information:")
print(f" - Local URL: http://localhost:8000")
print(f" - Network URL: http://<your-ip-address>:8000")
print(f" - WebSocket: ws://<your-ip-address>:8000/ws")
print(f" - Local URL: http://localhost:5000")
print(f" - Network URL: http://<your-ip-address>:5000")
print(f" - WebSocket: ws://<your-ip-address>:5000/socket.io")
print(f"{'='*60}")
print(f"💡 To make this server public:")
print(f" 1. Ensure port 8000 is open in your firewall")
print(f" 2. Set up port forwarding on your router to port 8000")
print(f" 3. Or use a service like ngrok with: ngrok http 8000")
print(f" 1. Ensure port 5000 is open in your firewall")
print(f" 2. Set up port forwarding on your router to port 5000")
print(f" 3. Or use a service like ngrok with: ngrok http 5000")
print(f"{'='*60}")
print(f"🌐 Device: {device.upper()}")
print(f"🧠 Models loaded: Sesame CSM + WhisperX ({asr_model.device})")
@@ -503,5 +469,4 @@ if __name__ == "__main__":
print(f"{'='*60}")
print(f"Ready to receive connections! Press Ctrl+C to stop the server.\n")
# Start the server
uvicorn.run(app, host="0.0.0.0", port=8000)
socketio.run(app, host="0.0.0.0", port=5000, debug=False)
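
One deployment note, offered as an assumption rather than something this diff shows: with async_mode='eventlet', Flask-SocketIO expects the eventlet package to be installed, and its documentation recommends monkey-patching the standard library before other imports so blocking I/O cooperates with the event loop. A minimal startup sketch along those lines follows; the server.py layout is hypothetical and the CSM/WhisperX model loading is elided.

# Hypothetical startup skeleton; mirrors the bind used above but adds the
# eventlet monkey patch that Flask-SocketIO recommends for this async mode.
import eventlet
eventlet.monkey_patch()  # must run before the other imports below

from flask import Flask
from flask_cors import CORS
from flask_socketio import SocketIO

app = Flask(__name__)
CORS(app)
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')

if __name__ == '__main__':
    # Same bind as the commit: all interfaces, port 5000, Flask debugger off.
    socketio.run(app, host='0.0.0.0', port=5000, debug=False)

Note that monkey patching only helps with blocking I/O; the CPU/GPU-bound generation and transcription calls will still occupy the worker for their duration, which matches the one-utterance-at-a-time flow of the handlers above.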