diff --git a/Backend/index.html b/Backend/index.html
index 0e4006e..2944700 100644
--- a/Backend/index.html
+++ b/Backend/index.html
@@ -1,9 +1,13 @@
+<!-- /Backend/index.html -->
 Sesame AI Voice Chat
@@ -162,8 +204,8 @@
@@ -173,7 +215,7 @@
\ No newline at end of file
diff --git a/Backend/server.py b/Backend/server.py
index f159025..e986606 100644
--- a/Backend/server.py
+++ b/Backend/server.py
@@ -1,24 +1,20 @@
 import os
 import base64
 import json
-import asyncio
 import torch
 import torchaudio
 import numpy as np
-import io
 import whisperx
 from io import BytesIO
 from typing import List, Dict, Any, Optional
-from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
-from fastapi.responses import HTMLResponse, FileResponse
-from fastapi.staticfiles import StaticFiles
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
+from flask import Flask, request, send_from_directory, Response
+from flask_cors import CORS
+from flask_socketio import SocketIO, emit, disconnect
 from generator import load_csm_1b, Segment
-import uvicorn
 import time
 import gc
 from collections import deque
+from threading import Lock
 
 # Select device
 if torch.cuda.is_available():
@@ -36,73 +32,39 @@
 print("Loading WhisperX model...")
 asr_model = whisperx.load_model("medium", device, compute_type="float16")
 print("WhisperX model loaded!")
 
-app = FastAPI()
-
-# Add CORS middleware to allow cross-origin requests
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],  # Allow all origins in development
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
+# Silence detection parameters
+SILENCE_THRESHOLD = 0.01  # Adjust based on your audio normalization
+SILENCE_DURATION_SEC = 1.0  # How long silence must persist
 
 # Define the base directory
 base_dir = os.path.dirname(os.path.abspath(__file__))
-
-# Mount a static files directory if you have any static assets like CSS or JS
 static_dir = os.path.join(base_dir, "static")
-os.makedirs(static_dir, exist_ok=True)  # Create the directory if it doesn't exist
-app.mount("/static", StaticFiles(directory=static_dir), name="static")
+os.makedirs(static_dir, exist_ok=True)
 
-# Define route to serve index.html as the main page
-@app.get("/", response_class=HTMLResponse)
-async def get_index():
-    try:
-        with open(os.path.join(base_dir, "index.html"), "r") as f:
-            return HTMLResponse(content=f.read())
-    except FileNotFoundError:
-        return HTMLResponse(content="Error: index.html not found")
+# Setup Flask
+app = Flask(__name__)
+CORS(app)
+socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')
 
-# Add a favicon endpoint (optional, but good to have)
-@app.get("/favicon.ico")
-async def get_favicon():
-    favicon_path = os.path.join(static_dir, "favicon.ico")
-    if os.path.exists(favicon_path):
-        return FileResponse(favicon_path)
-    else:
-        return HTMLResponse(status_code=204)  # No content
-
-# Connection manager to handle multiple clients
-class ConnectionManager:
-    def __init__(self):
-        self.active_connections: List[WebSocket] = []
-
-    async def connect(self, websocket: WebSocket):
-        await websocket.accept()
-        self.active_connections.append(websocket)
-
-    def disconnect(self, websocket: WebSocket):
-        self.active_connections.remove(websocket)
-
-manager = ConnectionManager()
-
-# Silence detection parameters
-SILENCE_THRESHOLD = 0.01  # Adjust based on your audio normalization
-SILENCE_DURATION_SEC = 1.0  # How long silence must persist to be considered "stopped talking"
+# Socket connection management
+thread = None
+thread_lock = Lock()
+active_clients = {}  # Map client_id to client context
 
 # Helper function to convert audio data
-async def decode_audio_data(audio_data: str) -> torch.Tensor:
+def decode_audio_data(audio_data: str) -> torch.Tensor:
     """Decode base64 audio data to a torch tensor"""
     try:
+        # Extract the actual base64 content
+        if ',' in audio_data:
+            audio_data = audio_data.split(',')[1]
+
         # Decode base64 audio data
-        binary_data = base64.b64decode(audio_data.split(',')[1] if ',' in audio_data else audio_data)
+        binary_data = base64.b64decode(audio_data)
 
-        # Save to a temporary WAV file first
-        temp_file = BytesIO(binary_data)
-
-        # Load audio from binary data, explicitly specifying the format
-        audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")
+        # Load audio from binary data
+        with BytesIO(binary_data) as temp_file:
+            audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")
 
         # Resample if needed
         if sample_rate != generator.sample_rate:
@@ -121,7 +83,7 @@ async def decode_audio_data(audio_data: str) -> torch.Tensor:
 
         return torch.zeros(generator.sample_rate // 2)  # 0.5 seconds of silence
 
-async def encode_audio_data(audio_tensor: torch.Tensor) -> str:
+def encode_audio_data(audio_tensor: torch.Tensor) -> str:
     """Encode torch tensor audio to base64 string"""
     buf = BytesIO()
     torchaudio.save(buf, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav")
@@ -130,40 +92,36 @@
     return f"data:audio/wav;base64,{audio_base64}"
 
-async def transcribe_audio(audio_tensor: torch.Tensor) -> str:
+def transcribe_audio(audio_tensor: torch.Tensor) -> str:
     """Transcribe audio using WhisperX"""
     try:
         # Save the tensor to a temporary file
-        temp_file = BytesIO()
-        torchaudio.save(temp_file, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav")
-        temp_file.seek(0)
-
-        # Create a temporary file on disk (WhisperX requires a file path)
-        temp_path = "temp_audio.wav"
-        with open(temp_path, "wb") as f:
-            f.write(temp_file.read())
+        temp_path = os.path.join(base_dir, "temp_audio.wav")
+        torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate)
 
         # Load and transcribe the audio
         audio = whisperx.load_audio(temp_path)
        result = asr_model.transcribe(audio, batch_size=16)
 
         # Clean up
-        os.remove(temp_path)
+        if os.path.exists(temp_path):
+            os.remove(temp_path)
 
         # Get the transcription text
         if result["segments"] and len(result["segments"]) > 0:
             # Combine all segments
             transcription = " ".join([segment["text"] for segment in result["segments"]])
-            print(f"Transcription: {transcription}")
             return transcription.strip()
         else:
             return ""
     except Exception as e:
         print(f"Error in transcription: {str(e)}")
+        if os.path.exists("temp_audio.wav"):
+            os.remove("temp_audio.wav")
         return ""
 
-async def generate_response(text: str, conversation_history: List[Segment]) -> str:
+def generate_response(text: str, conversation_history: List[Segment]) -> str:
     """Generate a contextual response based on the transcribed text"""
     # Simple response logic - can be replaced with a more sophisticated LLM in the future
     responses = {
@@ -191,311 +149,319 @@ async def generate_response(text: str, conversation_history: List[Segment]) -> str:
     else:
         return f"I understand you said '{text}'. That's interesting! Can you tell me more about that?"
 
+# Flask routes for serving static content
+@app.route('/')
+def index():
+    return send_from_directory(base_dir, 'index.html')
 
-@app.websocket("/ws")
-async def websocket_endpoint(websocket: WebSocket):
-    await manager.connect(websocket)
-    context_segments = []  # Store conversation context
-    streaming_buffer = []  # Buffer for streaming audio chunks
-    is_streaming = False
+@app.route('/favicon.ico')
+def favicon():
+    if os.path.exists(os.path.join(static_dir, 'favicon.ico')):
+        return send_from_directory(static_dir, 'favicon.ico')
+    return Response(status=204)
+
+@app.route('/static/<path:path>')
+def serve_static(path):
+    return send_from_directory(static_dir, path)
+
+# Socket.IO event handlers
+@socketio.on('connect')
+def handle_connect():
+    client_id = request.sid
+    print(f"Client connected: {client_id}")
 
-    # Variables for silence detection
-    last_active_time = time.time()
-    is_silence = False
-    energy_window = deque(maxlen=10)  # For tracking recent audio energy
+    # Initialize client context
+    active_clients[client_id] = {
+        'context_segments': [],
+        'streaming_buffer': [],
+        'is_streaming': False,
+        'is_silence': False,
+        'last_active_time': time.time(),
+        'energy_window': deque(maxlen=10)
+    }
+
+    emit('status', {'type': 'connected', 'message': 'Connected to server'})
+
+@socketio.on('disconnect')
+def handle_disconnect():
+    client_id = request.sid
+    if client_id in active_clients:
+        del active_clients[client_id]
+    print(f"Client disconnected: {client_id}")
+
+@socketio.on('generate')
+def handle_generate(data):
+    client_id = request.sid
+    if client_id not in active_clients:
+        emit('error', {'message': 'Client not registered'})
+        return
 
     try:
-        while True:
-            # Receive JSON data from client
-            data = await websocket.receive_text()
-            request = json.loads(data)
-
-            action = request.get("action")
-
-            if action == "generate":
-                try:
-                    text = request.get("text", "")
-                    speaker_id = request.get("speaker", 0)
-
-                    # Generate audio response
-                    print(f"Generating audio for: '{text}' with speaker {speaker_id}")
-                    audio_tensor = generator.generate(
-                        text=text,
-                        speaker=speaker_id,
-                        context=context_segments,
-                        max_audio_length_ms=10_000,
-                    )
-
-                    # Add to conversation context
-                    context_segments.append(Segment(text=text, speaker=speaker_id, audio=audio_tensor))
-
-                    # Convert audio to base64 and send back to client
-                    audio_base64 = await encode_audio_data(audio_tensor)
-                    await websocket.send_json({
-                        "type": "audio_response",
-                        "audio": audio_base64
-                    })
-                except Exception as e:
-                    print(f"Error generating audio: {str(e)}")
-                    await websocket.send_json({
-                        "type": "error",
-                        "message": f"Error generating audio: {str(e)}"
-                    })
-
-            elif action == "add_to_context":
-                try:
-                    text = request.get("text", "")
-                    speaker_id = request.get("speaker", 0)
-                    audio_data = request.get("audio", "")
-
-                    # Convert received audio to tensor
-                    audio_tensor = await decode_audio_data(audio_data)
-
-                    # Add to conversation context
-                    context_segments.append(Segment(text=text, speaker=speaker_id, audio=audio_tensor))
-
-                    await websocket.send_json({
-                        "type": "context_updated",
-                        "message": "Audio added to context"
-                    })
-                except Exception as e:
-                    print(f"Error adding to context: {str(e)}")
-                    await websocket.send_json({
-                        "type": "error",
-                        "message": f"Error processing audio: {str(e)}"
-                    })
-
-            elif action == "clear_context":
-                context_segments = []
-                await websocket.send_json({
-                    "type": "context_updated",
-                    "message": "Context cleared"
-                })
-
-            elif action == "stream_audio":
-                try:
-                    speaker_id = request.get("speaker", 0)
-                    audio_data = request.get("audio", "")
-
-                    # Convert received audio to tensor
-                    audio_chunk = await decode_audio_data(audio_data)
-
-                    # Start streaming mode if not already started
-                    if not is_streaming:
-                        is_streaming = True
-                        streaming_buffer = []
-                        energy_window.clear()
-                        is_silence = False
-                        last_active_time = time.time()
-                        print(f"Streaming started with speaker ID: {speaker_id}")
-                        await websocket.send_json({
-                            "type": "streaming_status",
-                            "status": "started"
-                        })
-
-                    # Calculate audio energy for silence detection
-                    chunk_energy = torch.mean(torch.abs(audio_chunk)).item()
-                    energy_window.append(chunk_energy)
-                    avg_energy = sum(energy_window) / len(energy_window)
-
-                    # Debug audio levels
-                    if len(energy_window) >= 5:  # Only start printing after we have enough samples
-                        if avg_energy > SILENCE_THRESHOLD:
-                            print(f"[AUDIO] Active sound detected - Energy: {avg_energy:.6f} (threshold: {SILENCE_THRESHOLD})")
-                        else:
-                            print(f"[AUDIO] Silence detected - Energy: {avg_energy:.6f} (threshold: {SILENCE_THRESHOLD})")
-
-                    # Check if audio is silent
-                    current_silence = avg_energy < SILENCE_THRESHOLD
-
-                    # Track silence transition
-                    if not is_silence and current_silence:
-                        # Transition to silence
-                        is_silence = True
-                        last_active_time = time.time()
-                        print("[STREAM] Transition to silence detected")
-                    elif is_silence and not current_silence:
-                        # User started talking again
-                        is_silence = False
-                        print("[STREAM] User resumed speaking")
-
-                    # Add chunk to buffer regardless of silence state
-                    streaming_buffer.append(audio_chunk)
-
-                    # Debug buffer size periodically
-                    if len(streaming_buffer) % 10 == 0:
-                        print(f"[BUFFER] Current size: {len(streaming_buffer)} chunks, ~{len(streaming_buffer)/5:.1f} seconds")
-
-                    # Check if silence has persisted long enough to consider "stopped talking"
-                    silence_elapsed = time.time() - last_active_time
-
-                    if is_silence and silence_elapsed >= SILENCE_DURATION_SEC and len(streaming_buffer) > 0:
-                        # User has stopped talking - process the collected audio
-                        print(f"[STREAM] Processing audio after {silence_elapsed:.2f}s of silence")
-                        print(f"[STREAM] Processing {len(streaming_buffer)} audio chunks (~{len(streaming_buffer)/5:.1f} seconds)")
-
-                        full_audio = torch.cat(streaming_buffer, dim=0)
-
-                        # Log audio statistics
-                        audio_duration = len(full_audio) / generator.sample_rate
-                        audio_min = torch.min(full_audio).item()
-                        audio_max = torch.max(full_audio).item()
-                        audio_mean = torch.mean(full_audio).item()
-                        print(f"[AUDIO] Processed audio - Duration: {audio_duration:.2f}s, Min: {audio_min:.4f}, Max: {audio_max:.4f}, Mean: {audio_mean:.4f}")
-
-                        # Process with WhisperX speech-to-text
-                        print("[ASR] Starting transcription with WhisperX...")
WhisperX...") - transcribed_text = await transcribe_audio(full_audio) - - # Log the transcription - print(f"[ASR] Transcribed text: '{transcribed_text}'") - - # Add to conversation context - if transcribed_text: - print(f"[DIALOG] Adding user utterance to context: '{transcribed_text}'") - user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) - context_segments.append(user_segment) - - # Generate a contextual response - print("[DIALOG] Generating response...") - response_text = await generate_response(transcribed_text, context_segments) - print(f"[DIALOG] Response text: '{response_text}'") - - # Send the transcribed text to client - await websocket.send_json({ - "type": "transcription", - "text": transcribed_text - }) - - # Generate audio for the response - print("[TTS] Generating speech for response...") - audio_tensor = generator.generate( - text=response_text, - speaker=1 if speaker_id == 0 else 0, # Use opposite speaker - context=context_segments, - max_audio_length_ms=10_000, - ) - print(f"[TTS] Generated audio length: {len(audio_tensor)/generator.sample_rate:.2f}s") - - # Add response to context - ai_segment = Segment( - text=response_text, - speaker=1 if speaker_id == 0 else 0, - audio=audio_tensor - ) - context_segments.append(ai_segment) - print(f"[DIALOG] Context now has {len(context_segments)} segments") - - # Convert audio to base64 and send back to client - audio_base64 = await encode_audio_data(audio_tensor) - print("[STREAM] Sending audio response to client") - await websocket.send_json({ - "type": "audio_response", - "text": response_text, - "audio": audio_base64 - }) - else: - print("[ASR] Transcription failed or returned empty text") - # If transcription failed, send a generic response - await websocket.send_json({ - "type": "error", - "message": "Sorry, I couldn't understand what you said. Could you try again?" 
- }) - - # Clear buffer and reset silence detection - streaming_buffer = [] - energy_window.clear() - is_silence = False - last_active_time = time.time() - print("[STREAM] Buffer cleared, ready for next utterance") - - # If buffer gets too large without silence, process it anyway - # This prevents memory issues with very long streams - elif len(streaming_buffer) >= 30: # ~6 seconds of audio at 5 chunks/sec - print("[BUFFER] Maximum buffer size reached, processing audio") - full_audio = torch.cat(streaming_buffer, dim=0) - - # Process with WhisperX speech-to-text - print("[ASR] Starting forced transcription of long audio...") - transcribed_text = await transcribe_audio(full_audio) - - if transcribed_text: - print(f"[ASR] Transcribed long audio: '{transcribed_text}'") - context_segments.append(Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)) - - # Send the transcribed text to client - await websocket.send_json({ - "type": "transcription", - "text": transcribed_text + " (processing continued speech...)" - }) - else: - print("[ASR] No transcription from long audio") - - streaming_buffer = [] - print("[BUFFER] Buffer cleared due to size limit") - - except Exception as e: - print(f"[ERROR] Processing streaming audio: {str(e)}") - # Print traceback for more detailed error information - import traceback - traceback.print_exc() - await websocket.send_json({ - "type": "error", - "message": f"Error processing streaming audio: {str(e)}" - }) - - elif action == "stop_streaming": - is_streaming = False - if streaming_buffer and len(streaming_buffer) > 5: # Only process if there's meaningful audio - # Process any remaining audio in the buffer - full_audio = torch.cat(streaming_buffer, dim=0) - - # Process with WhisperX speech-to-text - transcribed_text = await transcribe_audio(full_audio) - - if transcribed_text: - context_segments.append(Segment(text=transcribed_text, speaker=request.get("speaker", 0), audio=full_audio)) - - # Send the transcribed text to client - await websocket.send_json({ - "type": "transcription", - "text": transcribed_text - }) - - streaming_buffer = [] - await websocket.send_json({ - "type": "streaming_status", - "status": "stopped" - }) - - except WebSocketDisconnect: - manager.disconnect(websocket) - print("Client disconnected") + text = data.get('text', '') + speaker_id = data.get('speaker', 0) + + print(f"Generating audio for: '{text}' with speaker {speaker_id}") + + # Generate audio response + audio_tensor = generator.generate( + text=text, + speaker=speaker_id, + context=active_clients[client_id]['context_segments'], + max_audio_length_ms=10_000, + ) + + # Add to conversation context + active_clients[client_id]['context_segments'].append( + Segment(text=text, speaker=speaker_id, audio=audio_tensor) + ) + + # Convert audio to base64 and send back to client + audio_base64 = encode_audio_data(audio_tensor) + emit('audio_response', { + 'type': 'audio_response', + 'audio': audio_base64 + }) + except Exception as e: - print(f"Error: {str(e)}") - try: - await websocket.send_json({ - "type": "error", - "message": str(e) - }) - except: - pass - manager.disconnect(websocket) + print(f"Error generating audio: {str(e)}") + emit('error', { + 'type': 'error', + 'message': f"Error generating audio: {str(e)}" + }) + +@socketio.on('add_to_context') +def handle_add_to_context(data): + client_id = request.sid + if client_id not in active_clients: + emit('error', {'message': 'Client not registered'}) + return + + try: + text = data.get('text', '') + speaker_id = 
+        audio_data = data.get('audio', '')
+
+        # Convert received audio to tensor
+        audio_tensor = decode_audio_data(audio_data)
+
+        # Add to conversation context
+        active_clients[client_id]['context_segments'].append(
+            Segment(text=text, speaker=speaker_id, audio=audio_tensor)
+        )
+
+        emit('context_updated', {
+            'type': 'context_updated',
+            'message': 'Audio added to context'
+        })
+
+    except Exception as e:
+        print(f"Error adding to context: {str(e)}")
+        emit('error', {
+            'type': 'error',
+            'message': f"Error processing audio: {str(e)}"
+        })
+
+@socketio.on('clear_context')
+def handle_clear_context():
+    client_id = request.sid
+    if client_id in active_clients:
+        active_clients[client_id]['context_segments'] = []
+
+    emit('context_updated', {
+        'type': 'context_updated',
+        'message': 'Context cleared'
+    })
+
+@socketio.on('stream_audio')
+def handle_stream_audio(data):
+    client_id = request.sid
+    if client_id not in active_clients:
+        emit('error', {'message': 'Client not registered'})
+        return
+
+    client = active_clients[client_id]
+
+    try:
+        speaker_id = data.get('speaker', 0)
+        audio_data = data.get('audio', '')
+
+        # Convert received audio to tensor
+        audio_chunk = decode_audio_data(audio_data)
+
+        # Start streaming mode if not already started
+        if not client['is_streaming']:
+            client['is_streaming'] = True
+            client['streaming_buffer'] = []
+            client['energy_window'].clear()
+            client['is_silence'] = False
+            client['last_active_time'] = time.time()
+            print(f"[{client_id}] Streaming started with speaker ID: {speaker_id}")
+            emit('streaming_status', {
+                'type': 'streaming_status',
+                'status': 'started'
+            })
+
+        # Calculate audio energy for silence detection
+        chunk_energy = torch.mean(torch.abs(audio_chunk)).item()
+        client['energy_window'].append(chunk_energy)
+        avg_energy = sum(client['energy_window']) / len(client['energy_window'])
+
+        # Check if audio is silent
+        current_silence = avg_energy < SILENCE_THRESHOLD
+
+        # Track silence transition
+        if not client['is_silence'] and current_silence:
+            # Transition to silence
+            client['is_silence'] = True
+            client['last_active_time'] = time.time()
+        elif client['is_silence'] and not current_silence:
+            # User started talking again
+            client['is_silence'] = False
+
+        # Add chunk to buffer regardless of silence state
+        client['streaming_buffer'].append(audio_chunk)
+
+        # Check if silence has persisted long enough to consider "stopped talking"
+        silence_elapsed = time.time() - client['last_active_time']
+
+        if client['is_silence'] and silence_elapsed >= SILENCE_DURATION_SEC and len(client['streaming_buffer']) > 0:
+            # User has stopped talking - process the collected audio
+            print(f"[{client_id}] Processing audio after {silence_elapsed:.2f}s of silence")
+
+            full_audio = torch.cat(client['streaming_buffer'], dim=0)
+
+            # Process with WhisperX speech-to-text
+            print(f"[{client_id}] Starting transcription with WhisperX...")
+            transcribed_text = transcribe_audio(full_audio)
+
+            # Log the transcription
+            print(f"[{client_id}] Transcribed text: '{transcribed_text}'")
+
+            # Add to conversation context
+            if transcribed_text:
+                user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
+                client['context_segments'].append(user_segment)
+
+                # Generate a contextual response
+                response_text = generate_response(transcribed_text, client['context_segments'])
+
+                # Send the transcribed text to client
+                emit('transcription', {
+                    'type': 'transcription',
+                    'text': transcribed_text
+                })
+
+                # Generate audio for the response
+                audio_tensor = generator.generate(
+                    text=response_text,
+                    speaker=1 if speaker_id == 0 else 0,  # Use opposite speaker
+                    context=client['context_segments'],
+                    max_audio_length_ms=10_000,
+                )
+
+                # Add response to context
+                ai_segment = Segment(
+                    text=response_text,
+                    speaker=1 if speaker_id == 0 else 0,
+                    audio=audio_tensor
+                )
+                client['context_segments'].append(ai_segment)
+
+                # Convert audio to base64 and send back to client
+                audio_base64 = encode_audio_data(audio_tensor)
+                emit('audio_response', {
+                    'type': 'audio_response',
+                    'text': response_text,
+                    'audio': audio_base64
+                })
+            else:
+                # If transcription failed, send a generic response
+                emit('error', {
+                    'type': 'error',
+                    'message': "Sorry, I couldn't understand what you said. Could you try again?"
+                })
+
+            # Clear buffer and reset silence detection
+            client['streaming_buffer'] = []
+            client['energy_window'].clear()
+            client['is_silence'] = False
+            client['last_active_time'] = time.time()
+
+        # If buffer gets too large without silence, process it anyway
+        elif len(client['streaming_buffer']) >= 30:  # ~6 seconds of audio at 5 chunks/sec
+            full_audio = torch.cat(client['streaming_buffer'], dim=0)
+
+            # Process with WhisperX speech-to-text
+            transcribed_text = transcribe_audio(full_audio)
+
+            if transcribed_text:
+                client['context_segments'].append(
+                    Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
+                )
+
+                # Send the transcribed text to client
+                emit('transcription', {
+                    'type': 'transcription',
+                    'text': transcribed_text + " (processing continued speech...)"
+                })
+
+            client['streaming_buffer'] = []
+
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        print(f"Error processing streaming audio: {str(e)}")
+        emit('error', {
+            'type': 'error',
+            'message': f"Error processing streaming audio: {str(e)}"
+        })
+
+@socketio.on('stop_streaming')
+def handle_stop_streaming(data):
+    client_id = request.sid
+    if client_id not in active_clients:
+        return
+
+    client = active_clients[client_id]
+    client['is_streaming'] = False
+
+    if client['streaming_buffer'] and len(client['streaming_buffer']) > 5:
+        # Process any remaining audio in the buffer
+        full_audio = torch.cat(client['streaming_buffer'], dim=0)
+
+        # Process with WhisperX speech-to-text
+        transcribed_text = transcribe_audio(full_audio)
+
+        if transcribed_text:
+            client['context_segments'].append(
+                Segment(text=transcribed_text, speaker=data.get("speaker", 0), audio=full_audio)
+            )
+
+            # Send the transcribed text to client
+            emit('transcription', {
+                'type': 'transcription',
+                'text': transcribed_text
+            })
+
+    client['streaming_buffer'] = []
+    emit('streaming_status', {
+        'type': 'streaming_status',
+        'status': 'stopped'
+    })
 
-# Update the __main__ block with a comprehensive server startup message
 if __name__ == "__main__":
     print(f"\n{'='*60}")
-    print(f"🔊 Sesame AI Voice Chat Server")
+    print(f"🔊 Sesame AI Voice Chat Server (Flask Implementation)")
     print(f"{'='*60}")
     print(f"📡 Server Information:")
-    print(f"   - Local URL: http://localhost:8000")
-    print(f"   - Network URL: http://<your-ip>:8000")
-    print(f"   - WebSocket: ws://<your-ip>:8000/ws")
+    print(f"   - Local URL: http://localhost:5000")
+    print(f"   - Network URL: http://<your-ip>:5000")
+    print(f"   - WebSocket: ws://<your-ip>:5000/socket.io")
     print(f"{'='*60}")
     print(f"💡 To make this server public:")
-    print(f"   1. Ensure port 8000 is open in your firewall")
-    print(f"   2. Set up port forwarding on your router to port 8000")
-    print(f"   3. Or use a service like ngrok with: ngrok http 8000")
+    print(f"   1. Ensure port 5000 is open in your firewall")
+    print(f"   2. Set up port forwarding on your router to port 5000")
+    print(f"   3. Or use a service like ngrok with: ngrok http 5000")
     print(f"{'='*60}")
     print(f"🌐 Device: {device.upper()}")
     print(f"🧠 Models loaded: Sesame CSM + WhisperX ({asr_model.device})")
@@ -503,5 +469,4 @@ if __name__ == "__main__":
     print(f"{'='*60}")
     print(f"Ready to receive connections! Press Ctrl+C to stop the server.\n")
 
-    # Start the server
-    uvicorn.run(app, host="0.0.0.0", port=8000)
\ No newline at end of file
+    socketio.run(app, host="0.0.0.0", port=5000, debug=False)
\ No newline at end of file
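
A minimal smoke-test client for the new Socket.IO events (a sketch, not part of the patch; it assumes the python-socketio client package, the default port 5000, and a hypothetical output file response.wav):

# pip install "python-socketio[client]"
import base64

import socketio

sio = socketio.Client()

@sio.on('audio_response')
def on_audio_response(data):
    # data['audio'] is the "data:audio/wav;base64,..." URI built by encode_audio_data()
    wav_bytes = base64.b64decode(data['audio'].split(',', 1)[1])
    with open('response.wav', 'wb') as f:
        f.write(wav_bytes)
    print(f"Saved {len(wav_bytes)} bytes of audio; text: {data.get('text', '')}")

@sio.on('error')
def on_error(data):
    print('Server error:', data.get('message'))

sio.connect('http://localhost:5000')
# Ask the server to synthesize one utterance with speaker 0, mirroring the 'generate' handler's payload.
sio.emit('generate', {'text': 'Hello from the test client', 'speaker': 0})
sio.wait()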