From 06fa7936a3d3a0b918a0ad0c8b98a8959f643db1 Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sat, 29 Mar 2025 22:06:00 -0400 Subject: [PATCH 01/16] Backend Server Update --- .gitignore | 1 + Backend/index.html | 461 ++++++++++++++++++++------------------------- Backend/server.py | 182 +++++++++++++++--- 3 files changed, 360 insertions(+), 284 deletions(-) diff --git a/.gitignore b/.gitignore index 1170717..e06d006 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,4 @@ dist .yarn/build-state.yml .yarn/install-state.gz .pnp.* +Backend/test.py diff --git a/Backend/index.html b/Backend/index.html index 309364f..f4ff6a0 100644 --- a/Backend/index.html +++ b/Backend/index.html @@ -10,60 +10,113 @@ max-width: 800px; margin: 0 auto; padding: 20px; + background-color: #f9f9f9; } .conversation { - border: 1px solid #ccc; - border-radius: 8px; - padding: 15px; - height: 300px; + border: 1px solid #ddd; + border-radius: 12px; + padding: 20px; + height: 400px; overflow-y: auto; - margin-bottom: 15px; + margin-bottom: 20px; + background-color: white; + box-shadow: 0 2px 10px rgba(0,0,0,0.05); } .message { - margin-bottom: 10px; - padding: 8px; - border-radius: 8px; + margin-bottom: 15px; + padding: 12px; + border-radius: 12px; + max-width: 80%; + line-height: 1.4; } .user { background-color: #e3f2fd; text-align: right; + margin-left: auto; + border-bottom-right-radius: 4px; } .ai { background-color: #f1f1f1; + margin-right: auto; + border-bottom-left-radius: 4px; + } + .system { + background-color: #f8f9fa; + font-style: italic; + text-align: center; + font-size: 0.9em; + color: #666; + padding: 8px; + margin: 10px auto; + max-width: 90%; } .controls { display: flex; - flex-direction: column; - gap: 10px; - } - .input-row { - display: flex; - gap: 10px; - } - input[type="text"] { - flex-grow: 1; - padding: 8px; - border-radius: 4px; - border: 1px solid #ccc; + gap: 15px; + justify-content: center; + align-items: center; } button { - padding: 8px 16px; - border-radius: 4px; + padding: 12px 24px; + border-radius: 24px; border: none; background-color: #4CAF50; color: white; cursor: pointer; + font-weight: bold; + transition: all 0.2s ease; + box-shadow: 0 2px 5px rgba(0,0,0,0.1); } button:hover { background-color: #45a049; + box-shadow: 0 4px 8px rgba(0,0,0,0.15); } .recording { background-color: #f44336; + animation: pulse 1.5s infinite; + } + .processing { + background-color: #FFA500; } select { - padding: 8px; - border-radius: 4px; - border: 1px solid #ccc; + padding: 10px; + border-radius: 24px; + border: 1px solid #ddd; + background-color: white; + } + .transcript { + font-style: italic; + color: #666; + margin-top: 5px; + } + @keyframes pulse { + 0% { opacity: 1; } + 50% { opacity: 0.7; } + 100% { opacity: 1; } + } + .status-indicator { + display: flex; + align-items: center; + justify-content: center; + margin-top: 10px; + gap: 5px; + } + .status-dot { + width: 10px; + height: 10px; + border-radius: 50%; + background-color: #ccc; + } + .status-dot.active { + background-color: #4CAF50; + } + .status-text { + font-size: 0.9em; + color: #666; + } + audio { + width: 100%; + margin-top: 5px; } @@ -72,30 +125,25 @@
-        [old controls markup (text input, send and record buttons); tags not recoverable]
+        [new controls markup (speaker select, record button, status indicator); tags not recoverable]
+        Not connected
\ No newline at end of file diff --git a/Backend/server.py b/Backend/server.py index bfdc590..b9736b5 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -5,6 +5,8 @@ import asyncio import torch import torchaudio import numpy as np +import io +import whisperx from io import BytesIO from typing import List, Dict, Any, Optional from fastapi import FastAPI, WebSocket, WebSocketDisconnect @@ -13,6 +15,7 @@ from pydantic import BaseModel from generator import load_csm_1b, Segment import uvicorn import time +import gc from collections import deque # Select device @@ -25,6 +28,12 @@ print(f"Using device: {device}") # Initialize the model generator = load_csm_1b(device=device) +# Initialize WhisperX for ASR +print("Loading WhisperX model...") +# Use a smaller model for faster response times +asr_model = whisperx.load_model("medium", device, compute_type="float16") +print("WhisperX model loaded!") + app = FastAPI() # Add CORS middleware to allow cross-origin requests @@ -93,6 +102,68 @@ async def encode_audio_data(audio_tensor: torch.Tensor) -> str: return f"data:audio/wav;base64,{audio_base64}" +async def transcribe_audio(audio_tensor: torch.Tensor) -> str: + """Transcribe audio using WhisperX""" + try: + # Save the tensor to a temporary file + temp_file = BytesIO() + torchaudio.save(temp_file, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav") + temp_file.seek(0) + + # Create a temporary file on disk (WhisperX requires a file path) + temp_path = "temp_audio.wav" + with open(temp_path, "wb") as f: + f.write(temp_file.read()) + + # Load and transcribe the audio + audio = whisperx.load_audio(temp_path) + result = asr_model.transcribe(audio, batch_size=16) + + # Clean up + os.remove(temp_path) + + # Get the transcription text + if result["segments"] and len(result["segments"]) > 0: + # Combine all segments + transcription = " ".join([segment["text"] for segment in result["segments"]]) + print(f"Transcription: {transcription}") + return transcription.strip() + else: + return "" + except Exception as e: + print(f"Error in transcription: {str(e)}") + return "" + + +async def generate_response(text: str, conversation_history: List[Segment]) -> str: + """Generate a contextual response based on the transcribed text""" + # Simple response logic - can be replaced with a more sophisticated LLM in the future + responses = { + "hello": "Hello there! How are you doing today?", + "how are you": "I'm doing well, thanks for asking! How about you?", + "what is your name": "I'm Sesame, your voice assistant. How can I help you?", + "bye": "Goodbye! It was nice chatting with you.", + "thank you": "You're welcome! Is there anything else I can help with?", + "weather": "I don't have real-time weather data, but I hope it's nice where you are!", + "help": "I can chat with you using natural voice. Just speak normally and I'll respond.", + } + + text_lower = text.lower() + + # Check for matching keywords + for key, response in responses.items(): + if key in text_lower: + return response + + # Default responses based on text length + if not text: + return "I didn't catch that. Could you please repeat?" + elif len(text) < 10: + return "Thanks for your message. Could you elaborate a bit more?" + else: + return f"I understand you said '{text}'. That's interesting! Can you tell me more about that?" 
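Note on the block above: generate_response() is a plain keyword matcher, and its own comment anticipates swapping in a real language model later. A minimal sketch of what that swap could look like, reusing the existing Segment history; llm_complete() is a hypothetical helper standing in for whatever LLM client gets chosen, and nothing below is part of the patch:

    async def generate_llm_response(text: str, conversation_history: List[Segment]) -> str:
        """Illustrative LLM-backed alternative to generate_response() (not part of the patch)."""
        # Flatten the last few turns into a plain-text prompt using Segment.text / Segment.speaker
        history = "\n".join(f"Speaker {seg.speaker}: {seg.text}" for seg in conversation_history[-6:])
        prompt = f"{history}\nSpeaker 0: {text}\nSpeaker 1:"
        try:
            return llm_complete(prompt)  # hypothetical LLM call; replace with a real client
        except Exception:
            # Fall back to the keyword matcher above if the model is unavailable
            return await generate_response(text, conversation_history)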
+ + @app.websocket("/ws") async def websocket_endpoint(websocket: WebSocket): await manager.connect(websocket) @@ -220,30 +291,55 @@ async def websocket_endpoint(websocket: WebSocket): # User has stopped talking - process the collected audio full_audio = torch.cat(streaming_buffer, dim=0) - # Process with speech-to-text (you would need to implement this) - # For now, just use a placeholder text - text = f"User audio from speaker {speaker_id}" + # Process with WhisperX speech-to-text + transcribed_text = await transcribe_audio(full_audio) - print(f"Detected end of speech, processing {len(streaming_buffer)} chunks") + # Log the transcription + print(f"Transcribed text: '{transcribed_text}'") # Add to conversation context - context_segments.append(Segment(text=text, speaker=speaker_id, audio=full_audio)) - - # Generate response - response_text = "This is a response to what you just said" - audio_tensor = generator.generate( - text=response_text, - speaker=1 if speaker_id == 0 else 0, # Use opposite speaker - context=context_segments, - max_audio_length_ms=10_000, - ) - - # Convert audio to base64 and send back to client - audio_base64 = await encode_audio_data(audio_tensor) - await websocket.send_json({ - "type": "audio_response", - "audio": audio_base64 - }) + if transcribed_text: + user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) + context_segments.append(user_segment) + + # Generate a contextual response + response_text = await generate_response(transcribed_text, context_segments) + + # Send the transcribed text to client + await websocket.send_json({ + "type": "transcription", + "text": transcribed_text + }) + + # Generate audio for the response + audio_tensor = generator.generate( + text=response_text, + speaker=1 if speaker_id == 0 else 0, # Use opposite speaker + context=context_segments, + max_audio_length_ms=10_000, + ) + + # Add response to context + ai_segment = Segment( + text=response_text, + speaker=1 if speaker_id == 0 else 0, + audio=audio_tensor + ) + context_segments.append(ai_segment) + + # Convert audio to base64 and send back to client + audio_base64 = await encode_audio_data(audio_tensor) + await websocket.send_json({ + "type": "audio_response", + "text": response_text, + "audio": audio_base64 + }) + else: + # If transcription failed, send a generic response + await websocket.send_json({ + "type": "error", + "message": "Sorry, I couldn't understand what you said. Could you try again?" 
+ }) # Clear buffer and reset silence detection streaming_buffer = [] @@ -256,8 +352,19 @@ async def websocket_endpoint(websocket: WebSocket): elif len(streaming_buffer) >= 30: # ~6 seconds of audio at 5 chunks/sec print("Buffer limit reached, processing audio") full_audio = torch.cat(streaming_buffer, dim=0) - text = f"Continued speech from speaker {speaker_id}" - context_segments.append(Segment(text=text, speaker=speaker_id, audio=full_audio)) + + # Process with WhisperX speech-to-text + transcribed_text = await transcribe_audio(full_audio) + + if transcribed_text: + context_segments.append(Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)) + + # Send the transcribed text to client + await websocket.send_json({ + "type": "transcription", + "text": transcribed_text + " (processing continued speech...)" + }) + streaming_buffer = [] except Exception as e: @@ -269,11 +376,21 @@ async def websocket_endpoint(websocket: WebSocket): elif action == "stop_streaming": is_streaming = False - if streaming_buffer: + if streaming_buffer and len(streaming_buffer) > 5: # Only process if there's meaningful audio # Process any remaining audio in the buffer full_audio = torch.cat(streaming_buffer, dim=0) - text = f"Final streaming audio from speaker {request.get('speaker', 0)}" - context_segments.append(Segment(text=text, speaker=request.get("speaker", 0), audio=full_audio)) + + # Process with WhisperX speech-to-text + transcribed_text = await transcribe_audio(full_audio) + + if transcribed_text: + context_segments.append(Segment(text=transcribed_text, speaker=request.get("speaker", 0), audio=full_audio)) + + # Send the transcribed text to client + await websocket.send_json({ + "type": "transcription", + "text": transcribed_text + }) streaming_buffer = [] await websocket.send_json({ @@ -286,12 +403,15 @@ async def websocket_endpoint(websocket: WebSocket): print("Client disconnected") except Exception as e: print(f"Error: {str(e)}") - await websocket.send_json({ - "type": "error", - "message": str(e) - }) + try: + await websocket.send_json({ + "type": "error", + "message": str(e) + }) + except: + pass manager.disconnect(websocket) if __name__ == "__main__": - uvicorn.run(app, host="localhost", port=8000) \ No newline at end of file + uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file From fd1ac0a0d73ea4fc7db66dad7f0ba0584c7d9baa Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sat, 29 Mar 2025 22:14:45 -0400 Subject: [PATCH 02/16] Client side Voice Visualizer --- Backend/index.html | 141 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 137 insertions(+), 4 deletions(-) diff --git a/Backend/index.html b/Backend/index.html index f4ff6a0..7ab431f 100644 --- a/Backend/index.html +++ b/Backend/index.html @@ -118,12 +118,45 @@ width: 100%; margin-top: 5px; } + .visualizer-container { + width: 100%; + height: 60px; + background-color: #f5f5f5; + border-radius: 12px; + margin-bottom: 15px; + overflow: hidden; + position: relative; + } + + .audio-visualizer { + width: 100%; + height: 100%; + display: block; + } + + .visualizer-label { + position: absolute; + top: 50%; + left: 50%; + transform: translate(-50%, -50%); + color: #999; + font-size: 0.9em; + pointer-events: none; + opacity: 0.7; + text-align: center; + width: 100%; + }

     Sesame AI Voice Chat
+        [visualizer container added: canvas element and overlay label; tags not recoverable]
+        Audio levels will appear here when speaking
-        [two old control lines; tags not recoverable]
+        [two updated control lines; tags not recoverable]
@@ -173,7 +215,7 @@ \ No newline at end of file diff --git a/Backend/server.py b/Backend/server.py index f159025..e986606 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -1,24 +1,20 @@ import os import base64 import json -import asyncio import torch import torchaudio import numpy as np -import io import whisperx from io import BytesIO from typing import List, Dict, Any, Optional -from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request -from fastapi.responses import HTMLResponse, FileResponse -from fastapi.staticfiles import StaticFiles -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel +from flask import Flask, request, send_from_directory, Response +from flask_cors import CORS +from flask_socketio import SocketIO, emit, disconnect from generator import load_csm_1b, Segment -import uvicorn import time import gc from collections import deque +from threading import Lock # Select device if torch.cuda.is_available(): @@ -36,73 +32,39 @@ print("Loading WhisperX model...") asr_model = whisperx.load_model("medium", device, compute_type="float16") print("WhisperX model loaded!") -app = FastAPI() - -# Add CORS middleware to allow cross-origin requests -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], # Allow all origins in development - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) +# Silence detection parameters +SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization +SILENCE_DURATION_SEC = 1.0 # How long silence must persist # Define the base directory base_dir = os.path.dirname(os.path.abspath(__file__)) - -# Mount a static files directory if you have any static assets like CSS or JS static_dir = os.path.join(base_dir, "static") -os.makedirs(static_dir, exist_ok=True) # Create the directory if it doesn't exist -app.mount("/static", StaticFiles(directory=static_dir), name="static") +os.makedirs(static_dir, exist_ok=True) -# Define route to serve index.html as the main page -@app.get("/", response_class=HTMLResponse) -async def get_index(): - try: - with open(os.path.join(base_dir, "index.html"), "r") as f: - return HTMLResponse(content=f.read()) - except FileNotFoundError: - return HTMLResponse(content="

Error: index.html not found
") +# Setup Flask +app = Flask(__name__) +CORS(app) +socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet') -# Add a favicon endpoint (optional, but good to have) -@app.get("/favicon.ico") -async def get_favicon(): - favicon_path = os.path.join(static_dir, "favicon.ico") - if os.path.exists(favicon_path): - return FileResponse(favicon_path) - else: - return HTMLResponse(status_code=204) # No content - -# Connection manager to handle multiple clients -class ConnectionManager: - def __init__(self): - self.active_connections: List[WebSocket] = [] - - async def connect(self, websocket: WebSocket): - await websocket.accept() - self.active_connections.append(websocket) - - def disconnect(self, websocket: WebSocket): - self.active_connections.remove(websocket) - -manager = ConnectionManager() - -# Silence detection parameters -SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization -SILENCE_DURATION_SEC = 1.0 # How long silence must persist to be considered "stopped talking" +# Socket connection management +thread = None +thread_lock = Lock() +active_clients = {} # Map client_id to client context # Helper function to convert audio data -async def decode_audio_data(audio_data: str) -> torch.Tensor: +def decode_audio_data(audio_data: str) -> torch.Tensor: """Decode base64 audio data to a torch tensor""" try: + # Extract the actual base64 content + if ',' in audio_data: + audio_data = audio_data.split(',')[1] + # Decode base64 audio data - binary_data = base64.b64decode(audio_data.split(',')[1] if ',' in audio_data else audio_data) + binary_data = base64.b64decode(audio_data) - # Save to a temporary WAV file first - temp_file = BytesIO(binary_data) - - # Load audio from binary data, explicitly specifying the format - audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav") + # Load audio from binary data + with BytesIO(binary_data) as temp_file: + audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav") # Resample if needed if sample_rate != generator.sample_rate: @@ -121,7 +83,7 @@ async def decode_audio_data(audio_data: str) -> torch.Tensor: return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence -async def encode_audio_data(audio_tensor: torch.Tensor) -> str: +def encode_audio_data(audio_tensor: torch.Tensor) -> str: """Encode torch tensor audio to base64 string""" buf = BytesIO() torchaudio.save(buf, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav") @@ -130,40 +92,36 @@ async def encode_audio_data(audio_tensor: torch.Tensor) -> str: return f"data:audio/wav;base64,{audio_base64}" -async def transcribe_audio(audio_tensor: torch.Tensor) -> str: +def transcribe_audio(audio_tensor: torch.Tensor) -> str: """Transcribe audio using WhisperX""" try: # Save the tensor to a temporary file - temp_file = BytesIO() - torchaudio.save(temp_file, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav") - temp_file.seek(0) - - # Create a temporary file on disk (WhisperX requires a file path) - temp_path = "temp_audio.wav" - with open(temp_path, "wb") as f: - f.write(temp_file.read()) + temp_path = os.path.join(base_dir, "temp_audio.wav") + torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate) # Load and transcribe the audio audio = whisperx.load_audio(temp_path) result = asr_model.transcribe(audio, batch_size=16) # Clean up - os.remove(temp_path) + if os.path.exists(temp_path): + os.remove(temp_path) # Get the transcription text if result["segments"] and len(result["segments"]) > 0: 
# Combine all segments transcription = " ".join([segment["text"] for segment in result["segments"]]) - print(f"Transcription: {transcription}") return transcription.strip() else: return "" except Exception as e: print(f"Error in transcription: {str(e)}") + if os.path.exists("temp_audio.wav"): + os.remove("temp_audio.wav") return "" -async def generate_response(text: str, conversation_history: List[Segment]) -> str: +def generate_response(text: str, conversation_history: List[Segment]) -> str: """Generate a contextual response based on the transcribed text""" # Simple response logic - can be replaced with a more sophisticated LLM in the future responses = { @@ -191,311 +149,319 @@ async def generate_response(text: str, conversation_history: List[Segment]) -> s else: return f"I understand you said '{text}'. That's interesting! Can you tell me more about that?" +# Flask routes for serving static content +@app.route('/') +def index(): + return send_from_directory(base_dir, 'index.html') -@app.websocket("/ws") -async def websocket_endpoint(websocket: WebSocket): - await manager.connect(websocket) - context_segments = [] # Store conversation context - streaming_buffer = [] # Buffer for streaming audio chunks - is_streaming = False +@app.route('/favicon.ico') +def favicon(): + if os.path.exists(os.path.join(static_dir, 'favicon.ico')): + return send_from_directory(static_dir, 'favicon.ico') + return Response(status=204) + +@app.route('/static/') +def serve_static(path): + return send_from_directory(static_dir, path) + +# Socket.IO event handlers +@socketio.on('connect') +def handle_connect(): + client_id = request.sid + print(f"Client connected: {client_id}") - # Variables for silence detection - last_active_time = time.time() - is_silence = False - energy_window = deque(maxlen=10) # For tracking recent audio energy + # Initialize client context + active_clients[client_id] = { + 'context_segments': [], + 'streaming_buffer': [], + 'is_streaming': False, + 'is_silence': False, + 'last_active_time': time.time(), + 'energy_window': deque(maxlen=10) + } + + emit('status', {'type': 'connected', 'message': 'Connected to server'}) + +@socketio.on('disconnect') +def handle_disconnect(): + client_id = request.sid + if client_id in active_clients: + del active_clients[client_id] + print(f"Client disconnected: {client_id}") + +@socketio.on('generate') +def handle_generate(data): + client_id = request.sid + if client_id not in active_clients: + emit('error', {'message': 'Client not registered'}) + return try: - while True: - # Receive JSON data from client - data = await websocket.receive_text() - request = json.loads(data) - - action = request.get("action") - - if action == "generate": - try: - text = request.get("text", "") - speaker_id = request.get("speaker", 0) - - # Generate audio response - print(f"Generating audio for: '{text}' with speaker {speaker_id}") - audio_tensor = generator.generate( - text=text, - speaker=speaker_id, - context=context_segments, - max_audio_length_ms=10_000, - ) - - # Add to conversation context - context_segments.append(Segment(text=text, speaker=speaker_id, audio=audio_tensor)) - - # Convert audio to base64 and send back to client - audio_base64 = await encode_audio_data(audio_tensor) - await websocket.send_json({ - "type": "audio_response", - "audio": audio_base64 - }) - except Exception as e: - print(f"Error generating audio: {str(e)}") - await websocket.send_json({ - "type": "error", - "message": f"Error generating audio: {str(e)}" - }) - - elif action == 
"add_to_context": - try: - text = request.get("text", "") - speaker_id = request.get("speaker", 0) - audio_data = request.get("audio", "") - - # Convert received audio to tensor - audio_tensor = await decode_audio_data(audio_data) - - # Add to conversation context - context_segments.append(Segment(text=text, speaker=speaker_id, audio=audio_tensor)) - - await websocket.send_json({ - "type": "context_updated", - "message": "Audio added to context" - }) - except Exception as e: - print(f"Error adding to context: {str(e)}") - await websocket.send_json({ - "type": "error", - "message": f"Error processing audio: {str(e)}" - }) - - elif action == "clear_context": - context_segments = [] - await websocket.send_json({ - "type": "context_updated", - "message": "Context cleared" - }) - - elif action == "stream_audio": - try: - speaker_id = request.get("speaker", 0) - audio_data = request.get("audio", "") - - # Convert received audio to tensor - audio_chunk = await decode_audio_data(audio_data) - - # Start streaming mode if not already started - if not is_streaming: - is_streaming = True - streaming_buffer = [] - energy_window.clear() - is_silence = False - last_active_time = time.time() - print(f"Streaming started with speaker ID: {speaker_id}") - await websocket.send_json({ - "type": "streaming_status", - "status": "started" - }) - - # Calculate audio energy for silence detection - chunk_energy = torch.mean(torch.abs(audio_chunk)).item() - energy_window.append(chunk_energy) - avg_energy = sum(energy_window) / len(energy_window) - - # Debug audio levels - if len(energy_window) >= 5: # Only start printing after we have enough samples - if avg_energy > SILENCE_THRESHOLD: - print(f"[AUDIO] Active sound detected - Energy: {avg_energy:.6f} (threshold: {SILENCE_THRESHOLD})") - else: - print(f"[AUDIO] Silence detected - Energy: {avg_energy:.6f} (threshold: {SILENCE_THRESHOLD})") - - # Check if audio is silent - current_silence = avg_energy < SILENCE_THRESHOLD - - # Track silence transition - if not is_silence and current_silence: - # Transition to silence - is_silence = True - last_active_time = time.time() - print("[STREAM] Transition to silence detected") - elif is_silence and not current_silence: - # User started talking again - is_silence = False - print("[STREAM] User resumed speaking") - - # Add chunk to buffer regardless of silence state - streaming_buffer.append(audio_chunk) - - # Debug buffer size periodically - if len(streaming_buffer) % 10 == 0: - print(f"[BUFFER] Current size: {len(streaming_buffer)} chunks, ~{len(streaming_buffer)/5:.1f} seconds") - - # Check if silence has persisted long enough to consider "stopped talking" - silence_elapsed = time.time() - last_active_time - - if is_silence and silence_elapsed >= SILENCE_DURATION_SEC and len(streaming_buffer) > 0: - # User has stopped talking - process the collected audio - print(f"[STREAM] Processing audio after {silence_elapsed:.2f}s of silence") - print(f"[STREAM] Processing {len(streaming_buffer)} audio chunks (~{len(streaming_buffer)/5:.1f} seconds)") - - full_audio = torch.cat(streaming_buffer, dim=0) - - # Log audio statistics - audio_duration = len(full_audio) / generator.sample_rate - audio_min = torch.min(full_audio).item() - audio_max = torch.max(full_audio).item() - audio_mean = torch.mean(full_audio).item() - print(f"[AUDIO] Processed audio - Duration: {audio_duration:.2f}s, Min: {audio_min:.4f}, Max: {audio_max:.4f}, Mean: {audio_mean:.4f}") - - # Process with WhisperX speech-to-text - print("[ASR] Starting transcription with 
WhisperX...") - transcribed_text = await transcribe_audio(full_audio) - - # Log the transcription - print(f"[ASR] Transcribed text: '{transcribed_text}'") - - # Add to conversation context - if transcribed_text: - print(f"[DIALOG] Adding user utterance to context: '{transcribed_text}'") - user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) - context_segments.append(user_segment) - - # Generate a contextual response - print("[DIALOG] Generating response...") - response_text = await generate_response(transcribed_text, context_segments) - print(f"[DIALOG] Response text: '{response_text}'") - - # Send the transcribed text to client - await websocket.send_json({ - "type": "transcription", - "text": transcribed_text - }) - - # Generate audio for the response - print("[TTS] Generating speech for response...") - audio_tensor = generator.generate( - text=response_text, - speaker=1 if speaker_id == 0 else 0, # Use opposite speaker - context=context_segments, - max_audio_length_ms=10_000, - ) - print(f"[TTS] Generated audio length: {len(audio_tensor)/generator.sample_rate:.2f}s") - - # Add response to context - ai_segment = Segment( - text=response_text, - speaker=1 if speaker_id == 0 else 0, - audio=audio_tensor - ) - context_segments.append(ai_segment) - print(f"[DIALOG] Context now has {len(context_segments)} segments") - - # Convert audio to base64 and send back to client - audio_base64 = await encode_audio_data(audio_tensor) - print("[STREAM] Sending audio response to client") - await websocket.send_json({ - "type": "audio_response", - "text": response_text, - "audio": audio_base64 - }) - else: - print("[ASR] Transcription failed or returned empty text") - # If transcription failed, send a generic response - await websocket.send_json({ - "type": "error", - "message": "Sorry, I couldn't understand what you said. Could you try again?" 
- }) - - # Clear buffer and reset silence detection - streaming_buffer = [] - energy_window.clear() - is_silence = False - last_active_time = time.time() - print("[STREAM] Buffer cleared, ready for next utterance") - - # If buffer gets too large without silence, process it anyway - # This prevents memory issues with very long streams - elif len(streaming_buffer) >= 30: # ~6 seconds of audio at 5 chunks/sec - print("[BUFFER] Maximum buffer size reached, processing audio") - full_audio = torch.cat(streaming_buffer, dim=0) - - # Process with WhisperX speech-to-text - print("[ASR] Starting forced transcription of long audio...") - transcribed_text = await transcribe_audio(full_audio) - - if transcribed_text: - print(f"[ASR] Transcribed long audio: '{transcribed_text}'") - context_segments.append(Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)) - - # Send the transcribed text to client - await websocket.send_json({ - "type": "transcription", - "text": transcribed_text + " (processing continued speech...)" - }) - else: - print("[ASR] No transcription from long audio") - - streaming_buffer = [] - print("[BUFFER] Buffer cleared due to size limit") - - except Exception as e: - print(f"[ERROR] Processing streaming audio: {str(e)}") - # Print traceback for more detailed error information - import traceback - traceback.print_exc() - await websocket.send_json({ - "type": "error", - "message": f"Error processing streaming audio: {str(e)}" - }) - - elif action == "stop_streaming": - is_streaming = False - if streaming_buffer and len(streaming_buffer) > 5: # Only process if there's meaningful audio - # Process any remaining audio in the buffer - full_audio = torch.cat(streaming_buffer, dim=0) - - # Process with WhisperX speech-to-text - transcribed_text = await transcribe_audio(full_audio) - - if transcribed_text: - context_segments.append(Segment(text=transcribed_text, speaker=request.get("speaker", 0), audio=full_audio)) - - # Send the transcribed text to client - await websocket.send_json({ - "type": "transcription", - "text": transcribed_text - }) - - streaming_buffer = [] - await websocket.send_json({ - "type": "streaming_status", - "status": "stopped" - }) - - except WebSocketDisconnect: - manager.disconnect(websocket) - print("Client disconnected") + text = data.get('text', '') + speaker_id = data.get('speaker', 0) + + print(f"Generating audio for: '{text}' with speaker {speaker_id}") + + # Generate audio response + audio_tensor = generator.generate( + text=text, + speaker=speaker_id, + context=active_clients[client_id]['context_segments'], + max_audio_length_ms=10_000, + ) + + # Add to conversation context + active_clients[client_id]['context_segments'].append( + Segment(text=text, speaker=speaker_id, audio=audio_tensor) + ) + + # Convert audio to base64 and send back to client + audio_base64 = encode_audio_data(audio_tensor) + emit('audio_response', { + 'type': 'audio_response', + 'audio': audio_base64 + }) + except Exception as e: - print(f"Error: {str(e)}") - try: - await websocket.send_json({ - "type": "error", - "message": str(e) - }) - except: - pass - manager.disconnect(websocket) + print(f"Error generating audio: {str(e)}") + emit('error', { + 'type': 'error', + 'message': f"Error generating audio: {str(e)}" + }) + +@socketio.on('add_to_context') +def handle_add_to_context(data): + client_id = request.sid + if client_id not in active_clients: + emit('error', {'message': 'Client not registered'}) + return + + try: + text = data.get('text', '') + speaker_id = 
data.get('speaker', 0) + audio_data = data.get('audio', '') + + # Convert received audio to tensor + audio_tensor = decode_audio_data(audio_data) + + # Add to conversation context + active_clients[client_id]['context_segments'].append( + Segment(text=text, speaker=speaker_id, audio=audio_tensor) + ) + + emit('context_updated', { + 'type': 'context_updated', + 'message': 'Audio added to context' + }) + + except Exception as e: + print(f"Error adding to context: {str(e)}") + emit('error', { + 'type': 'error', + 'message': f"Error processing audio: {str(e)}" + }) + +@socketio.on('clear_context') +def handle_clear_context(): + client_id = request.sid + if client_id in active_clients: + active_clients[client_id]['context_segments'] = [] + + emit('context_updated', { + 'type': 'context_updated', + 'message': 'Context cleared' + }) + +@socketio.on('stream_audio') +def handle_stream_audio(data): + client_id = request.sid + if client_id not in active_clients: + emit('error', {'message': 'Client not registered'}) + return + + client = active_clients[client_id] + + try: + speaker_id = data.get('speaker', 0) + audio_data = data.get('audio', '') + + # Convert received audio to tensor + audio_chunk = decode_audio_data(audio_data) + + # Start streaming mode if not already started + if not client['is_streaming']: + client['is_streaming'] = True + client['streaming_buffer'] = [] + client['energy_window'].clear() + client['is_silence'] = False + client['last_active_time'] = time.time() + print(f"[{client_id}] Streaming started with speaker ID: {speaker_id}") + emit('streaming_status', { + 'type': 'streaming_status', + 'status': 'started' + }) + + # Calculate audio energy for silence detection + chunk_energy = torch.mean(torch.abs(audio_chunk)).item() + client['energy_window'].append(chunk_energy) + avg_energy = sum(client['energy_window']) / len(client['energy_window']) + + # Check if audio is silent + current_silence = avg_energy < SILENCE_THRESHOLD + + # Track silence transition + if not client['is_silence'] and current_silence: + # Transition to silence + client['is_silence'] = True + client['last_active_time'] = time.time() + elif client['is_silence'] and not current_silence: + # User started talking again + client['is_silence'] = False + + # Add chunk to buffer regardless of silence state + client['streaming_buffer'].append(audio_chunk) + + # Check if silence has persisted long enough to consider "stopped talking" + silence_elapsed = time.time() - client['last_active_time'] + + if client['is_silence'] and silence_elapsed >= SILENCE_DURATION_SEC and len(client['streaming_buffer']) > 0: + # User has stopped talking - process the collected audio + print(f"[{client_id}] Processing audio after {silence_elapsed:.2f}s of silence") + + full_audio = torch.cat(client['streaming_buffer'], dim=0) + + # Process with WhisperX speech-to-text + print(f"[{client_id}] Starting transcription with WhisperX...") + transcribed_text = transcribe_audio(full_audio) + + # Log the transcription + print(f"[{client_id}] Transcribed text: '{transcribed_text}'") + + # Add to conversation context + if transcribed_text: + user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) + client['context_segments'].append(user_segment) + + # Generate a contextual response + response_text = generate_response(transcribed_text, client['context_segments']) + + # Send the transcribed text to client + emit('transcription', { + 'type': 'transcription', + 'text': transcribed_text + }) + + # Generate audio for the response + 
audio_tensor = generator.generate( + text=response_text, + speaker=1 if speaker_id == 0 else 0, # Use opposite speaker + context=client['context_segments'], + max_audio_length_ms=10_000, + ) + + # Add response to context + ai_segment = Segment( + text=response_text, + speaker=1 if speaker_id == 0 else 0, + audio=audio_tensor + ) + client['context_segments'].append(ai_segment) + + # Convert audio to base64 and send back to client + audio_base64 = encode_audio_data(audio_tensor) + emit('audio_response', { + 'type': 'audio_response', + 'text': response_text, + 'audio': audio_base64 + }) + else: + # If transcription failed, send a generic response + emit('error', { + 'type': 'error', + 'message': "Sorry, I couldn't understand what you said. Could you try again?" + }) + + # Clear buffer and reset silence detection + client['streaming_buffer'] = [] + client['energy_window'].clear() + client['is_silence'] = False + client['last_active_time'] = time.time() + + # If buffer gets too large without silence, process it anyway + elif len(client['streaming_buffer']) >= 30: # ~6 seconds of audio at 5 chunks/sec + full_audio = torch.cat(client['streaming_buffer'], dim=0) + + # Process with WhisperX speech-to-text + transcribed_text = transcribe_audio(full_audio) + + if transcribed_text: + client['context_segments'].append( + Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) + ) + + # Send the transcribed text to client + emit('transcription', { + 'type': 'transcription', + 'text': transcribed_text + " (processing continued speech...)" + }) + + client['streaming_buffer'] = [] + + except Exception as e: + import traceback + traceback.print_exc() + print(f"Error processing streaming audio: {str(e)}") + emit('error', { + 'type': 'error', + 'message': f"Error processing streaming audio: {str(e)}" + }) + +@socketio.on('stop_streaming') +def handle_stop_streaming(data): + client_id = request.sid + if client_id not in active_clients: + return + + client = active_clients[client_id] + client['is_streaming'] = False + + if client['streaming_buffer'] and len(client['streaming_buffer']) > 5: + # Process any remaining audio in the buffer + full_audio = torch.cat(client['streaming_buffer'], dim=0) + + # Process with WhisperX speech-to-text + transcribed_text = transcribe_audio(full_audio) + + if transcribed_text: + client['context_segments'].append( + Segment(text=transcribed_text, speaker=data.get("speaker", 0), audio=full_audio) + ) + + # Send the transcribed text to client + emit('transcription', { + 'type': 'transcription', + 'text': transcribed_text + }) + + client['streaming_buffer'] = [] + emit('streaming_status', { + 'type': 'streaming_status', + 'status': 'stopped' + }) -# Update the __main__ block with a comprehensive server startup message if __name__ == "__main__": print(f"\n{'='*60}") - print(f"🔊 Sesame AI Voice Chat Server") + print(f"🔊 Sesame AI Voice Chat Server (Flask Implementation)") print(f"{'='*60}") print(f"📡 Server Information:") - print(f" - Local URL: http://localhost:8000") - print(f" - Network URL: http://:8000") - print(f" - WebSocket: ws://:8000/ws") + print(f" - Local URL: http://localhost:5000") + print(f" - Network URL: http://:5000") + print(f" - WebSocket: ws://:5000/socket.io") print(f"{'='*60}") print(f"💡 To make this server public:") - print(f" 1. Ensure port 8000 is open in your firewall") - print(f" 2. Set up port forwarding on your router to port 8000") - print(f" 3. Or use a service like ngrok with: ngrok http 8000") + print(f" 1. 
Ensure port 5000 is open in your firewall") + print(f" 2. Set up port forwarding on your router to port 5000") + print(f" 3. Or use a service like ngrok with: ngrok http 5000") print(f"{'='*60}") print(f"🌐 Device: {device.upper()}") print(f"🧠 Models loaded: Sesame CSM + WhisperX ({asr_model.device})") @@ -503,5 +469,4 @@ if __name__ == "__main__": print(f"{'='*60}") print(f"Ready to receive connections! Press Ctrl+C to stop the server.\n") - # Start the server - uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file + socketio.run(app, host="0.0.0.0", port=5000, debug=False) \ No newline at end of file From 14c08bc93edffe68bbe25f88e9078d26043e26c3 Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sat, 29 Mar 2025 23:14:20 -0400 Subject: [PATCH 07/16] Demo Frontend Update --- Backend/index.html | 1157 +++++++++++++++-------------------------- Backend/voice-chat.js | 795 ++++++++++++++++++++++++++++ 2 files changed, 1219 insertions(+), 733 deletions(-) create mode 100644 Backend/voice-chat.js diff --git a/Backend/index.html b/Backend/index.html index 2944700..cbb4172 100644 --- a/Backend/index.html +++ b/Backend/index.html @@ -1,801 +1,492 @@ -/Backend/index.html --> Sesame AI Voice Chat - + -

-        [old page body markup; tags not recoverable. Visible text: "Sesame AI Voice Chat",
-         "Audio levels will appear here when speaking", "Not connected"]
+        [new page body markup; tags not recoverable. Visible text: "Sesame AI Voice Chat",
+         "Speak naturally and have a conversation with AI", "Conversation", "Audio Visualizer",
+         "Speak to see audio visualization", "Voice Settings", "Silence Threshold 0.01",
+         "Conversation Controls", "Settings", "Auto-play responses", "Show visualizer",
+         "Not connected", "Powered by Sesame AI | WhisperX for speech recognition"]
+ + + \ No newline at end of file diff --git a/Backend/voice-chat.js b/Backend/voice-chat.js new file mode 100644 index 0000000..0c8a815 --- /dev/null +++ b/Backend/voice-chat.js @@ -0,0 +1,795 @@ +/** + * Sesame AI Voice Chat Application + * + * This script handles the audio streaming, visualization, + * and Socket.IO communication for the voice chat application. + */ + +// Application state +const state = { + socket: null, + audioContext: null, + streamProcessor: null, + analyser: null, + microphone: null, + isStreaming: false, + isSpeaking: false, + silenceTimer: null, + energyWindow: [], + currentSpeaker: 0, + silenceThreshold: 0.01, + visualizerAnimationFrame: null, + volumeUpdateInterval: null, + connectionAttempts: 0 +}; + +// Constants +const ENERGY_WINDOW_SIZE = 10; +const CLIENT_SILENCE_DURATION_MS = 1000; // 1 second of silence before processing +const MAX_CONNECTION_ATTEMPTS = 5; +const RECONNECTION_DELAY_MS = 2000; + +// DOM elements +const elements = { + conversation: document.getElementById('conversation'), + speakerSelect: document.getElementById('speakerSelect'), + streamButton: document.getElementById('streamButton'), + clearButton: document.getElementById('clearButton'), + statusDot: document.getElementById('statusDot'), + statusText: document.getElementById('statusText'), + visualizerCanvas: document.getElementById('audioVisualizer'), + visualizerLabel: document.getElementById('visualizerLabel'), + thresholdSlider: document.getElementById('thresholdSlider'), + thresholdValue: document.getElementById('thresholdValue'), + volumeLevel: document.getElementById('volumeLevel'), + autoPlayResponses: document.getElementById('autoPlayResponses'), + showVisualizer: document.getElementById('showVisualizer') +}; + +// Visualization variables +let canvasContext; +let visualizerBufferLength; +let visualizerDataArray; + +// Initialize the application +function initializeApp() { + // Set up event listeners + elements.streamButton.addEventListener('click', toggleStreaming); + elements.clearButton.addEventListener('click', clearConversation); + elements.thresholdSlider.addEventListener('input', updateThreshold); + elements.speakerSelect.addEventListener('change', () => { + state.currentSpeaker = parseInt(elements.speakerSelect.value); + }); + elements.showVisualizer.addEventListener('change', toggleVisualizerVisibility); + + // Initialize audio context + setupAudioContext(); + + // Set up visualization + setupVisualizer(); + + // Connect to Socket.IO server + connectToServer(); + + // Add welcome message + addSystemMessage('Welcome to Sesame AI Voice Chat! Click "Start Conversation" to begin speaking.'); +} + +// Connect to Socket.IO server +function connectToServer() { + try { + // Use the server URL with or without a specific port + const serverUrl = window.location.origin; + + updateStatus('Connecting...', 'connecting'); + console.log(`Connecting to Socket.IO server at ${serverUrl}`); + + state.socket = io(serverUrl, { + reconnectionDelay: RECONNECTION_DELAY_MS, + reconnectionDelayMax: 5000, + reconnectionAttempts: MAX_CONNECTION_ATTEMPTS + }); + + setupSocketListeners(); + } catch (error) { + console.error('Error connecting to server:', error); + updateStatus('Connection failed. 
Retrying...', 'error'); + + // Try to reconnect + if (state.connectionAttempts < MAX_CONNECTION_ATTEMPTS) { + state.connectionAttempts++; + setTimeout(connectToServer, RECONNECTION_DELAY_MS); + } else { + updateStatus('Could not connect to server', 'error'); + addSystemMessage('Failed to connect to the server. Please check your connection and refresh the page.'); + } + } +} + +// Set up Socket.IO event listeners +function setupSocketListeners() { + if (!state.socket) return; + + state.socket.on('connect', () => { + console.log('Connected to Socket.IO server'); + updateStatus('Connected', 'connected'); + state.connectionAttempts = 0; + elements.streamButton.disabled = false; + addSystemMessage('Connected to server'); + }); + + state.socket.on('disconnect', () => { + console.log('Disconnected from Socket.IO server'); + updateStatus('Disconnected', 'disconnected'); + + // Stop streaming if active + if (state.isStreaming) { + stopStreaming(false); // false = don't send to server + } + + elements.streamButton.disabled = true; + addSystemMessage('Disconnected from server. Trying to reconnect...'); + }); + + state.socket.on('status', (data) => { + console.log('Status:', data); + addSystemMessage(data.message); + }); + + state.socket.on('error', (data) => { + console.error('Server error:', data); + addSystemMessage(`Error: ${data.message}`); + }); + + state.socket.on('audio_response', handleAudioResponse); + state.socket.on('transcription', handleTranscription); + state.socket.on('context_updated', handleContextUpdate); + state.socket.on('streaming_status', handleStreamingStatus); + + state.socket.on('connect_error', (error) => { + console.error('Connection error:', error); + updateStatus('Connection Error', 'error'); + }); +} + +// Update the connection status in the UI +function updateStatus(message, status) { + elements.statusText.textContent = message; + elements.statusDot.className = 'status-dot'; + + if (status === 'connected') { + elements.statusDot.classList.add('active'); + } else if (status === 'connecting') { + elements.statusDot.style.backgroundColor = '#FFA500'; + } else if (status === 'error') { + elements.statusDot.style.backgroundColor = '#F44336'; + } +} + +// Set up audio context +function setupAudioContext() { + try { + state.audioContext = new (window.AudioContext || window.webkitAudioContext)(); + console.log('Audio context initialized'); + } catch (err) { + console.error('Error setting up audio context:', err); + addSystemMessage(`Audio context error: ${err.message}`); + elements.streamButton.disabled = true; + } +} + +// Set up audio visualizer +function setupVisualizer() { + canvasContext = elements.visualizerCanvas.getContext('2d'); + + // Set canvas size to match container + function resizeCanvas() { + const container = elements.visualizerCanvas.parentElement; + elements.visualizerCanvas.width = container.clientWidth; + elements.visualizerCanvas.height = container.clientHeight; + } + + // Call initially and on window resize + resizeCanvas(); + window.addEventListener('resize', resizeCanvas); + + // Create placeholder data array + visualizerBufferLength = 128; + visualizerDataArray = new Uint8Array(visualizerBufferLength); +} + +// Toggle stream on/off +function toggleStreaming() { + if (state.isStreaming) { + stopStreaming(true); // true = send to server + } else { + startStreaming(); + } +} + +// Start streaming audio to the server +async function startStreaming() { + if (!state.socket || !state.socket.connected) { + addSystemMessage('Cannot start conversation: Not 
connected to server'); + return; + } + + try { + // Request microphone access + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + + // Update state + state.isStreaming = true; + state.isSpeaking = false; + state.energyWindow = []; + state.currentSpeaker = parseInt(elements.speakerSelect.value); + + // Update UI + elements.streamButton.innerHTML = ' Listening...'; + elements.streamButton.classList.add('recording'); + elements.visualizerLabel.style.opacity = '0'; + + // Set up audio processing + setupAudioProcessing(stream); + + // Start volume meter updates + state.volumeUpdateInterval = setInterval(updateVolumeMeter, 100); + + addSystemMessage('Listening - speak naturally and pause when finished'); + + } catch (err) { + console.error('Error starting audio stream:', err); + addSystemMessage(`Microphone error: ${err.message}`); + cleanupAudioResources(); + } +} + +// Set up audio processing pipeline +function setupAudioProcessing(stream) { + // Store microphone stream for later cleanup + state.microphone = stream; + + // Create source from microphone + const source = state.audioContext.createMediaStreamSource(stream); + + // Setup analyzer for visualization + state.analyser = state.audioContext.createAnalyser(); + state.analyser.fftSize = 256; + state.analyser.smoothingTimeConstant = 0.8; + state.analyser.minDecibels = -90; + state.analyser.maxDecibels = -10; + + visualizerBufferLength = state.analyser.frequencyBinCount; + visualizerDataArray = new Uint8Array(visualizerBufferLength); + + // Connect source to analyzer + source.connect(state.analyser); + + // Start visualization + if (state.visualizerAnimationFrame) { + cancelAnimationFrame(state.visualizerAnimationFrame); + } + drawVisualizer(); + + // Setup audio processor + state.streamProcessor = state.audioContext.createScriptProcessor(4096, 1, 1); + + // Connect audio nodes + source.connect(state.streamProcessor); + state.streamProcessor.connect(state.audioContext.destination); + + // Process audio + state.streamProcessor.onaudioprocess = handleAudioProcess; +} + +// Handle each frame of audio data +function handleAudioProcess(e) { + const audioData = e.inputBuffer.getChannelData(0); + + // Calculate energy (volume) for silence detection + const energy = calculateAudioEnergy(audioData); + updateEnergyWindow(energy); + + // Check if currently silent + const avgEnergy = calculateAverageEnergy(); + const isSilent = avgEnergy < state.silenceThreshold; + + // Handle silence/speech transitions + handleSpeechState(isSilent); + + // Process and send audio + const downsampled = downsampleBuffer(audioData, state.audioContext.sampleRate, 24000); + sendAudioChunk(downsampled, state.currentSpeaker); +} + +// Stop streaming audio +function stopStreaming(sendToServer = true) { + // Cleanup audio resources + cleanupAudioResources(); + + // Reset state + state.isStreaming = false; + state.isSpeaking = false; + state.energyWindow = []; + + // Update UI + elements.streamButton.innerHTML = ' Start Conversation'; + elements.streamButton.classList.remove('recording', 'processing'); + elements.streamButton.style.backgroundColor = ''; + elements.volumeLevel.style.width = '100%'; + + // Clear volume meter updates + if (state.volumeUpdateInterval) { + clearInterval(state.volumeUpdateInterval); + state.volumeUpdateInterval = null; + } + + addSystemMessage('Conversation paused'); + + // Notify server + if (sendToServer && state.socket && state.socket.connected) { + state.socket.emit('stop_streaming', { + speaker: state.currentSpeaker + 
}); + } +} + +// Clean up audio processing resources +function cleanupAudioResources() { + // Stop microphone stream + if (state.microphone) { + state.microphone.getTracks().forEach(track => track.stop()); + state.microphone = null; + } + + // Disconnect audio processor + if (state.streamProcessor) { + state.streamProcessor.disconnect(); + state.streamProcessor.onaudioprocess = null; + state.streamProcessor = null; + } + + // Disconnect analyzer + if (state.analyser) { + state.analyser.disconnect(); + state.analyser = null; + } + + // Cancel visualizer animation + if (state.visualizerAnimationFrame) { + cancelAnimationFrame(state.visualizerAnimationFrame); + state.visualizerAnimationFrame = null; + } + + // Cancel silence timer + if (state.silenceTimer) { + clearTimeout(state.silenceTimer); + state.silenceTimer = null; + } + + // Reset visualizer display + if (canvasContext) { + canvasContext.clearRect(0, 0, elements.visualizerCanvas.width, elements.visualizerCanvas.height); + elements.visualizerLabel.style.opacity = '0.7'; + } +} + +// Clear conversation history +function clearConversation() { + // Clear UI + elements.conversation.innerHTML = ''; + addSystemMessage('Conversation cleared'); + + // Notify server + if (state.socket && state.socket.connected) { + state.socket.emit('clear_context'); + } +} + +// Calculate audio energy (volume) +function calculateAudioEnergy(buffer) { + let sum = 0; + for (let i = 0; i < buffer.length; i++) { + sum += Math.abs(buffer[i]); + } + return sum / buffer.length; +} + +// Update energy window for averaging +function updateEnergyWindow(energy) { + state.energyWindow.push(energy); + if (state.energyWindow.length > ENERGY_WINDOW_SIZE) { + state.energyWindow.shift(); + } +} + +// Calculate average energy from window +function calculateAverageEnergy() { + if (state.energyWindow.length === 0) return 0; + return state.energyWindow.reduce((sum, val) => sum + val, 0) / state.energyWindow.length; +} + +// Update the threshold from the slider +function updateThreshold() { + state.silenceThreshold = parseFloat(elements.thresholdSlider.value); + elements.thresholdValue.textContent = state.silenceThreshold.toFixed(3); +} + +// Update the volume meter display +function updateVolumeMeter() { + if (!state.isStreaming || !state.analyser) return; + + // Get current volume level + const dataArray = new Uint8Array(state.analyser.frequencyBinCount); + state.analyser.getByteFrequencyData(dataArray); + + // Calculate average volume + let sum = 0; + for (let i = 0; i < dataArray.length; i++) { + sum += dataArray[i]; + } + const average = sum / dataArray.length; + + // Normalize to 0-100% + const percentage = Math.min(100, Math.max(0, average / 128 * 100)); + + // Invert because we're showing the "empty" portion + elements.volumeLevel.style.width = (100 - percentage) + '%'; + + // Change color based on level + if (percentage > 70) { + elements.volumeLevel.style.backgroundColor = 'rgba(244, 67, 54, 0.5)'; // Red + } else if (percentage > 30) { + elements.volumeLevel.style.backgroundColor = 'rgba(255, 235, 59, 0.5)'; // Yellow + } else { + elements.volumeLevel.style.backgroundColor = 'rgba(0, 0, 0, 0.5)'; // Dark + } +} + +// Handle speech/silence state transitions +function handleSpeechState(isSilent) { + if (state.isSpeaking && isSilent) { + // Transition from speaking to silence + if (!state.silenceTimer) { + state.silenceTimer = setTimeout(() => { + // Silence persisted long enough - process the audio + elements.streamButton.innerHTML = ' Processing...'; + 
elements.streamButton.classList.remove('recording'); + elements.streamButton.classList.add('processing'); + addSystemMessage('Detected pause in speech, processing response...'); + }, CLIENT_SILENCE_DURATION_MS); + } + } else if (!state.isSpeaking && !isSilent) { + // Transition from silence to speaking + state.isSpeaking = true; + elements.streamButton.innerHTML = ' Listening...'; + elements.streamButton.classList.add('recording'); + elements.streamButton.classList.remove('processing'); + + // Clear silence timer + if (state.silenceTimer) { + clearTimeout(state.silenceTimer); + state.silenceTimer = null; + } + } else if (state.isSpeaking && !isSilent) { + // Still speaking, reset silence timer + if (state.silenceTimer) { + clearTimeout(state.silenceTimer); + state.silenceTimer = null; + } + } + + // Update speaking state for non-silent audio + if (!isSilent) { + state.isSpeaking = true; + } +} + +// Send audio chunk to server +function sendAudioChunk(audioData, speaker) { + if (!state.socket || !state.socket.connected) { + console.warn('Cannot send audio: socket not connected'); + return; + } + + const wavData = createWavBlob(audioData, 24000); + const reader = new FileReader(); + + reader.onloadend = function() { + const base64data = reader.result; + + // Send to server using Socket.IO + state.socket.emit('stream_audio', { + speaker: speaker, + audio: base64data + }); + }; + + reader.readAsDataURL(wavData); +} + +// Draw audio visualizer +function drawVisualizer() { + if (!canvasContext) { + return; + } + + state.visualizerAnimationFrame = requestAnimationFrame(drawVisualizer); + + // Skip drawing if visualizer is hidden + if (!elements.showVisualizer.checked) { + if (elements.visualizerCanvas.style.opacity !== '0') { + elements.visualizerCanvas.style.opacity = '0'; + } + return; + } else if (elements.visualizerCanvas.style.opacity !== '1') { + elements.visualizerCanvas.style.opacity = '1'; + } + + // Get frequency data if available + if (state.isStreaming && state.analyser) { + try { + state.analyser.getByteFrequencyData(visualizerDataArray); + } catch (e) { + console.error("Error getting frequency data:", e); + } + } else { + // Fade out when not streaming + for (let i = 0; i < visualizerDataArray.length; i++) { + visualizerDataArray[i] = Math.max(0, visualizerDataArray[i] - 5); + } + } + + // Clear canvas + canvasContext.fillStyle = 'rgb(0, 0, 0)'; + canvasContext.fillRect(0, 0, elements.visualizerCanvas.width, elements.visualizerCanvas.height); + + // Draw gradient bars + const width = elements.visualizerCanvas.width; + const height = elements.visualizerCanvas.height; + const barCount = Math.min(visualizerBufferLength, 64); + const barWidth = width / barCount - 1; + + for (let i = 0; i < barCount; i++) { + const index = Math.floor(i * visualizerBufferLength / barCount); + const value = visualizerDataArray[index]; + + // Use logarithmic scale for better audio visualization + // This makes low values more visible while still maintaining full range + const logFactor = 20; + const scaledValue = Math.log(1 + (value / 255) * logFactor) / Math.log(1 + logFactor); + const barHeight = scaledValue * height; + + // Position bars + const x = i * (barWidth + 1); + const y = height - barHeight; + + // Create color gradient based on frequency and amplitude + const hue = i / barCount * 360; // Full color spectrum + const saturation = 80 + (value / 255 * 20); // Higher values more saturated + const lightness = 40 + (value / 255 * 20); // Dynamic brightness based on amplitude + + // Draw main bar + 
canvasContext.fillStyle = `hsl(${hue}, ${saturation}%, ${lightness}%)`; + canvasContext.fillRect(x, y, barWidth, barHeight); + + // Add reflection effect + if (barHeight > 5) { + const gradient = canvasContext.createLinearGradient( + x, y, + x, y + barHeight * 0.5 + ); + gradient.addColorStop(0, `hsla(${hue}, ${saturation}%, ${lightness + 20}%, 0.4)`); + gradient.addColorStop(1, `hsla(${hue}, ${saturation}%, ${lightness}%, 0)`); + canvasContext.fillStyle = gradient; + canvasContext.fillRect(x, y, barWidth, barHeight * 0.5); + + // Add highlight on top of the bar for better 3D effect + canvasContext.fillStyle = `hsla(${hue}, ${saturation - 20}%, ${lightness + 30}%, 0.7)`; + canvasContext.fillRect(x, y, barWidth, 2); + } + } + + // Show/hide the label + elements.visualizerLabel.style.opacity = (state.isStreaming) ? '0' : '0.7'; +} + +// Toggle visualizer visibility +function toggleVisualizerVisibility() { + const isVisible = elements.showVisualizer.checked; + elements.visualizerCanvas.style.opacity = isVisible ? '1' : '0'; + + if (isVisible && state.isStreaming && !state.visualizerAnimationFrame) { + drawVisualizer(); + } +} + +// Handle audio response from server +function handleAudioResponse(data) { + console.log('Received audio response'); + + // Create message container + const messageElement = document.createElement('div'); + messageElement.className = 'message ai'; + + // Add text content if available + if (data.text) { + const textElement = document.createElement('p'); + textElement.textContent = data.text; + messageElement.appendChild(textElement); + } + + // Create and configure audio element + const audioElement = document.createElement('audio'); + audioElement.controls = true; + audioElement.className = 'audio-player'; + + // Set audio source + const audioSource = document.createElement('source'); + audioSource.src = data.audio; + audioSource.type = 'audio/wav'; + + // Add fallback text + audioElement.textContent = 'Your browser does not support the audio element.'; + + // Assemble audio element + audioElement.appendChild(audioSource); + messageElement.appendChild(audioElement); + + // Add timestamp + const timeElement = document.createElement('span'); + timeElement.className = 'message-time'; + timeElement.textContent = new Date().toLocaleTimeString(); + messageElement.appendChild(timeElement); + + // Add to conversation + elements.conversation.appendChild(messageElement); + + // Auto-scroll to bottom + elements.conversation.scrollTop = elements.conversation.scrollHeight; + + // Auto-play if enabled + if (elements.autoPlayResponses.checked) { + audioElement.play() + .catch(err => { + console.warn('Auto-play failed:', err); + addSystemMessage('Auto-play failed. 
Please click play to hear the response.'); + }); + } + + // Re-enable stream button after processing is complete + if (state.isStreaming) { + elements.streamButton.innerHTML = ' Listening...'; + elements.streamButton.classList.add('recording'); + elements.streamButton.classList.remove('processing'); + } +} + +// Handle transcription response from server +function handleTranscription(data) { + console.log('Received transcription:', data.text); + + // Create message element + const messageElement = document.createElement('div'); + messageElement.className = 'message user'; + + // Add text content + const textElement = document.createElement('p'); + textElement.textContent = data.text; + messageElement.appendChild(textElement); + + // Add timestamp + const timeElement = document.createElement('span'); + timeElement.className = 'message-time'; + timeElement.textContent = new Date().toLocaleTimeString(); + messageElement.appendChild(timeElement); + + // Add to conversation + elements.conversation.appendChild(messageElement); + + // Auto-scroll to bottom + elements.conversation.scrollTop = elements.conversation.scrollHeight; +} + +// Handle context update from server +function handleContextUpdate(data) { + console.log('Context updated:', data.message); +} + +// Handle streaming status updates from server +function handleStreamingStatus(data) { + console.log('Streaming status:', data.status); + + if (data.status === 'stopped') { + // Reset UI if needed + if (state.isStreaming) { + stopStreaming(false); // Don't send to server since this came from server + } + } +} + +// Add a system message to the conversation +function addSystemMessage(message) { + const messageElement = document.createElement('div'); + messageElement.className = 'message system'; + messageElement.textContent = message; + elements.conversation.appendChild(messageElement); + + // Auto-scroll to bottom + elements.conversation.scrollTop = elements.conversation.scrollHeight; +} + +// Create WAV blob from audio data +function createWavBlob(audioData, sampleRate) { + // Function to convert Float32Array to Int16Array for WAV format + function floatTo16BitPCM(output, offset, input) { + for (let i = 0; i < input.length; i++, offset += 2) { + const s = Math.max(-1, Math.min(1, input[i])); + output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true); + } + } + + // Create WAV header + function writeString(view, offset, string) { + for (let i = 0; i < string.length; i++) { + view.setUint8(offset + i, string.charCodeAt(i)); + } + } + + // Create WAV file with header + function encodeWAV(samples) { + const buffer = new ArrayBuffer(44 + samples.length * 2); + const view = new DataView(buffer); + + // RIFF chunk descriptor + writeString(view, 0, 'RIFF'); + view.setUint32(4, 36 + samples.length * 2, true); + writeString(view, 8, 'WAVE'); + + // fmt sub-chunk + writeString(view, 12, 'fmt '); + view.setUint32(16, 16, true); + view.setUint16(20, 1, true); // PCM format + view.setUint16(22, 1, true); // Mono channel + view.setUint32(24, sampleRate, true); + view.setUint32(28, sampleRate * 2, true); // Byte rate + view.setUint16(32, 2, true); // Block align + view.setUint16(34, 16, true); // Bits per sample + + // data sub-chunk + writeString(view, 36, 'data'); + view.setUint32(40, samples.length * 2, true); + floatTo16BitPCM(view, 44, samples); + + return buffer; + } + + // Convert audio data to TypedArray if it's a regular Array + const samples = Array.isArray(audioData) ? 
new Float32Array(audioData) : audioData; + + // Create WAV blob + const wavBuffer = encodeWAV(samples); + return new Blob([wavBuffer], { type: 'audio/wav' }); +} + +// Downsample audio buffer to target sample rate +function downsampleBuffer(buffer, originalSampleRate, targetSampleRate) { + if (originalSampleRate === targetSampleRate) { + return buffer; + } + + const ratio = originalSampleRate / targetSampleRate; + const newLength = Math.round(buffer.length / ratio); + const result = new Float32Array(newLength); + + for (let i = 0; i < newLength; i++) { + const pos = Math.round(i * ratio); + result[i] = buffer[pos]; + } + + return result; +} + +// Initialize the application when DOM is fully loaded +document.addEventListener('DOMContentLoaded', initializeApp); + From 9ca259aab3e6c16060f8a2343db5bd76b50230ad Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sat, 29 Mar 2025 23:22:45 -0400 Subject: [PATCH 08/16] Demo Update 2 --- Backend/index.html | 2 +- Backend/voice-chat.js | 708 ++++++++++++++++++++++-------------------- 2 files changed, 374 insertions(+), 336 deletions(-) diff --git a/Backend/index.html b/Backend/index.html index cbb4172..5ea925c 100644 --- a/Backend/index.html +++ b/Backend/index.html @@ -487,6 +487,6 @@ - + \ No newline at end of file diff --git a/Backend/voice-chat.js b/Backend/voice-chat.js index 0c8a815..a4e10f5 100644 --- a/Backend/voice-chat.js +++ b/Backend/voice-chat.js @@ -1,388 +1,445 @@ /** - * Sesame AI Voice Chat Application + * Sesame AI Voice Chat Client * - * This script handles the audio streaming, visualization, - * and Socket.IO communication for the voice chat application. + * A web client that connects to a Sesame AI voice chat server and enables + * real-time voice conversation with an AI assistant. */ +// Configuration constants +const SERVER_URL = window.location.hostname === 'localhost' ? 
+ 'http://localhost:5000' : window.location.origin; +const ENERGY_WINDOW_SIZE = 15; +const CLIENT_SILENCE_DURATION_MS = 750; + +// DOM elements +const elements = { + conversation: null, + streamButton: null, + clearButton: null, + thresholdSlider: null, + thresholdValue: null, + visualizerCanvas: null, + visualizerLabel: null, + volumeLevel: null, + statusDot: null, + statusText: null, + speakerSelection: null, + autoPlayResponses: null, + showVisualizer: null +}; + // Application state const state = { socket: null, audioContext: null, - streamProcessor: null, analyser: null, microphone: null, + streamProcessor: null, isStreaming: false, isSpeaking: false, - silenceTimer: null, - energyWindow: [], - currentSpeaker: 0, silenceThreshold: 0.01, - visualizerAnimationFrame: null, + energyWindow: [], + silenceTimer: null, volumeUpdateInterval: null, - connectionAttempts: 0 + visualizerAnimationFrame: null, + currentSpeaker: 0 }; -// Constants -const ENERGY_WINDOW_SIZE = 10; -const CLIENT_SILENCE_DURATION_MS = 1000; // 1 second of silence before processing -const MAX_CONNECTION_ATTEMPTS = 5; -const RECONNECTION_DELAY_MS = 2000; - -// DOM elements -const elements = { - conversation: document.getElementById('conversation'), - speakerSelect: document.getElementById('speakerSelect'), - streamButton: document.getElementById('streamButton'), - clearButton: document.getElementById('clearButton'), - statusDot: document.getElementById('statusDot'), - statusText: document.getElementById('statusText'), - visualizerCanvas: document.getElementById('audioVisualizer'), - visualizerLabel: document.getElementById('visualizerLabel'), - thresholdSlider: document.getElementById('thresholdSlider'), - thresholdValue: document.getElementById('thresholdValue'), - volumeLevel: document.getElementById('volumeLevel'), - autoPlayResponses: document.getElementById('autoPlayResponses'), - showVisualizer: document.getElementById('showVisualizer') -}; - -// Visualization variables -let canvasContext; -let visualizerBufferLength; -let visualizerDataArray; +// Visualizer variables +let canvasContext = null; +let visualizerBufferLength = 0; +let visualizerDataArray = null; // Initialize the application function initializeApp() { - // Set up event listeners - elements.streamButton.addEventListener('click', toggleStreaming); - elements.clearButton.addEventListener('click', clearConversation); - elements.thresholdSlider.addEventListener('input', updateThreshold); - elements.speakerSelect.addEventListener('change', () => { - state.currentSpeaker = parseInt(elements.speakerSelect.value); - }); - elements.showVisualizer.addEventListener('change', toggleVisualizerVisibility); - - // Initialize audio context - setupAudioContext(); + // Initialize the UI elements + initializeUIElements(); - // Set up visualization + // Initialize socket.io connection + setupSocketConnection(); + + // Setup event listeners + setupEventListeners(); + + // Initialize visualizer setupVisualizer(); - // Connect to Socket.IO server - connectToServer(); - - // Add welcome message - addSystemMessage('Welcome to Sesame AI Voice Chat! Click "Start Conversation" to begin speaking.'); + // Show welcome message + addSystemMessage('Welcome to Sesame AI Voice Chat! 
Click "Start Conversation" to begin.'); } -// Connect to Socket.IO server -function connectToServer() { - try { - // Use the server URL with or without a specific port - const serverUrl = window.location.origin; +// Initialize UI elements +function initializeUIElements() { + // Main UI containers + const chatContainer = document.querySelector('.chat-container'); + const controlPanel = document.querySelector('.control-panel'); + + // Create conversation section + chatContainer.innerHTML = ` +
+        <!-- Conversation panel markup: "Conversation" header, message list, connection status ("Disconnected") -->
+    `;
+
+    // Create control panel
+    controlPanel.innerHTML = `
+        <!-- Audio visualizer canvas ("Speak to see audio visualization") and volume meter -->
- updateStatus('Connecting...', 'connecting'); - console.log(`Connecting to Socket.IO server at ${serverUrl}`); - - state.socket = io(serverUrl, { - reconnectionDelay: RECONNECTION_DELAY_MS, - reconnectionDelayMax: 5000, - reconnectionAttempts: MAX_CONNECTION_ATTEMPTS - }); - - setupSocketListeners(); - } catch (error) { - console.error('Error connecting to server:', error); - updateStatus('Connection failed. Retrying...', 'error'); - - // Try to reconnect - if (state.connectionAttempts < MAX_CONNECTION_ATTEMPTS) { - state.connectionAttempts++; - setTimeout(connectToServer, RECONNECTION_DELAY_MS); - } else { - updateStatus('Could not connect to server', 'error'); - addSystemMessage('Failed to connect to the server. Please check your connection and refresh the page.'); - } - } +
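The removed connectToServer() above configured bounded client-side reconnection; the new setupSocketConnection() in this patch calls io(SERVER_URL) with library defaults. If the bounded-retry behaviour is still wanted, a sketch using standard Socket.IO client options (values mirror the removed MAX_CONNECTION_ATTEMPTS and RECONNECTION_DELAY_MS constants; this is not part of the patch):

// Bounded reconnection, roughly equivalent to the removed connectToServer() logic.
const socket = io(SERVER_URL, {
    reconnectionAttempts: 5,     // was MAX_CONNECTION_ATTEMPTS
    reconnectionDelay: 2000,     // was RECONNECTION_DELAY_MS (ms)
    reconnectionDelayMax: 5000
});

// Reconnection lifecycle events are emitted by the underlying Manager.
socket.io.on('reconnect_attempt', attempt => console.log(`Reconnect attempt ${attempt}`));
socket.io.on('reconnect_failed', () => console.warn('Could not reconnect to the server'));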
+        <!-- "Voice Controls": speaker select, silence-threshold slider (0.01), start/stop and clear buttons -->
+        <!-- "Settings": auto-play responses and show-visualizer toggles -->
+
+ `; + + // Store references to UI elements + elements.conversation = document.querySelector('.conversation'); + elements.streamButton = document.getElementById('streamButton'); + elements.clearButton = document.getElementById('clearButton'); + elements.thresholdSlider = document.getElementById('thresholdSlider'); + elements.thresholdValue = document.getElementById('thresholdValue'); + elements.visualizerCanvas = document.getElementById('audioVisualizer'); + elements.visualizerLabel = document.querySelector('.visualizer-label'); + elements.volumeLevel = document.querySelector('.volume-level'); + elements.statusDot = document.querySelector('.status-dot'); + elements.statusText = document.querySelector('.status-text'); + elements.speakerSelection = document.getElementById('speakerSelection'); + elements.autoPlayResponses = document.getElementById('autoPlayResponses'); + elements.showVisualizer = document.getElementById('showVisualizer'); } -// Set up Socket.IO event listeners -function setupSocketListeners() { - if (!state.socket) return; +// Setup Socket.IO connection +function setupSocketConnection() { + state.socket = io(SERVER_URL); + // Connection events state.socket.on('connect', () => { - console.log('Connected to Socket.IO server'); - updateStatus('Connected', 'connected'); - state.connectionAttempts = 0; - elements.streamButton.disabled = false; - addSystemMessage('Connected to server'); + console.log('Connected to server'); + updateConnectionStatus(true); }); state.socket.on('disconnect', () => { - console.log('Disconnected from Socket.IO server'); - updateStatus('Disconnected', 'disconnected'); + console.log('Disconnected from server'); + updateConnectionStatus(false); // Stop streaming if active if (state.isStreaming) { - stopStreaming(false); // false = don't send to server + stopStreaming(false); } - - elements.streamButton.disabled = true; - addSystemMessage('Disconnected from server. 
Trying to reconnect...'); - }); - - state.socket.on('status', (data) => { - console.log('Status:', data); - addSystemMessage(data.message); }); state.socket.on('error', (data) => { - console.error('Server error:', data); + console.error('Socket error:', data.message); addSystemMessage(`Error: ${data.message}`); }); + // Register message handlers state.socket.on('audio_response', handleAudioResponse); state.socket.on('transcription', handleTranscription); state.socket.on('context_updated', handleContextUpdate); state.socket.on('streaming_status', handleStreamingStatus); +} + +// Setup event listeners +function setupEventListeners() { + // Stream button + elements.streamButton.addEventListener('click', toggleStreaming); - state.socket.on('connect_error', (error) => { - console.error('Connection error:', error); - updateStatus('Connection Error', 'error'); + // Clear button + elements.clearButton.addEventListener('click', clearConversation); + + // Threshold slider + elements.thresholdSlider.addEventListener('input', updateThreshold); + + // Speaker selection + elements.speakerSelection.addEventListener('change', () => { + state.currentSpeaker = parseInt(elements.speakerSelection.value, 10); }); -} - -// Update the connection status in the UI -function updateStatus(message, status) { - elements.statusText.textContent = message; - elements.statusDot.className = 'status-dot'; - if (status === 'connected') { - elements.statusDot.classList.add('active'); - } else if (status === 'connecting') { - elements.statusDot.style.backgroundColor = '#FFA500'; - } else if (status === 'error') { - elements.statusDot.style.backgroundColor = '#F44336'; - } + // Visualizer toggle + elements.showVisualizer.addEventListener('change', toggleVisualizerVisibility); } -// Set up audio context -function setupAudioContext() { - try { - state.audioContext = new (window.AudioContext || window.webkitAudioContext)(); - console.log('Audio context initialized'); - } catch (err) { - console.error('Error setting up audio context:', err); - addSystemMessage(`Audio context error: ${err.message}`); - elements.streamButton.disabled = true; - } -} - -// Set up audio visualizer +// Setup audio visualizer function setupVisualizer() { + if (!elements.visualizerCanvas) return; + canvasContext = elements.visualizerCanvas.getContext('2d'); - // Set canvas size to match container - function resizeCanvas() { - const container = elements.visualizerCanvas.parentElement; - elements.visualizerCanvas.width = container.clientWidth; - elements.visualizerCanvas.height = container.clientHeight; - } + // Set canvas dimensions + elements.visualizerCanvas.width = elements.visualizerCanvas.offsetWidth; + elements.visualizerCanvas.height = elements.visualizerCanvas.offsetHeight; - // Call initially and on window resize - resizeCanvas(); - window.addEventListener('resize', resizeCanvas); - - // Create placeholder data array - visualizerBufferLength = 128; - visualizerDataArray = new Uint8Array(visualizerBufferLength); + // Initialize the visualizer + drawVisualizer(); } -// Toggle stream on/off +// Update connection status UI +function updateConnectionStatus(isConnected) { + elements.statusDot.classList.toggle('active', isConnected); + elements.statusText.textContent = isConnected ? 
'Connected' : 'Disconnected'; +} + +// Toggle streaming state function toggleStreaming() { if (state.isStreaming) { - stopStreaming(true); // true = send to server + stopStreaming(true); } else { startStreaming(); } } // Start streaming audio to the server -async function startStreaming() { - if (!state.socket || !state.socket.connected) { - addSystemMessage('Cannot start conversation: Not connected to server'); - return; - } +function startStreaming() { + if (state.isStreaming) return; - try { - // Request microphone access - const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); - - // Update state - state.isStreaming = true; - state.isSpeaking = false; - state.energyWindow = []; - state.currentSpeaker = parseInt(elements.speakerSelect.value); - - // Update UI - elements.streamButton.innerHTML = ' Listening...'; - elements.streamButton.classList.add('recording'); - elements.visualizerLabel.style.opacity = '0'; - - // Set up audio processing - setupAudioProcessing(stream); - - // Start volume meter updates - state.volumeUpdateInterval = setInterval(updateVolumeMeter, 100); - - addSystemMessage('Listening - speak naturally and pause when finished'); - - } catch (err) { - console.error('Error starting audio stream:', err); - addSystemMessage(`Microphone error: ${err.message}`); - cleanupAudioResources(); - } -} - -// Set up audio processing pipeline -function setupAudioProcessing(stream) { - // Store microphone stream for later cleanup - state.microphone = stream; - - // Create source from microphone - const source = state.audioContext.createMediaStreamSource(stream); - - // Setup analyzer for visualization - state.analyser = state.audioContext.createAnalyser(); - state.analyser.fftSize = 256; - state.analyser.smoothingTimeConstant = 0.8; - state.analyser.minDecibels = -90; - state.analyser.maxDecibels = -10; - - visualizerBufferLength = state.analyser.frequencyBinCount; - visualizerDataArray = new Uint8Array(visualizerBufferLength); - - // Connect source to analyzer - source.connect(state.analyser); - - // Start visualization - if (state.visualizerAnimationFrame) { - cancelAnimationFrame(state.visualizerAnimationFrame); - } - drawVisualizer(); - - // Setup audio processor - state.streamProcessor = state.audioContext.createScriptProcessor(4096, 1, 1); - - // Connect audio nodes - source.connect(state.streamProcessor); - state.streamProcessor.connect(state.audioContext.destination); - - // Process audio - state.streamProcessor.onaudioprocess = handleAudioProcess; -} - -// Handle each frame of audio data -function handleAudioProcess(e) { - const audioData = e.inputBuffer.getChannelData(0); - - // Calculate energy (volume) for silence detection - const energy = calculateAudioEnergy(audioData); - updateEnergyWindow(energy); - - // Check if currently silent - const avgEnergy = calculateAverageEnergy(); - const isSilent = avgEnergy < state.silenceThreshold; - - // Handle silence/speech transitions - handleSpeechState(isSilent); - - // Process and send audio - const downsampled = downsampleBuffer(audioData, state.audioContext.sampleRate, 24000); - sendAudioChunk(downsampled, state.currentSpeaker); + // Request microphone access + navigator.mediaDevices.getUserMedia({ audio: true, video: false }) + .then(stream => { + // Show processing state while setting up + elements.streamButton.innerHTML = ' Initializing...'; + + // Create audio context + state.audioContext = new (window.AudioContext || window.webkitAudioContext)(); + + // Create microphone source + state.microphone = 
state.audioContext.createMediaStreamSource(stream); + + // Create analyser for visualizer + state.analyser = state.audioContext.createAnalyser(); + state.analyser.fftSize = 256; + visualizerBufferLength = state.analyser.frequencyBinCount; + visualizerDataArray = new Uint8Array(visualizerBufferLength); + + // Connect microphone to analyser + state.microphone.connect(state.analyser); + + // Create script processor for audio processing + const bufferSize = 4096; + state.streamProcessor = state.audioContext.createScriptProcessor(bufferSize, 1, 1); + + // Set up audio processing callback + state.streamProcessor.onaudioprocess = handleAudioProcess; + + // Connect the processors + state.analyser.connect(state.streamProcessor); + state.streamProcessor.connect(state.audioContext.destination); + + // Update UI + state.isStreaming = true; + elements.streamButton.innerHTML = ' Listening...'; + elements.streamButton.classList.add('recording'); + + // Initialize energy window + state.energyWindow = []; + + // Start volume meter updates + state.volumeUpdateInterval = setInterval(updateVolumeMeter, 100); + + // Start visualizer if enabled + if (elements.showVisualizer.checked && !state.visualizerAnimationFrame) { + drawVisualizer(); + } + + // Show starting message + addSystemMessage('Listening... Speak clearly into your microphone.'); + + // Notify the server that we're starting + state.socket.emit('stream_audio', { + audio: '', + speaker: state.currentSpeaker + }); + }) + .catch(err => { + console.error('Error accessing microphone:', err); + addSystemMessage(`Error: ${err.message}. Please make sure your microphone is connected and you've granted permission.`); + elements.streamButton.innerHTML = ' Start Conversation'; + }); } // Stop streaming audio -function stopStreaming(sendToServer = true) { - // Cleanup audio resources - cleanupAudioResources(); +function stopStreaming(notifyServer = true) { + if (!state.isStreaming) return; - // Reset state - state.isStreaming = false; - state.isSpeaking = false; - state.energyWindow = []; - - // Update UI + // Update UI first elements.streamButton.innerHTML = ' Start Conversation'; - elements.streamButton.classList.remove('recording', 'processing'); - elements.streamButton.style.backgroundColor = ''; - elements.volumeLevel.style.width = '100%'; + elements.streamButton.classList.remove('recording'); + elements.streamButton.classList.remove('processing'); - // Clear volume meter updates + // Stop volume meter updates if (state.volumeUpdateInterval) { clearInterval(state.volumeUpdateInterval); state.volumeUpdateInterval = null; } - addSystemMessage('Conversation paused'); - - // Notify server - if (sendToServer && state.socket && state.socket.connected) { - state.socket.emit('stop_streaming', { - speaker: state.currentSpeaker - }); - } -} - -// Clean up audio processing resources -function cleanupAudioResources() { - // Stop microphone stream - if (state.microphone) { - state.microphone.getTracks().forEach(track => track.stop()); - state.microphone = null; - } - - // Disconnect audio processor + // Stop all audio processing if (state.streamProcessor) { state.streamProcessor.disconnect(); - state.streamProcessor.onaudioprocess = null; state.streamProcessor = null; } - // Disconnect analyzer if (state.analyser) { state.analyser.disconnect(); - state.analyser = null; } - // Cancel visualizer animation + if (state.microphone) { + state.microphone.disconnect(); + } + + // Close audio context + if (state.audioContext && state.audioContext.state !== 'closed') { + 
state.audioContext.close().catch(err => console.warn('Error closing audio context:', err)); + } + + // Cleanup animation frames + if (state.visualizerAnimationFrame) { + cancelAnimationFrame(state.visualizerAnimationFrame); + state.visualizerAnimationFrame = null; + } + + // Reset state + state.isStreaming = false; + state.isSpeaking = false; + + // Notify the server + if (notifyServer && state.socket && state.socket.connected) { + state.socket.emit('stop_streaming', { + speaker: state.currentSpeaker + }); + } + + // Show message + addSystemMessage('Conversation paused. Click "Start Conversation" to resume.'); +} + +// Handle audio processing +function handleAudioProcess(event) { + const inputData = event.inputBuffer.getChannelData(0); + + // Calculate audio energy (volume level) + const energy = calculateAudioEnergy(inputData); + + // Update energy window for averaging + updateEnergyWindow(energy); + + // Calculate average energy + const avgEnergy = calculateAverageEnergy(); + + // Determine if audio is silent + const isSilent = avgEnergy < state.silenceThreshold; + + // Handle speech state based on silence + handleSpeechState(isSilent); + + // Only send audio chunk if we detect speech + if (!isSilent) { + // Create a resampled version at 24kHz for the server + // Most WebRTC audio is 48kHz, but we want 24kHz for the model + const resampledData = downsampleBuffer(inputData, state.audioContext.sampleRate, 24000); + + // Send the audio chunk to the server + sendAudioChunk(resampledData, state.currentSpeaker); + } +} + +// Cleanup audio resources when done +function cleanupAudioResources() { + // Stop all audio processing + if (state.streamProcessor) { + state.streamProcessor.disconnect(); + state.streamProcessor = null; + } + + if (state.analyser) { + state.analyser.disconnect(); + state.analyser = null; + } + + if (state.microphone) { + state.microphone.disconnect(); + state.microphone = null; + } + + // Close audio context + if (state.audioContext && state.audioContext.state !== 'closed') { + state.audioContext.close().catch(err => console.warn('Error closing audio context:', err)); + } + + // Cancel all timers and animation frames + if (state.volumeUpdateInterval) { + clearInterval(state.volumeUpdateInterval); + state.volumeUpdateInterval = null; + } + if (state.visualizerAnimationFrame) { cancelAnimationFrame(state.visualizerAnimationFrame); state.visualizerAnimationFrame = null; } - // Cancel silence timer if (state.silenceTimer) { clearTimeout(state.silenceTimer); state.silenceTimer = null; } - - // Reset visualizer display - if (canvasContext) { - canvasContext.clearRect(0, 0, elements.visualizerCanvas.width, elements.visualizerCanvas.height); - elements.visualizerLabel.style.opacity = '0.7'; - } } // Clear conversation history function clearConversation() { - // Clear UI - elements.conversation.innerHTML = ''; - addSystemMessage('Conversation cleared'); - - // Notify server - if (state.socket && state.socket.connected) { - state.socket.emit('clear_context'); + if (elements.conversation) { + elements.conversation.innerHTML = ''; + addSystemMessage('Conversation cleared.'); + + // Notify server to clear context + if (state.socket && state.socket.connected) { + state.socket.emit('clear_context'); + } } } @@ -390,9 +447,9 @@ function clearConversation() { function calculateAudioEnergy(buffer) { let sum = 0; for (let i = 0; i < buffer.length; i++) { - sum += Math.abs(buffer[i]); + sum += buffer[i] * buffer[i]; } - return sum / buffer.length; + return Math.sqrt(sum / buffer.length); } // 
Update energy window for averaging @@ -406,7 +463,9 @@ function updateEnergyWindow(energy) { // Calculate average energy from window function calculateAverageEnergy() { if (state.energyWindow.length === 0) return 0; - return state.energyWindow.reduce((sum, val) => sum + val, 0) / state.energyWindow.length; + + const sum = state.energyWindow.reduce((a, b) => a + b, 0); + return sum / state.energyWindow.length; } // Update the threshold from the slider @@ -417,32 +476,26 @@ function updateThreshold() { // Update the volume meter display function updateVolumeMeter() { - if (!state.isStreaming || !state.analyser) return; + if (!state.isStreaming || !state.energyWindow.length) return; - // Get current volume level - const dataArray = new Uint8Array(state.analyser.frequencyBinCount); - state.analyser.getByteFrequencyData(dataArray); + const avgEnergy = calculateAverageEnergy(); - // Calculate average volume - let sum = 0; - for (let i = 0; i < dataArray.length; i++) { - sum += dataArray[i]; - } - const average = sum / dataArray.length; + // Scale energy to percentage (0-100) + // Typically, energy values will be very small (e.g., 0.001 to 0.1) + // So we multiply by a factor to make it more visible + const scaleFactor = 1000; + const percentage = Math.min(100, Math.max(0, avgEnergy * scaleFactor)); - // Normalize to 0-100% - const percentage = Math.min(100, Math.max(0, average / 128 * 100)); - - // Invert because we're showing the "empty" portion - elements.volumeLevel.style.width = (100 - percentage) + '%'; + // Update volume meter width + elements.volumeLevel.style.width = `${percentage}%`; // Change color based on level if (percentage > 70) { - elements.volumeLevel.style.backgroundColor = 'rgba(244, 67, 54, 0.5)'; // Red + elements.volumeLevel.style.backgroundColor = '#ff5252'; } else if (percentage > 30) { - elements.volumeLevel.style.backgroundColor = 'rgba(255, 235, 59, 0.5)'; // Yellow + elements.volumeLevel.style.backgroundColor = '#4CAF50'; } else { - elements.volumeLevel.style.backgroundColor = 'rgba(0, 0, 0, 0.5)'; // Dark + elements.volumeLevel.style.backgroundColor = '#4c84ff'; } } @@ -452,31 +505,16 @@ function handleSpeechState(isSilent) { // Transition from speaking to silence if (!state.silenceTimer) { state.silenceTimer = setTimeout(() => { - // Silence persisted long enough - process the audio - elements.streamButton.innerHTML = ' Processing...'; - elements.streamButton.classList.remove('recording'); - elements.streamButton.classList.add('processing'); - addSystemMessage('Detected pause in speech, processing response...'); + // Only consider it a real silence after a certain duration + // This prevents detecting brief pauses as the end of speech + state.isSpeaking = false; + state.silenceTimer = null; }, CLIENT_SILENCE_DURATION_MS); } - } else if (!state.isSpeaking && !isSilent) { - // Transition from silence to speaking - state.isSpeaking = true; - elements.streamButton.innerHTML = ' Listening...'; - elements.streamButton.classList.add('recording'); - elements.streamButton.classList.remove('processing'); - - // Clear silence timer - if (state.silenceTimer) { - clearTimeout(state.silenceTimer); - state.silenceTimer = null; - } - } else if (state.isSpeaking && !isSilent) { - // Still speaking, reset silence timer - if (state.silenceTimer) { - clearTimeout(state.silenceTimer); - state.silenceTimer = null; - } + } else if (state.silenceTimer && !isSilent) { + // User started speaking again, cancel the silence timer + clearTimeout(state.silenceTimer); + state.silenceTimer = null; 
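Taken together, the energy window and silence timer above form a small end-of-utterance detector: average the last ENERGY_WINDOW_SIZE buffer energies and only treat the user as finished once that average stays below the threshold for CLIENT_SILENCE_DURATION_MS. A self-contained sketch of the same idea (function name and returned state strings are illustrative, not part of the patch):

// Tracks speech vs. silence from per-buffer RMS energies.
// update() returns 'speaking', 'pending-silence', or 'utterance-ended'.
function createUtteranceDetector({ windowSize = 15, holdMs = 750, threshold = 0.01 } = {}) {
    const energies = [];
    let silentSince = null;

    return function update(rmsEnergy, now = Date.now()) {
        energies.push(rmsEnergy);
        if (energies.length > windowSize) energies.shift();

        const avg = energies.reduce((a, b) => a + b, 0) / energies.length;
        if (avg >= threshold) {
            silentSince = null;                       // still speaking
            return 'speaking';
        }
        if (silentSince === null) silentSince = now;  // silence just started
        return (now - silentSince) >= holdMs ? 'utterance-ended' : 'pending-silence';
    };
}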
} // Update speaking state for non-silent audio @@ -488,7 +526,7 @@ function handleSpeechState(isSilent) { // Send audio chunk to server function sendAudioChunk(audioData, speaker) { if (!state.socket || !state.socket.connected) { - console.warn('Cannot send audio: socket not connected'); + console.warn('Socket not connected'); return; } @@ -498,10 +536,10 @@ function sendAudioChunk(audioData, speaker) { reader.onloadend = function() { const base64data = reader.result; - // Send to server using Socket.IO + // Send the audio chunk to the server state.socket.emit('stream_audio', { - speaker: speaker, - audio: base64data + audio: base64data, + speaker: speaker }); }; @@ -531,7 +569,7 @@ function drawVisualizer() { try { state.analyser.getByteFrequencyData(visualizerDataArray); } catch (e) { - console.error("Error getting frequency data:", e); + console.warn('Error getting frequency data:', e); } } else { // Fade out when not streaming From 6a8cc50dac2eb99eb8095e13e261f846bfd7612f Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sat, 29 Mar 2025 23:28:44 -0400 Subject: [PATCH 09/16] serve voice chat js --- Backend/server.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Backend/server.py b/Backend/server.py index e986606..4e60aa7 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -160,6 +160,10 @@ def favicon(): return send_from_directory(static_dir, 'favicon.ico') return Response(status=204) +@app.route('/voice-chat.js') +def voice_chat_js(): + return send_from_directory(base_dir, 'voice-chat.js') + @app.route('/static/') def serve_static(path): return send_from_directory(static_dir, path) From b74ae2dbfc449913e669e2c54e76e973ad63eb6f Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sat, 29 Mar 2025 23:43:16 -0400 Subject: [PATCH 10/16] Demo Update 3 --- Backend/server.py | 62 ++++++++-- Backend/voice-chat.js | 275 +++++++++++++++++++++--------------------- 2 files changed, 188 insertions(+), 149 deletions(-) diff --git a/Backend/server.py b/Backend/server.py index 4e60aa7..bacf793 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -55,27 +55,71 @@ active_clients = {} # Map client_id to client context def decode_audio_data(audio_data: str) -> torch.Tensor: """Decode base64 audio data to a torch tensor""" try: + # Skip empty audio data + if not audio_data: + print("Empty audio data received") + return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence + # Extract the actual base64 content if ',' in audio_data: audio_data = audio_data.split(',')[1] - + # Decode base64 audio data - binary_data = base64.b64decode(audio_data) + try: + binary_data = base64.b64decode(audio_data) + print(f"Decoded base64 data: {len(binary_data)} bytes") + except Exception as e: + print(f"Base64 decoding error: {str(e)}") + return torch.zeros(generator.sample_rate // 2) + # Debug: save the raw binary data to examine with external tools + debug_path = os.path.join(base_dir, "debug_incoming.wav") + with open(debug_path, 'wb') as f: + f.write(binary_data) + print(f"Saved debug file to {debug_path}") + # Load audio from binary data - with BytesIO(binary_data) as temp_file: - audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav") + try: + with BytesIO(binary_data) as temp_file: + audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav") + print(f"Loaded audio: shape={audio_tensor.shape}, sample_rate={sample_rate}Hz") + + # Check if audio is valid + if audio_tensor.numel() == 0 or torch.isnan(audio_tensor).any(): + print("Warning: Empty or invalid audio data 
detected") + return torch.zeros(generator.sample_rate // 2) + except Exception as e: + print(f"Audio loading error: {str(e)}") + # Try saving to a temporary file instead of loading from BytesIO + try: + temp_path = os.path.join(base_dir, "temp_incoming.wav") + with open(temp_path, 'wb') as f: + f.write(binary_data) + print(f"Trying to load from file: {temp_path}") + audio_tensor, sample_rate = torchaudio.load(temp_path, format="wav") + print(f"Loaded from file: shape={audio_tensor.shape}, sample_rate={sample_rate}Hz") + os.remove(temp_path) + except Exception as e2: + print(f"Secondary audio loading error: {str(e2)}") + return torch.zeros(generator.sample_rate // 2) # Resample if needed if sample_rate != generator.sample_rate: - audio_tensor = torchaudio.functional.resample( - audio_tensor.squeeze(0), - orig_freq=sample_rate, - new_freq=generator.sample_rate - ) + try: + print(f"Resampling from {sample_rate}Hz to {generator.sample_rate}Hz") + audio_tensor = torchaudio.functional.resample( + audio_tensor.squeeze(0), + orig_freq=sample_rate, + new_freq=generator.sample_rate + ) + print(f"Resampled audio shape: {audio_tensor.shape}") + except Exception as e: + print(f"Resampling error: {str(e)}") + return torch.zeros(generator.sample_rate // 2) else: audio_tensor = audio_tensor.squeeze(0) + print(f"Final audio tensor shape: {audio_tensor.shape}") return audio_tensor except Exception as e: print(f"Error decoding audio: {str(e)}") diff --git a/Backend/voice-chat.js b/Backend/voice-chat.js index a4e10f5..c85da8a 100644 --- a/Backend/voice-chat.js +++ b/Backend/voice-chat.js @@ -70,88 +70,18 @@ function initializeApp() { // Initialize UI elements function initializeUIElements() { - // Main UI containers - const chatContainer = document.querySelector('.chat-container'); - const controlPanel = document.querySelector('.control-panel'); - - // Create conversation section - chatContainer.innerHTML = ` -
-        <!-- Conversation panel markup: "Conversation" header, message list, connection status ("Disconnected") -->
-    `;
-
-    // Create control panel
-    controlPanel.innerHTML = `
-        <!-- Audio visualizer canvas ("Speak to see audio visualization") and volume meter -->
-        <!-- "Voice Controls": speaker select, silence-threshold slider (0.01), start/stop and clear buttons -->
-        <!-- "Settings": auto-play responses and show-visualizer toggles -->
-
- `; - // Store references to UI elements - elements.conversation = document.querySelector('.conversation'); + elements.conversation = document.getElementById('conversation'); elements.streamButton = document.getElementById('streamButton'); elements.clearButton = document.getElementById('clearButton'); elements.thresholdSlider = document.getElementById('thresholdSlider'); elements.thresholdValue = document.getElementById('thresholdValue'); elements.visualizerCanvas = document.getElementById('audioVisualizer'); - elements.visualizerLabel = document.querySelector('.visualizer-label'); - elements.volumeLevel = document.querySelector('.volume-level'); - elements.statusDot = document.querySelector('.status-dot'); - elements.statusText = document.querySelector('.status-text'); - elements.speakerSelection = document.getElementById('speakerSelection'); + elements.visualizerLabel = document.getElementById('visualizerLabel'); + elements.volumeLevel = document.getElementById('volumeLevel'); + elements.statusDot = document.getElementById('statusDot'); + elements.statusText = document.getElementById('statusText'); + elements.speakerSelection = document.getElementById('speakerSelect'); // Changed to match HTML elements.autoPlayResponses = document.getElementById('autoPlayResponses'); elements.showVisualizer = document.getElementById('showVisualizer'); } @@ -364,8 +294,12 @@ function stopStreaming(notifyServer = true) { function handleAudioProcess(event) { const inputData = event.inputBuffer.getChannelData(0); + // Log audio buffer statistics + console.log(`Audio buffer: length=${inputData.length}, sample rate=${state.audioContext.sampleRate}Hz`); + // Calculate audio energy (volume level) const energy = calculateAudioEnergy(inputData); + console.log(`Energy: ${energy.toFixed(6)}, threshold: ${state.silenceThreshold}`); // Update energy window for averaging updateEnergyWindow(energy); @@ -375,6 +309,7 @@ function handleAudioProcess(event) { // Determine if audio is silent const isSilent = avgEnergy < state.silenceThreshold; + console.log(`Silent: ${isSilent ? 'Yes' : 'No'}, avg energy: ${avgEnergy.toFixed(6)}`); // Handle speech state based on silence handleSpeechState(isSilent); @@ -384,6 +319,7 @@ function handleAudioProcess(event) { // Create a resampled version at 24kHz for the server // Most WebRTC audio is 48kHz, but we want 24kHz for the model const resampledData = downsampleBuffer(inputData, state.audioContext.sampleRate, 24000); + console.log(`Resampled audio: ${state.audioContext.sampleRate}Hz → 24000Hz, new length: ${resampledData.length}`); // Send the audio chunk to the server sendAudioChunk(resampledData, state.currentSpeaker); @@ -530,20 +466,132 @@ function sendAudioChunk(audioData, speaker) { return; } - const wavData = createWavBlob(audioData, 24000); - const reader = new FileReader(); + console.log(`Creating WAV from audio data: length=${audioData.length}`); - reader.onloadend = function() { - const base64data = reader.result; + // Check for NaN or invalid values + let hasNaN = false; + let min = Infinity; + let max = -Infinity; + let sum = 0; + + for (let i = 0; i < audioData.length; i++) { + if (isNaN(audioData[i]) || !isFinite(audioData[i])) { + hasNaN = true; + console.warn(`Invalid audio value at index ${i}: ${audioData[i]}`); + break; + } + min = Math.min(min, audioData[i]); + max = Math.max(max, audioData[i]); + sum += audioData[i]; + } + + if (hasNaN) { + console.warn('Audio data contains NaN or Infinity values. 
Creating silent audio instead.'); + audioData = new Float32Array(audioData.length).fill(0); + } else { + const avg = sum / audioData.length; + console.log(`Audio stats: min=${min.toFixed(4)}, max=${max.toFixed(4)}, avg=${avg.toFixed(4)}`); + } + + try { + // Create WAV blob with proper format + const wavData = createWavBlob(audioData, 24000); + console.log(`WAV blob created: size=${wavData.size} bytes, type=${wavData.type}`); - // Send the audio chunk to the server - state.socket.emit('stream_audio', { - audio: base64data, - speaker: speaker - }); - }; + const reader = new FileReader(); + + reader.onloadend = function() { + try { + // Get base64 data + const base64data = reader.result; + console.log(`Base64 data created: length=${base64data.length}`); + + // Validate the base64 data before sending + if (!base64data || base64data.length < 100) { + console.warn('Generated base64 data is too small or invalid'); + return; + } + + // Send the audio chunk to the server + console.log('Sending audio data to server...'); + state.socket.emit('stream_audio', { + audio: base64data, + speaker: speaker + }); + console.log('Audio data sent successfully'); + } catch (err) { + console.error('Error preparing audio data:', err); + } + }; + + reader.onerror = function(err) { + console.error('Error reading audio data:', err); + }; + + reader.readAsDataURL(wavData); + } catch (err) { + console.error('Error creating WAV data:', err); + } +} + +// Create WAV blob from audio data with validation +function createWavBlob(audioData, sampleRate) { + // Check if audio data is valid + if (!audioData || audioData.length === 0) { + console.warn('Empty audio data received'); + // Return a tiny silent audio snippet instead + audioData = new Float32Array(100).fill(0); + } - reader.readAsDataURL(wavData); + // Function to convert Float32Array to Int16Array for WAV format + function floatTo16BitPCM(output, offset, input) { + for (let i = 0; i < input.length; i++, offset += 2) { + const s = Math.max(-1, Math.min(1, input[i])); + output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true); + } + } + + // Create WAV header + function writeString(view, offset, string) { + for (let i = 0; i < string.length; i++) { + view.setUint8(offset + i, string.charCodeAt(i)); + } + } + + // Create WAV file with header + function encodeWAV(samples) { + const buffer = new ArrayBuffer(44 + samples.length * 2); + const view = new DataView(buffer); + + // RIFF chunk descriptor + writeString(view, 0, 'RIFF'); + view.setUint32(4, 36 + samples.length * 2, true); + writeString(view, 8, 'WAVE'); + + // fmt sub-chunk + writeString(view, 12, 'fmt '); + view.setUint32(16, 16, true); + view.setUint16(20, 1, true); // PCM format + view.setUint16(22, 1, true); // Mono channel + view.setUint32(24, sampleRate, true); + view.setUint32(28, sampleRate * 2, true); // Byte rate + view.setUint16(32, 2, true); // Block align + view.setUint16(34, 16, true); // Bits per sample + + // data sub-chunk + writeString(view, 36, 'data'); + view.setUint32(40, samples.length * 2, true); + floatTo16BitPCM(view, 44, samples); + + return buffer; + } + + // Convert audio data to TypedArray if it's a regular Array + const samples = Array.isArray(audioData) ? 
new Float32Array(audioData) : audioData; + + // Create WAV blob + const wavBuffer = encodeWAV(samples); + return new Blob([wavBuffer], { type: 'audio/wav' }); } // Draw audio visualizer @@ -757,59 +805,6 @@ function addSystemMessage(message) { elements.conversation.scrollTop = elements.conversation.scrollHeight; } -// Create WAV blob from audio data -function createWavBlob(audioData, sampleRate) { - // Function to convert Float32Array to Int16Array for WAV format - function floatTo16BitPCM(output, offset, input) { - for (let i = 0; i < input.length; i++, offset += 2) { - const s = Math.max(-1, Math.min(1, input[i])); - output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true); - } - } - - // Create WAV header - function writeString(view, offset, string) { - for (let i = 0; i < string.length; i++) { - view.setUint8(offset + i, string.charCodeAt(i)); - } - } - - // Create WAV file with header - function encodeWAV(samples) { - const buffer = new ArrayBuffer(44 + samples.length * 2); - const view = new DataView(buffer); - - // RIFF chunk descriptor - writeString(view, 0, 'RIFF'); - view.setUint32(4, 36 + samples.length * 2, true); - writeString(view, 8, 'WAVE'); - - // fmt sub-chunk - writeString(view, 12, 'fmt '); - view.setUint32(16, 16, true); - view.setUint16(20, 1, true); // PCM format - view.setUint16(22, 1, true); // Mono channel - view.setUint32(24, sampleRate, true); - view.setUint32(28, sampleRate * 2, true); // Byte rate - view.setUint16(32, 2, true); // Block align - view.setUint16(34, 16, true); // Bits per sample - - // data sub-chunk - writeString(view, 36, 'data'); - view.setUint32(40, samples.length * 2, true); - floatTo16BitPCM(view, 44, samples); - - return buffer; - } - - // Convert audio data to TypedArray if it's a regular Array - const samples = Array.isArray(audioData) ? 
new Float32Array(audioData) : audioData; - - // Create WAV blob - const wavBuffer = encodeWAV(samples); - return new Blob([wavBuffer], { type: 'audio/wav' }); -} - // Downsample audio buffer to target sample rate function downsampleBuffer(buffer, originalSampleRate, targetSampleRate) { if (originalSampleRate === targetSampleRate) { From eef7da454a082220b6d106558baf1f36f69aac73 Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sat, 29 Mar 2025 23:54:02 -0400 Subject: [PATCH 11/16] Demo Update 3 --- Backend/server.py | 296 ++++++++++++++++++++++++++++++++++-------- Backend/voice-chat.js | 136 +++++++++++-------- 2 files changed, 320 insertions(+), 112 deletions(-) diff --git a/Backend/server.py b/Backend/server.py index bacf793..b638e99 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -16,6 +16,28 @@ import gc from collections import deque from threading import Lock +# Add these lines right after your imports +import torch +import os + +# Handle CUDA issues +os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit to first GPU only +torch.backends.cudnn.benchmark = True + +# Set CUDA settings to avoid TF32 warnings +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True + +# Set compute type based on available hardware +if torch.cuda.is_available(): + device = "cuda" + compute_type = "float16" # Faster for CUDA +else: + device = "cpu" + compute_type = "int8" # Better for CPU + +print(f"Using device: {device} with compute type: {compute_type}") + # Select device if torch.cuda.is_available(): device = "cuda" @@ -28,9 +50,22 @@ generator = load_csm_1b(device=device) # Initialize WhisperX for ASR print("Loading WhisperX model...") -# Use a smaller model for faster response times -asr_model = whisperx.load_model("medium", device, compute_type="float16") -print("WhisperX model loaded!") +try: + # Try to load a smaller model for faster response times + asr_model = whisperx.load_model("small", device, compute_type=compute_type) + print("WhisperX 'small' model loaded successfully") +except Exception as e: + print(f"Error loading 'small' model: {str(e)}") + try: + # Fall back to tiny model if small fails + asr_model = whisperx.load_model("tiny", device, compute_type=compute_type) + print("WhisperX 'tiny' model loaded as fallback") + except Exception as e2: + print(f"Error loading fallback model: {str(e2)}") + print("Trying CPU model as last resort") + # Last resort - try CPU + asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") + print("WhisperX loaded on CPU as last resort") # Silence detection parameters SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization @@ -53,76 +88,130 @@ active_clients = {} # Map client_id to client context # Helper function to convert audio data def decode_audio_data(audio_data: str) -> torch.Tensor: - """Decode base64 audio data to a torch tensor""" + """Decode base64 audio data to a torch tensor with improved error handling""" try: # Skip empty audio data - if not audio_data: - print("Empty audio data received") + if not audio_data or len(audio_data) < 100: + print("Empty or too short audio data received") return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence # Extract the actual base64 content if ',' in audio_data: + # Handle data URL format (data:audio/wav;base64,...) 
audio_data = audio_data.split(',')[1] # Decode base64 audio data try: binary_data = base64.b64decode(audio_data) print(f"Decoded base64 data: {len(binary_data)} bytes") + + # Check if we have enough data for a valid WAV + if len(binary_data) < 44: # WAV header is 44 bytes + print("Data too small to be a valid WAV file") + return torch.zeros(generator.sample_rate // 2) except Exception as e: print(f"Base64 decoding error: {str(e)}") return torch.zeros(generator.sample_rate // 2) - # Debug: save the raw binary data to examine with external tools + # Save for debugging debug_path = os.path.join(base_dir, "debug_incoming.wav") with open(debug_path, 'wb') as f: f.write(binary_data) - print(f"Saved debug file to {debug_path}") - - # Load audio from binary data + print(f"Saved debug file: {debug_path}") + + # Approach 1: Load directly with torchaudio try: with BytesIO(binary_data) as temp_file: + temp_file.seek(0) # Ensure we're at the start of the buffer audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav") - print(f"Loaded audio: shape={audio_tensor.shape}, sample_rate={sample_rate}Hz") + print(f"Direct loading success: shape={audio_tensor.shape}, rate={sample_rate}Hz") # Check if audio is valid if audio_tensor.numel() == 0 or torch.isnan(audio_tensor).any(): - print("Warning: Empty or invalid audio data detected") - return torch.zeros(generator.sample_rate // 2) + raise ValueError("Empty or invalid audio tensor detected") except Exception as e: - print(f"Audio loading error: {str(e)}") - # Try saving to a temporary file instead of loading from BytesIO + print(f"Direct loading failed: {str(e)}") + + # Approach 2: Try to fix/normalize the WAV data try: - temp_path = os.path.join(base_dir, "temp_incoming.wav") + # Sometimes WAV headers can be malformed, attempt to fix + temp_path = os.path.join(base_dir, "temp_fixing.wav") with open(temp_path, 'wb') as f: f.write(binary_data) - print(f"Trying to load from file: {temp_path}") - audio_tensor, sample_rate = torchaudio.load(temp_path, format="wav") - print(f"Loaded from file: shape={audio_tensor.shape}, sample_rate={sample_rate}Hz") - os.remove(temp_path) + + # Use a simpler numpy approach as backup + import numpy as np + import wave + + try: + with wave.open(temp_path, 'rb') as wf: + n_channels = wf.getnchannels() + sample_width = wf.getsampwidth() + sample_rate = wf.getframerate() + n_frames = wf.getnframes() + + # Read the frames + frames = wf.readframes(n_frames) + print(f"Wave reading: channels={n_channels}, rate={sample_rate}Hz, frames={n_frames}") + + # Convert to numpy and then to torch + if sample_width == 2: # 16-bit audio + data = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0 + elif sample_width == 1: # 8-bit audio + data = np.frombuffer(frames, dtype=np.uint8).astype(np.float32) / 128.0 - 1.0 + else: + raise ValueError(f"Unsupported sample width: {sample_width}") + + # Convert to mono if needed + if n_channels > 1: + data = data.reshape(-1, n_channels) + data = data.mean(axis=1) + + # Convert to torch tensor + audio_tensor = torch.from_numpy(data) + print(f"Successfully converted with numpy: shape={audio_tensor.shape}") + except Exception as wave_error: + print(f"Wave processing failed: {str(wave_error)}") + # Try with torchaudio as last resort + audio_tensor, sample_rate = torchaudio.load(temp_path, format="wav") + + # Clean up + if os.path.exists(temp_path): + os.remove(temp_path) except Exception as e2: - print(f"Secondary audio loading error: {str(e2)}") + print(f"All WAV loading methods failed: 
{str(e2)}") + print("Returning silence as fallback") return torch.zeros(generator.sample_rate // 2) + # Ensure audio is the right shape (mono) + if len(audio_tensor.shape) > 1 and audio_tensor.shape[0] > 1: + audio_tensor = torch.mean(audio_tensor, dim=0) + + # Ensure we have a 1D tensor + audio_tensor = audio_tensor.squeeze() + # Resample if needed if sample_rate != generator.sample_rate: try: print(f"Resampling from {sample_rate}Hz to {generator.sample_rate}Hz") - audio_tensor = torchaudio.functional.resample( - audio_tensor.squeeze(0), + resampler = torchaudio.transforms.Resample( orig_freq=sample_rate, new_freq=generator.sample_rate ) - print(f"Resampled audio shape: {audio_tensor.shape}") + audio_tensor = resampler(audio_tensor) except Exception as e: print(f"Resampling error: {str(e)}") - return torch.zeros(generator.sample_rate // 2) - else: - audio_tensor = audio_tensor.squeeze(0) - - print(f"Final audio tensor shape: {audio_tensor.shape}") + # If resampling fails, just return the original audio + # The model can often handle different sample rates + + # Normalize audio to avoid issues + if torch.abs(audio_tensor).max() > 0: + audio_tensor = audio_tensor / torch.abs(audio_tensor).max() + + print(f"Final audio tensor: shape={audio_tensor.shape}, min={audio_tensor.min().item():.4f}, max={audio_tensor.max().item():.4f}") return audio_tensor except Exception as e: - print(f"Error decoding audio: {str(e)}") + print(f"Unhandled error in decode_audio_data: {str(e)}") # Return a small silent audio segment as fallback return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence @@ -143,6 +232,8 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str: temp_path = os.path.join(base_dir, "temp_audio.wav") torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate) + print(f"Transcribing audio file: {temp_path} (size: {os.path.getsize(temp_path)} bytes)") + # Load and transcribe the audio audio = whisperx.load_audio(temp_path) result = asr_model.transcribe(audio, batch_size=16) @@ -155,11 +246,15 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str: if result["segments"] and len(result["segments"]) > 0: # Combine all segments transcription = " ".join([segment["text"] for segment in result["segments"]]) + print(f"Transcription successful: '{transcription.strip()}'") return transcription.strip() else: + print("Transcription returned no segments") return "" except Exception as e: print(f"Error in transcription: {str(e)}") + import traceback + traceback.print_exc() if os.path.exists("temp_audio.wav"): os.remove("temp_audio.wav") return "" @@ -385,43 +480,73 @@ def handle_stream_audio(data): # Log the transcription print(f"[{client_id}] Transcribed text: '{transcribed_text}'") - # Add to conversation context + # Handle the transcription result if transcribed_text: + # Add user message to context user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) client['context_segments'].append(user_segment) - # Generate a contextual response - response_text = generate_response(transcribed_text, client['context_segments']) - # Send the transcribed text to client emit('transcription', { 'type': 'transcription', 'text': transcribed_text }) - # Generate audio for the response - audio_tensor = generator.generate( - text=response_text, - speaker=1 if speaker_id == 0 else 0, # Use opposite speaker - context=client['context_segments'], - max_audio_length_ms=10_000, - ) + # Generate a contextual response + response_text = 
generate_response(transcribed_text, client['context_segments']) + print(f"[{client_id}] Generating audio response: '{response_text}'") - # Add response to context - ai_segment = Segment( - text=response_text, - speaker=1 if speaker_id == 0 else 0, - audio=audio_tensor - ) - client['context_segments'].append(ai_segment) - - # Convert audio to base64 and send back to client - audio_base64 = encode_audio_data(audio_tensor) - emit('audio_response', { - 'type': 'audio_response', - 'text': response_text, - 'audio': audio_base64 + # Let the client know we're processing + emit('processing_status', { + 'type': 'processing_status', + 'status': 'generating_audio', + 'message': 'Generating audio response...' }) + + # Generate audio for the response + try: + # Use a different speaker than the user + ai_speaker_id = 1 if speaker_id == 0 else 0 + + # Start audio generation with streaming (chunk by chunk) + audio_chunks = [] + + # This version tries to stream the audio generation in smaller chunks + # Note: CSM model doesn't natively support incremental generation, + # so we're simulating it here for a more responsive UI experience + + # Generate the full response + audio_tensor = generator.generate( + text=response_text, + speaker=ai_speaker_id, + context=client['context_segments'], + max_audio_length_ms=10_000, + ) + + # Add response to context + ai_segment = Segment( + text=response_text, + speaker=ai_speaker_id, + audio=audio_tensor + ) + client['context_segments'].append(ai_segment) + + # Convert audio to base64 and send back to client + audio_base64 = encode_audio_data(audio_tensor) + emit('audio_response', { + 'type': 'audio_response', + 'text': response_text, + 'audio': audio_base64 + }) + + print(f"[{client_id}] Audio response sent: {len(audio_base64)} bytes") + + except Exception as gen_error: + print(f"Error generating audio response: {str(gen_error)}") + emit('error', { + 'type': 'error', + 'message': "Sorry, there was an error generating the audio response." 
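# A minimal sketch of a Python Socket.IO client driving the 'generate' and
# 'audio_response' events used here; assumes the python-socketio package and a
# server URL of http://localhost:5000 (the port is an assumption, not taken from
# this patch).
import socketio

sio = socketio.Client()

@sio.on('audio_response')
def on_audio_response(data):
    # data['audio'] is the base64 WAV data URL produced by encode_audio_data()
    print(f"Got response '{data.get('text', '')}' ({len(data['audio'])} chars of audio)")

@sio.on('error')
def on_error(data):
    print("Server error:", data.get('message'))

sio.connect('http://localhost:5000')
sio.emit('generate', {'text': 'Hello there!', 'speaker': 0})
sio.wait()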
+ }) else: # If transcription failed, send a generic response emit('error', { @@ -437,6 +562,7 @@ def handle_stream_audio(data): # If buffer gets too large without silence, process it anyway elif len(client['streaming_buffer']) >= 30: # ~6 seconds of audio at 5 chunks/sec + print(f"[{client_id}] Processing long audio segment without silence") full_audio = torch.cat(client['streaming_buffer'], dim=0) # Process with WhisperX speech-to-text @@ -453,7 +579,9 @@ def handle_stream_audio(data): 'text': transcribed_text + " (processing continued speech...)" }) - client['streaming_buffer'] = [] + # Keep half of the buffer for context (sliding window approach) + half_point = len(client['streaming_buffer']) // 2 + client['streaming_buffer'] = client['streaming_buffer'][half_point:] except Exception as e: import traceback @@ -497,6 +625,62 @@ def handle_stop_streaming(data): 'status': 'stopped' }) +def stream_audio_to_client(client_id, audio_tensor, text, speaker_id, chunk_size_ms=500): + """Stream audio to client in chunks to simulate real-time generation""" + try: + if client_id not in active_clients: + print(f"Client {client_id} not found for streaming") + return + + # Calculate chunk size in samples + chunk_size = int(generator.sample_rate * chunk_size_ms / 1000) + total_chunks = math.ceil(audio_tensor.size(0) / chunk_size) + + print(f"Streaming audio in {total_chunks} chunks of {chunk_size_ms}ms each") + + # Send initial response with text but no audio yet + socketio.emit('audio_response_start', { + 'type': 'audio_response_start', + 'text': text, + 'total_chunks': total_chunks + }, room=client_id) + + # Stream each chunk + for i in range(total_chunks): + start_idx = i * chunk_size + end_idx = min(start_idx + chunk_size, audio_tensor.size(0)) + + # Extract chunk + chunk = audio_tensor[start_idx:end_idx] + + # Encode chunk + chunk_base64 = encode_audio_data(chunk) + + # Send chunk + socketio.emit('audio_response_chunk', { + 'type': 'audio_response_chunk', + 'chunk_index': i, + 'total_chunks': total_chunks, + 'audio': chunk_base64, + 'is_last': i == total_chunks - 1 + }, room=client_id) + + # Brief pause between chunks to simulate streaming + time.sleep(0.1) + + # Send completion message + socketio.emit('audio_response_complete', { + 'type': 'audio_response_complete', + 'text': text + }, room=client_id) + + print(f"Audio streaming complete: {total_chunks} chunks sent") + + except Exception as e: + print(f"Error streaming audio to client: {str(e)}") + import traceback + traceback.print_exc() + if __name__ == "__main__": print(f"\n{'='*60}") print(f"🔊 Sesame AI Voice Chat Server (Flask Implementation)") diff --git a/Backend/voice-chat.js b/Backend/voice-chat.js index c85da8a..b224b27 100644 --- a/Backend/voice-chat.js +++ b/Backend/voice-chat.js @@ -466,37 +466,27 @@ function sendAudioChunk(audioData, speaker) { return; } - console.log(`Creating WAV from audio data: length=${audioData.length}`); + console.log(`Preparing audio chunk: length=${audioData.length}, speaker=${speaker}`); // Check for NaN or invalid values - let hasNaN = false; - let min = Infinity; - let max = -Infinity; - let sum = 0; - + let hasInvalidValues = false; for (let i = 0; i < audioData.length; i++) { if (isNaN(audioData[i]) || !isFinite(audioData[i])) { - hasNaN = true; + hasInvalidValues = true; console.warn(`Invalid audio value at index ${i}: ${audioData[i]}`); break; } - min = Math.min(min, audioData[i]); - max = Math.max(max, audioData[i]); - sum += audioData[i]; } - if (hasNaN) { - console.warn('Audio data contains NaN or 
Infinity values. Creating silent audio instead.'); + if (hasInvalidValues) { + console.warn('Audio data contains invalid values. Creating silent audio.'); audioData = new Float32Array(audioData.length).fill(0); - } else { - const avg = sum / audioData.length; - console.log(`Audio stats: min=${min.toFixed(4)}, max=${max.toFixed(4)}, avg=${avg.toFixed(4)}`); } try { - // Create WAV blob with proper format + // Create WAV blob const wavData = createWavBlob(audioData, 24000); - console.log(`WAV blob created: size=${wavData.size} bytes, type=${wavData.type}`); + console.log(`WAV blob created: ${wavData.size} bytes`); const reader = new FileReader(); @@ -504,28 +494,21 @@ function sendAudioChunk(audioData, speaker) { try { // Get base64 data const base64data = reader.result; - console.log(`Base64 data created: length=${base64data.length}`); + console.log(`Base64 data created: ${base64data.length} bytes`); - // Validate the base64 data before sending - if (!base64data || base64data.length < 100) { - console.warn('Generated base64 data is too small or invalid'); - return; - } - - // Send the audio chunk to the server - console.log('Sending audio data to server...'); + // Send to server state.socket.emit('stream_audio', { audio: base64data, speaker: speaker }); - console.log('Audio data sent successfully'); + console.log('Audio chunk sent to server'); } catch (err) { console.error('Error preparing audio data:', err); } }; - reader.onerror = function(err) { - console.error('Error reading audio data:', err); + reader.onerror = function() { + console.error('Error reading audio data as base64'); }; reader.readAsDataURL(wavData); @@ -534,19 +517,20 @@ function sendAudioChunk(audioData, speaker) { } } -// Create WAV blob from audio data with validation +// Create WAV blob from audio data with improved error handling function createWavBlob(audioData, sampleRate) { - // Check if audio data is valid + // Validate input if (!audioData || audioData.length === 0) { - console.warn('Empty audio data received'); - // Return a tiny silent audio snippet instead - audioData = new Float32Array(100).fill(0); + console.warn('Empty audio data provided to createWavBlob'); + audioData = new Float32Array(1024).fill(0); // Create 1024 samples of silence } // Function to convert Float32Array to Int16Array for WAV format function floatTo16BitPCM(output, offset, input) { for (let i = 0; i < input.length; i++, offset += 2) { + // Ensure values are in -1 to 1 range const s = Math.max(-1, Math.min(1, input[i])); + // Convert to 16-bit PCM output.setInt16(offset, s < 0 ? 
s * 0x8000 : s * 0x7FFF, true); } } @@ -558,40 +542,80 @@ function createWavBlob(audioData, sampleRate) { } } - // Create WAV file with header - function encodeWAV(samples) { - const buffer = new ArrayBuffer(44 + samples.length * 2); + try { + // Create WAV file with header - careful with buffer sizes + const buffer = new ArrayBuffer(44 + audioData.length * 2); const view = new DataView(buffer); - // RIFF chunk descriptor + // RIFF identifier writeString(view, 0, 'RIFF'); - view.setUint32(4, 36 + samples.length * 2, true); + + // File length (will be filled later) + view.setUint32(4, 36 + audioData.length * 2, true); + + // WAVE identifier writeString(view, 8, 'WAVE'); - // fmt sub-chunk + // fmt chunk identifier writeString(view, 12, 'fmt '); + + // fmt chunk length view.setUint32(16, 16, true); - view.setUint16(20, 1, true); // PCM format - view.setUint16(22, 1, true); // Mono channel + + // Sample format (1 is PCM) + view.setUint16(20, 1, true); + + // Mono channel + view.setUint16(22, 1, true); + + // Sample rate view.setUint32(24, sampleRate, true); - view.setUint32(28, sampleRate * 2, true); // Byte rate - view.setUint16(32, 2, true); // Block align - view.setUint16(34, 16, true); // Bits per sample - // data sub-chunk + // Byte rate (sample rate * block align) + view.setUint32(28, sampleRate * 2, true); + + // Block align (channels * bytes per sample) + view.setUint16(32, 2, true); + + // Bits per sample + view.setUint16(34, 16, true); + + // data chunk identifier writeString(view, 36, 'data'); - view.setUint32(40, samples.length * 2, true); - floatTo16BitPCM(view, 44, samples); - return buffer; + // data chunk length + view.setUint32(40, audioData.length * 2, true); + + // Write the PCM samples + floatTo16BitPCM(view, 44, audioData); + + // Create and return blob + return new Blob([view], { type: 'audio/wav' }); + } catch (err) { + console.error('Error in createWavBlob:', err); + + // Create a minimal valid WAV file with silence as fallback + const fallbackSamples = new Float32Array(1024).fill(0); + const fallbackBuffer = new ArrayBuffer(44 + fallbackSamples.length * 2); + const fallbackView = new DataView(fallbackBuffer); + + writeString(fallbackView, 0, 'RIFF'); + fallbackView.setUint32(4, 36 + fallbackSamples.length * 2, true); + writeString(fallbackView, 8, 'WAVE'); + writeString(fallbackView, 12, 'fmt '); + fallbackView.setUint32(16, 16, true); + fallbackView.setUint16(20, 1, true); + fallbackView.setUint16(22, 1, true); + fallbackView.setUint32(24, sampleRate, true); + fallbackView.setUint32(28, sampleRate * 2, true); + fallbackView.setUint16(32, 2, true); + fallbackView.setUint16(34, 16, true); + writeString(fallbackView, 36, 'data'); + fallbackView.setUint32(40, fallbackSamples.length * 2, true); + floatTo16BitPCM(fallbackView, 44, fallbackSamples); + + return new Blob([fallbackView], { type: 'audio/wav' }); } - - // Convert audio data to TypedArray if it's a regular Array - const samples = Array.isArray(audioData) ? 
new Float32Array(audioData) : audioData; - - // Create WAV blob - const wavBuffer = encodeWAV(samples); - return new Blob([wavBuffer], { type: 'audio/wav' }); } // Draw audio visualizer From 230117a0225b9df857810defbcfa9487a3bf6755 Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sun, 30 Mar 2025 00:14:47 -0400 Subject: [PATCH 12/16] Demo Update 4 --- Backend/server.py | 150 +++++++++++++++++++++++++++++++++------------- 1 file changed, 107 insertions(+), 43 deletions(-) diff --git a/Backend/server.py b/Backend/server.py index b638e99..a6b70a3 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -16,56 +16,91 @@ import gc from collections import deque from threading import Lock -# Add these lines right after your imports -import torch -import os +# Add this at the top of your file, replacing your current CUDA setup -# Handle CUDA issues -os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit to first GPU only -torch.backends.cudnn.benchmark = True - -# Set CUDA settings to avoid TF32 warnings -torch.backends.cuda.matmul.allow_tf32 = True -torch.backends.cudnn.allow_tf32 = True - -# Set compute type based on available hardware -if torch.cuda.is_available(): - device = "cuda" - compute_type = "float16" # Faster for CUDA -else: +# CUDA setup with robust error handling +try: + # Handle CUDA issues + os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit to first GPU only + + # Try enabling TF32 precision + try: + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + except: + pass # Ignore if not supported + + # Check if CUDA is available + if torch.cuda.is_available(): + try: + # Test CUDA functionality + x = torch.rand(10, device="cuda") + y = x + x + del x, y + device = "cuda" + compute_type = "float16" + print("CUDA is fully functional") + except Exception as cuda_error: + print(f"CUDA is available but not working correctly: {str(cuda_error)}") + device = "cpu" + compute_type = "int8" + else: + device = "cpu" + compute_type = "int8" +except Exception as e: + print(f"Error setting up CUDA: {str(e)}") device = "cpu" - compute_type = "int8" # Better for CPU + compute_type = "int8" print(f"Using device: {device} with compute type: {compute_type}") -# Select device -if torch.cuda.is_available(): - device = "cuda" -else: - device = "cpu" -print(f"Using device: {device}") +# Initialize the Sesame CSM model with robust error handling +try: + print(f"Loading Sesame CSM model on {device}...") + generator = load_csm_1b(device=device) + print("Sesame CSM model loaded successfully") +except Exception as model_error: + print(f"Error loading Sesame CSM on {device}: {str(model_error)}") + if device == "cuda": + # Try on CPU as fallback + try: + print("Trying to load Sesame CSM on CPU instead...") + device = "cpu" # Update global device setting + generator = load_csm_1b(device="cpu") + print("Sesame CSM model loaded on CPU successfully") + except Exception as cpu_error: + print(f"Fatal error - could not load Sesame CSM model: {str(cpu_error)}") + raise RuntimeError("Failed to load speech synthesis model") + else: + # Already tried CPU and it failed + raise RuntimeError("Failed to load speech synthesis model on any device") -# Initialize the model -generator = load_csm_1b(device=device) - -# Initialize WhisperX for ASR +# Initialize WhisperX for ASR with robust error handling print("Loading WhisperX model...") try: - # Try to load a smaller model for faster response times - asr_model = whisperx.load_model("small", device, compute_type=compute_type) - print("WhisperX 'small' model 
loaded successfully") + # First try the smallest model ("tiny") to avoid memory issues + asr_model = whisperx.load_model("tiny", device, compute_type=compute_type) + print("WhisperX 'tiny' model loaded successfully") + + # If tiny worked and we have CUDA, try upgrading to small + if device == "cuda": + try: + asr_model = whisperx.load_model("small", device, compute_type=compute_type) + print("WhisperX 'small' model loaded successfully") + except Exception as upgrade_error: + print(f"Staying with 'tiny' model: {str(upgrade_error)}") except Exception as e: - print(f"Error loading 'small' model: {str(e)}") + print(f"Error loading models on {device}: {str(e)}") + print("Falling back to CPU model") try: - # Fall back to tiny model if small fails - asr_model = whisperx.load_model("tiny", device, compute_type=compute_type) - print("WhisperX 'tiny' model loaded as fallback") - except Exception as e2: - print(f"Error loading fallback model: {str(e2)}") - print("Trying CPU model as last resort") - # Last resort - try CPU + # Force CPU as last resort + device = "cpu" + compute_type = "int8" asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") print("WhisperX loaded on CPU as last resort") + except Exception as cpu_error: + print(f"Fatal error - could not load any model: {str(cpu_error)}") + raise RuntimeError("No ASR model could be loaded. Please check your CUDA installation.") # Silence detection parameters SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization @@ -226,7 +261,7 @@ def encode_audio_data(audio_tensor: torch.Tensor) -> str: def transcribe_audio(audio_tensor: torch.Tensor) -> str: - """Transcribe audio using WhisperX""" + """Transcribe audio using WhisperX with robust error handling""" try: # Save the tensor to a temporary file temp_path = os.path.join(base_dir, "temp_audio.wav") @@ -234,9 +269,38 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str: print(f"Transcribing audio file: {temp_path} (size: {os.path.getsize(temp_path)} bytes)") - # Load and transcribe the audio - audio = whisperx.load_audio(temp_path) - result = asr_model.transcribe(audio, batch_size=16) + # Load the audio file using whisperx's function + try: + audio = whisperx.load_audio(temp_path) + except Exception as audio_load_error: + print(f"WhisperX load_audio failed: {str(audio_load_error)}") + # Fall back to manual loading + import soundfile as sf + audio, sr = sf.read(temp_path) + if sr != 16000: # WhisperX expects 16kHz audio + from scipy import signal + audio = signal.resample(audio, int(len(audio) * 16000 / sr)) + + # Transcribe with error handling for CUDA issues + try: + # Try with original device + result = asr_model.transcribe(audio, batch_size=8) + except RuntimeError as cuda_error: + if "CUDA" in str(cuda_error) or "libcudnn" in str(cuda_error): + print(f"CUDA error in transcription, falling back to CPU: {str(cuda_error)}") + + # Try to load a CPU model as fallback + try: + global asr_model + # Move model to CPU and try again + asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") + result = asr_model.transcribe(audio, batch_size=1) + except Exception as e: + print(f"CPU fallback also failed: {str(e)}") + return "I'm having trouble processing audio right now." 
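# A minimal sketch of the "retry on CPU when CUDA fails" pattern above, factored
# into a reusable helper; the helper name and the reload callback are illustrative,
# not part of this patch.
def run_with_cpu_fallback(transcribe, audio, reload_cpu_model):
    """Run transcribe(audio); on a CUDA/cuDNN RuntimeError, reload a CPU model and retry."""
    try:
        return transcribe(audio)
    except RuntimeError as e:
        if "CUDA" not in str(e) and "libcudnn" not in str(e):
            raise  # not a CUDA problem, let it propagate
        cpu_model = reload_cpu_model()  # e.g. whisperx.load_model("tiny", "cpu", compute_type="int8")
        return cpu_model.transcribe(audio, batch_size=1)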
+ else: + # Re-raise if it's not a CUDA error + raise # Clean up if os.path.exists(temp_path): @@ -257,7 +321,7 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str: traceback.print_exc() if os.path.exists("temp_audio.wav"): os.remove("temp_audio.wav") - return "" + return "I heard something but couldn't understand it." def generate_response(text: str, conversation_history: List[Segment]) -> str: From bb5e0c4765f010d4bd313d1b4d7198e43c764ac5 Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sun, 30 Mar 2025 00:17:39 -0400 Subject: [PATCH 13/16] Demo Fixes 1 --- Backend/server.py | 60 +++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/Backend/server.py b/Backend/server.py index a6b70a3..d0dee80 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -75,32 +75,51 @@ except Exception as model_error: # Already tried CPU and it failed raise RuntimeError("Failed to load speech synthesis model on any device") +# Replace the WhisperX model loading section + # Initialize WhisperX for ASR with robust error handling print("Loading WhisperX model...") +asr_model = None # Initialize to None first to avoid scope issues + try: - # First try the smallest model ("tiny") to avoid memory issues - asr_model = whisperx.load_model("tiny", device, compute_type=compute_type) - print("WhisperX 'tiny' model loaded successfully") + # Always start with the tiny model on CPU for stability + asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") + print("WhisperX 'tiny' model loaded on CPU successfully") - # If tiny worked and we have CUDA, try upgrading to small + # If CPU works, try CUDA if available if device == "cuda": try: - asr_model = whisperx.load_model("small", device, compute_type=compute_type) - print("WhisperX 'small' model loaded successfully") - except Exception as upgrade_error: - print(f"Staying with 'tiny' model: {str(upgrade_error)}") + print("Trying to load WhisperX on CUDA...") + cuda_model = whisperx.load_model("tiny", "cuda", compute_type="float16") + # Test the model to ensure it works + test_audio = torch.zeros(16000) # 1 second of silence at 16kHz + _ = cuda_model.transcribe(test_audio.numpy(), batch_size=1) + # If we get here, CUDA works + asr_model = cuda_model + print("WhisperX model moved to CUDA successfully") + + # Try to upgrade to small model on CUDA + try: + small_model = whisperx.load_model("small", "cuda", compute_type="float16") + # Test it + _ = small_model.transcribe(test_audio.numpy(), batch_size=1) + asr_model = small_model + print("WhisperX 'small' model loaded on CUDA successfully") + except Exception as upgrade_error: + print(f"Staying with 'tiny' model on CUDA: {str(upgrade_error)}") + except Exception as cuda_error: + print(f"CUDA loading failed, staying with CPU model: {str(cuda_error)}") except Exception as e: - print(f"Error loading models on {device}: {str(e)}") - print("Falling back to CPU model") - try: - # Force CPU as last resort - device = "cpu" - compute_type = "int8" - asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") - print("WhisperX loaded on CPU as last resort") - except Exception as cpu_error: - print(f"Fatal error - could not load any model: {str(cpu_error)}") - raise RuntimeError("No ASR model could be loaded. 
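# A compact variant of the tiered model loading above (best model first, then
# smaller/CPU fallbacks); the helper name is illustrative and the model names and
# compute types mirror the ones used in this patch.
import whisperx

def load_best_asr(device: str):
    candidates = [("small", "cuda", "float16"), ("tiny", "cuda", "float16")] if device == "cuda" else []
    candidates.append(("tiny", "cpu", "int8"))  # always keep a CPU fallback
    for name, dev, compute_type in candidates:
        try:
            model = whisperx.load_model(name, dev, compute_type=compute_type)
            print(f"WhisperX '{name}' loaded on {dev}")
            return model
        except Exception as e:
            print(f"Could not load WhisperX '{name}' on {dev}: {e}")
    raise RuntimeError("No ASR model could be loaded")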
Please check your CUDA installation.") + print(f"Error loading WhisperX model: {str(e)}") + # Create a minimal dummy model as last resort + class DummyModel: + def __init__(self): + self.device = "cpu" + def transcribe(self, *args, **kwargs): + return {"segments": [{"text": "Speech recognition currently unavailable."}]} + + asr_model = DummyModel() + print("WARNING: Using dummy transcription model - ASR functionality limited") # Silence detection parameters SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization @@ -262,6 +281,8 @@ def encode_audio_data(audio_tensor: torch.Tensor) -> str: def transcribe_audio(audio_tensor: torch.Tensor) -> str: """Transcribe audio using WhisperX with robust error handling""" + global asr_model # Declare global at the beginning of the function + try: # Save the tensor to a temporary file temp_path = os.path.join(base_dir, "temp_audio.wav") @@ -291,7 +312,6 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str: # Try to load a CPU model as fallback try: - global asr_model # Move model to CPU and try again asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") result = asr_model.transcribe(audio, batch_size=1) From adbc3c89d69a04f542955ccb2bf963b2dc4b366a Mon Sep 17 00:00:00 2001 From: Surya Vemulapalli Date: Sun, 30 Mar 2025 00:20:15 -0400 Subject: [PATCH 14/16] Added mongoose --- React/bun.lock | 39 +++++++++++++++++++++++++ React/package.json | 1 + React/src/app/page.tsx | 66 +++++++++++++++++++++++++++++++++++------- 3 files changed, 96 insertions(+), 10 deletions(-) diff --git a/React/bun.lock b/React/bun.lock index dca1020..4b4fd3e 100644 --- a/React/bun.lock +++ b/React/bun.lock @@ -5,6 +5,7 @@ "name": "my-app", "dependencies": { "@auth0/nextjs-auth0": "^4.3.0", + "mongoose": "^8.13.1", "next": "15.2.4", "react": "^19.1.0", "react-dom": "^19.1.0", @@ -68,6 +69,8 @@ "@img/sharp-win32-x64": ["@img/sharp-win32-x64@0.33.5", "", { "os": "win32", "cpu": "x64" }, "sha512-MpY/o8/8kj+EcnxwvrP4aTJSWw/aZ7JIGR4aBeZkZw5B7/Jn+tY9/VNwtcoGmdT7GfggGIU4kygOMSbYnOrAbg=="], + "@mongodb-js/saslprep": ["@mongodb-js/saslprep@1.2.0", "", { "dependencies": { "sparse-bitfield": "^3.0.3" } }, "sha512-+ywrb0AqkfaYuhHs6LxKWgqbh3I72EpEgESCw37o+9qPx9WTCkgDm2B+eMrwehGtHBWHFU4GXvnSCNiFhhausg=="], + "@next/env": ["@next/env@15.2.4", "", {}, "sha512-+SFtMgoiYP3WoSswuNmxJOCwi06TdWE733D+WPjpXIe4LXGULwEaofiiAy6kbS0+XjM5xF5n3lKuBwN2SnqD9g=="], "@next/swc-darwin-arm64": ["@next/swc-darwin-arm64@15.2.4", "", { "os": "darwin", "cpu": "arm64" }, "sha512-1AnMfs655ipJEDC/FHkSr0r3lXBgpqKo4K1kiwfUf3iE68rDFXZ1TtHdMvf7D0hMItgDZ7Vuq3JgNMbt/+3bYw=="], @@ -130,6 +133,10 @@ "@types/react-dom": ["@types/react-dom@19.0.4", "", { "peerDependencies": { "@types/react": "^19.0.0" } }, "sha512-4fSQ8vWFkg+TGhePfUzVmat3eC14TXYSsiiDSLI0dVLsrm9gZFABjPy/Qu6TKgl1tq1Bu1yDsuQgY3A3DOjCcg=="], + "@types/webidl-conversions": ["@types/webidl-conversions@7.0.3", "", {}, "sha512-CiJJvcRtIgzadHCYXw7dqEnMNRjhGZlYK05Mj9OyktqV8uVT8fD2BFOB7S1uwBE3Kj2Z+4UyPmFw/Ixgw/LAlA=="], + + "@types/whatwg-url": ["@types/whatwg-url@11.0.5", "", { "dependencies": { "@types/webidl-conversions": "*" } }, "sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ=="], + "@zag-js/accordion": ["@zag-js/accordion@1.7.0", "", { "dependencies": { "@zag-js/anatomy": "1.7.0", "@zag-js/core": "1.7.0", "@zag-js/dom-query": "1.7.0", "@zag-js/types": "1.7.0", "@zag-js/utils": "1.7.0" } }, "sha512-LNJOjLTW2KwrToXBrXIbNIAiISA94n0AdWp14H8RrskdokywmEGiC0GgWTGEJ7DNA6TGP6Ae5o9rJ4fHSmCsDQ=="], 
"@zag-js/anatomy": ["@zag-js/anatomy@1.7.0", "", {}, "sha512-fkRgH6vPCwykmRdV38uAJeTtJc8tayAnURfoovHAtB9bK0goagPbpdcYTNyGn8msul0h+KBloOtnw4obvX0nPw=="], @@ -182,6 +189,8 @@ "@zag-js/utils": ["@zag-js/utils@1.7.0", "", {}, "sha512-yIxvH5V27a1WuLgCxHX7qpdtFo8vTJaZLafBpSNfVYG4B8FaxTE+P7JAcpmAzs3UyXura/WfAY2eVWWVBpk9ZA=="], + "bson": ["bson@6.10.3", "", {}, "sha512-MTxGsqgYTwfshYWTRdmZRC+M7FnG1b4y7RO7p2k3X24Wq0yv1m77Wsj0BzlPzd/IowgESfsruQCUToa7vbOpPQ=="], + "busboy": ["busboy@1.6.0", "", { "dependencies": { "streamsearch": "^1.1.0" } }, "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA=="], "caniuse-lite": ["caniuse-lite@1.0.30001707", "", {}, "sha512-3qtRjw/HQSMlDWf+X79N206fepf4SOOU6SQLMaq/0KkZLmSjPxAkBOQQ+FxbHKfHmYLZFfdWsO3KA90ceHPSnw=="], @@ -198,6 +207,8 @@ "csstype": ["csstype@3.1.3", "", {}, "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw=="], + "debug": ["debug@4.4.0", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-6WTZ/IxCY/T6BALoZHaE4ctp9xm+Z5kY/pzYaCHRFeyVhojxlrm+46y68HA6hr0TcwEssoxNiDEUJQjfPZ/RYA=="], + "dequal": ["dequal@2.0.3", "", {}, "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA=="], "detect-libc": ["detect-libc@2.0.3", "", {}, "sha512-bwy0MGW55bG41VqxxypOsdSdGqLwXPI/focwgTYCFMbdUiBAxLg9CFzG08sz2aqzknwiX7Hkl0bQENjg8iLByw=="], @@ -212,6 +223,8 @@ "jose": ["jose@5.10.0", "", {}, "sha512-s+3Al/p9g32Iq+oqXxkW//7jk2Vig6FF1CFqzVXoTUXt2qz89YWbL+OwS17NFYEvxC35n0FKeGO2LGYSxeM2Gg=="], + "kareem": ["kareem@2.6.3", "", {}, "sha512-C3iHfuGUXK2u8/ipq9LfjFfXFxAZMQJJq7vLS45r3D9Y2xQ/m4S8zaR4zMLFWh9AsNPXmcFfUDhTEO8UIC/V6Q=="], + "lightningcss": ["lightningcss@1.29.2", "", { "dependencies": { "detect-libc": "^2.0.3" }, "optionalDependencies": { "lightningcss-darwin-arm64": "1.29.2", "lightningcss-darwin-x64": "1.29.2", "lightningcss-freebsd-x64": "1.29.2", "lightningcss-linux-arm-gnueabihf": "1.29.2", "lightningcss-linux-arm64-gnu": "1.29.2", "lightningcss-linux-arm64-musl": "1.29.2", "lightningcss-linux-x64-gnu": "1.29.2", "lightningcss-linux-x64-musl": "1.29.2", "lightningcss-win32-arm64-msvc": "1.29.2", "lightningcss-win32-x64-msvc": "1.29.2" } }, "sha512-6b6gd/RUXKaw5keVdSEtqFVdzWnU5jMxTUjA2bVcMNPLwSQ08Sv/UodBVtETLCn7k4S1Ibxwh7k68IwLZPgKaA=="], "lightningcss-darwin-arm64": ["lightningcss-darwin-arm64@1.29.2", "", { "os": "darwin", "cpu": "arm64" }, "sha512-cK/eMabSViKn/PG8U/a7aCorpeKLMlK0bQeNHmdb7qUnBkNPnL+oV5DjJUo0kqWsJUapZsM4jCfYItbqBDvlcA=="], @@ -234,6 +247,20 @@ "lightningcss-win32-x64-msvc": ["lightningcss-win32-x64-msvc@1.29.2", "", { "os": "win32", "cpu": "x64" }, "sha512-EdIUW3B2vLuHmv7urfzMI/h2fmlnOQBk1xlsDxkN1tCWKjNFjfLhGxYk8C8mzpSfr+A6jFFIi8fU6LbQGsRWjA=="], + "memory-pager": ["memory-pager@1.5.0", "", {}, "sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg=="], + + "mongodb": ["mongodb@6.15.0", "", { "dependencies": { "@mongodb-js/saslprep": "^1.1.9", "bson": "^6.10.3", "mongodb-connection-string-url": "^3.0.0" }, "peerDependencies": { "@aws-sdk/credential-providers": "^3.188.0", "@mongodb-js/zstd": "^1.1.0 || ^2.0.0", "gcp-metadata": "^5.2.0", "kerberos": "^2.0.1", "mongodb-client-encryption": ">=6.0.0 <7", "snappy": "^7.2.2", "socks": "^2.7.1" }, "optionalPeers": ["@aws-sdk/credential-providers", "@mongodb-js/zstd", "gcp-metadata", "kerberos", "mongodb-client-encryption", "snappy", "socks"] }, "sha512-ifBhQ0rRzHDzqp9jAQP6OwHSH7dbYIQjD3SbJs9YYk9AikKEettW/9s/tbSFDTpXcRbF+u1aLrhHxDFaYtZpFQ=="], + 
+ "mongodb-connection-string-url": ["mongodb-connection-string-url@3.0.2", "", { "dependencies": { "@types/whatwg-url": "^11.0.2", "whatwg-url": "^14.1.0 || ^13.0.0" } }, "sha512-rMO7CGo/9BFwyZABcKAWL8UJwH/Kc2x0g72uhDWzG48URRax5TCIcJ7Rc3RZqffZzO/Gwff/jyKwCU9TN8gehA=="], + + "mongoose": ["mongoose@8.13.1", "", { "dependencies": { "bson": "^6.10.3", "kareem": "2.6.3", "mongodb": "~6.15.0", "mpath": "0.9.0", "mquery": "5.0.0", "ms": "2.1.3", "sift": "17.1.3" } }, "sha512-sRqlXI+6jhr9/KicCOjet1VVPONFsOxTrh14tfueX5y3GJ2ihswc5ewUUojuwdSS/5koGXLIPmGivDSApVXflA=="], + + "mpath": ["mpath@0.9.0", "", {}, "sha512-ikJRQTk8hw5DEoFVxHG1Gn9T/xcjtdnOKIU1JTmGjZZlg9LST2mBLmcX3/ICIbgJydT2GOc15RnNy5mHmzfSew=="], + + "mquery": ["mquery@5.0.0", "", { "dependencies": { "debug": "4.x" } }, "sha512-iQMncpmEK8R8ncT8HJGsGc9Dsp8xcgYMVSbs5jgnm1lFHTZqMJTUWTDx1LBO8+mK3tPNZWFLBghQEIOULSTHZg=="], + + "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], + "nanoid": ["nanoid@3.3.11", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w=="], "next": ["next@15.2.4", "", { "dependencies": { "@next/env": "15.2.4", "@swc/counter": "0.1.3", "@swc/helpers": "0.5.15", "busboy": "1.6.0", "caniuse-lite": "^1.0.30001579", "postcss": "8.4.31", "styled-jsx": "5.1.6" }, "optionalDependencies": { "@next/swc-darwin-arm64": "15.2.4", "@next/swc-darwin-x64": "15.2.4", "@next/swc-linux-arm64-gnu": "15.2.4", "@next/swc-linux-arm64-musl": "15.2.4", "@next/swc-linux-x64-gnu": "15.2.4", "@next/swc-linux-x64-musl": "15.2.4", "@next/swc-win32-arm64-msvc": "15.2.4", "@next/swc-win32-x64-msvc": "15.2.4", "sharp": "^0.33.5" }, "peerDependencies": { "@opentelemetry/api": "^1.1.0", "@playwright/test": "^1.41.2", "babel-plugin-react-compiler": "*", "react": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", "react-dom": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", "sass": "^1.3.0" }, "optionalPeers": ["@opentelemetry/api", "@playwright/test", "babel-plugin-react-compiler", "sass"], "bin": { "next": "dist/bin/next" } }, "sha512-VwL+LAaPSxEkd3lU2xWbgEOtrM8oedmyhBqaVNmgKB+GvZlCy9rgaEc+y2on0wv+l0oSFqLtYD6dcC1eAedUaQ=="], @@ -246,6 +273,8 @@ "proxy-compare": ["proxy-compare@3.0.1", "", {}, "sha512-V9plBAt3qjMlS1+nC8771KNf6oJ12gExvaxnNzN/9yVRLdTv/lc+oJlnSzrdYDAvBfTStPCoiaCOTmTs0adv7Q=="], + "punycode": ["punycode@2.3.1", "", {}, "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg=="], + "react": ["react@19.1.0", "", {}, "sha512-FS+XFBNvn3GTAWq26joslQgWNoFu08F4kl0J4CgdNKADkdSGXQyTCnKteIAJy96Br6YbpEU1LSzV5dYtjMkMDg=="], "react-dom": ["react-dom@19.1.0", "", { "dependencies": { "scheduler": "^0.26.0" }, "peerDependencies": { "react": "^19.1.0" } }, "sha512-Xs1hdnE+DyKgeHJeJznQmYMIBG3TKIHJJT95Q58nHLSrElKlGQqDTR2HQ9fx5CN/Gk6Vh/kupBTDLU11/nDk/g=="], @@ -256,10 +285,14 @@ "sharp": ["sharp@0.33.5", "", { "dependencies": { "color": "^4.2.3", "detect-libc": "^2.0.3", "semver": "^7.6.3" }, "optionalDependencies": { "@img/sharp-darwin-arm64": "0.33.5", "@img/sharp-darwin-x64": "0.33.5", "@img/sharp-libvips-darwin-arm64": "1.0.4", "@img/sharp-libvips-darwin-x64": "1.0.4", "@img/sharp-libvips-linux-arm": "1.0.5", "@img/sharp-libvips-linux-arm64": "1.0.4", "@img/sharp-libvips-linux-s390x": "1.0.4", "@img/sharp-libvips-linux-x64": "1.0.4", "@img/sharp-libvips-linuxmusl-arm64": "1.0.4", "@img/sharp-libvips-linuxmusl-x64": "1.0.4", "@img/sharp-linux-arm": "0.33.5", 
"@img/sharp-linux-arm64": "0.33.5", "@img/sharp-linux-s390x": "0.33.5", "@img/sharp-linux-x64": "0.33.5", "@img/sharp-linuxmusl-arm64": "0.33.5", "@img/sharp-linuxmusl-x64": "0.33.5", "@img/sharp-wasm32": "0.33.5", "@img/sharp-win32-ia32": "0.33.5", "@img/sharp-win32-x64": "0.33.5" } }, "sha512-haPVm1EkS9pgvHrQ/F3Xy+hgcuMV0Wm9vfIBSiwZ05k+xgb0PkBQpGsAA/oWdDobNaZTH5ppvHtzCFbnSEwHVw=="], + "sift": ["sift@17.1.3", "", {}, "sha512-Rtlj66/b0ICeFzYTuNvX/EF1igRbbnGSvEyT79McoZa/DeGhMyC5pWKOEsZKnpkqtSeovd5FL/bjHWC3CIIvCQ=="], + "simple-swizzle": ["simple-swizzle@0.2.2", "", { "dependencies": { "is-arrayish": "^0.3.1" } }, "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg=="], "source-map-js": ["source-map-js@1.2.1", "", {}, "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA=="], + "sparse-bitfield": ["sparse-bitfield@3.0.3", "", { "dependencies": { "memory-pager": "^1.0.2" } }, "sha512-kvzhi7vqKTfkh0PZU+2D2PIllw2ymqJKujUcyPMd9Y75Nv4nPbGJZXNhxsgdQab2BmlDct1YnfQCguEvHr7VsQ=="], + "streamsearch": ["streamsearch@1.1.0", "", {}, "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg=="], "styled-jsx": ["styled-jsx@5.1.6", "", { "dependencies": { "client-only": "0.0.1" }, "peerDependencies": { "react": ">= 16.8.0 || 17.x.x || ^18.0.0-0 || ^19.0.0-0" } }, "sha512-qSVyDTeMotdvQYoHWLNGwRFJHC+i+ZvdBRYosOFgC+Wg1vx4frN2/RG/NA7SYqqvKNLf39P2LSRA2pu6n0XYZA=="], @@ -270,6 +303,8 @@ "tapable": ["tapable@2.2.1", "", {}, "sha512-GNzQvQTOIP6RyTfE2Qxb8ZVlNmw0n88vp1szwWRimP02mnTsx3Wtn5qRdqY9w2XduFNUgvOwhNnQsjwCp+kqaQ=="], + "tr46": ["tr46@5.1.0", "", { "dependencies": { "punycode": "^2.3.1" } }, "sha512-IUWnUK7ADYR5Sl1fZlO1INDUhVhatWl7BtJWsIhwJ0UAK7ilzzIa8uIqOO/aYVWHZPJkKbEL+362wrzoeRF7bw=="], + "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="], "typescript": ["typescript@5.8.2", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-aJn6wq13/afZp/jT9QZmwEjDqqvSGp1VT5GVg+f/t6/oVyrgXM6BY1h9BRh/O5p3PlUPAe+WuiEZOmb/49RqoQ=="], @@ -278,6 +313,10 @@ "use-sync-external-store": ["use-sync-external-store@1.5.0", "", { "peerDependencies": { "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" } }, "sha512-Rb46I4cGGVBmjamjphe8L/UnvJD+uPPtTkNvX5mZgqdbavhI4EbgIWJiIHXJ8bc/i9EQGPRh4DwEURJ552Do0A=="], + "webidl-conversions": ["webidl-conversions@7.0.0", "", {}, "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g=="], + + "whatwg-url": ["whatwg-url@14.2.0", "", { "dependencies": { "tr46": "^5.1.0", "webidl-conversions": "^7.0.0" } }, "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw=="], + "next/postcss": ["postcss@8.4.31", "", { "dependencies": { "nanoid": "^3.3.6", "picocolors": "^1.0.0", "source-map-js": "^1.0.2" } }, "sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ=="], } } diff --git a/React/package.json b/React/package.json index a3f4b26..0d75192 100644 --- a/React/package.json +++ b/React/package.json @@ -10,6 +10,7 @@ }, "dependencies": { "@auth0/nextjs-auth0": "^4.3.0", + "mongoose": "^8.13.1", "next": "15.2.4", "react": "^19.1.0", "react-dom": "^19.1.0" diff --git a/React/src/app/page.tsx b/React/src/app/page.tsx index fcd37b0..2a20b5d 100644 --- a/React/src/app/page.tsx +++ b/React/src/app/page.tsx @@ -40,14 +40,29 @@ export default async function Home() { 
type="submit">Set codeword {/* form for adding contacts */} -
e.preventDefault()}> + e.preventDefault()}> setContacts(e.target.value.split(","))} - placeholder="contacts (comma separated)" + placeholder="Write down an emergency contact" className="border border-gray-300 rounded-md p-2" /> + setContacts(e.target.value.split(","))} + placeholder="Write down an emergency contact" + className="border border-gray-300 rounded-md p-2" + /> + setContacts(e.target.value.split(","))} + placeholder="Write down an emergency contact" + className="border border-gray-300 rounded-md p-2" + /> +
@@ -76,14 +91,45 @@ export default async function Home() { type="submit">Set codeword {/* form for adding contacts */} -
e.preventDefault()}> - setContacts(e.target.value.split(","))} - placeholder="contacts (comma separated)" - className="border border-gray-300 rounded-md p-2" - /> + e.preventDefault()}> + setContacts(e.target.value.split(","))} + placeholder="Write down an emergency contact" + className="border border-gray-300 rounded-md p-2" + /> + setContacts(e.target.value.split(","))} + placeholder="Write down an emergency contact" + className="border border-gray-300 rounded-md p-2" + /> + setContacts(e.target.value.split(","))} + placeholder="Write down an emergency contact" + className="border border-gray-300 rounded-md p-2" + /> + setContacts(e.target.value.split(","))} + placeholder="Write down an emergency contact" + className="text-input border border-gray-300 rounded-md p-2" + /> + +
From 6152e300c000793d3f5682dda2ac1431fc03a12e Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sun, 30 Mar 2025 00:24:26 -0400 Subject: [PATCH 15/16] Demo Update 6 --- Backend/server.py | 659 +++++++++++++++++++++++----------------------- 1 file changed, 335 insertions(+), 324 deletions(-) diff --git a/Backend/server.py b/Backend/server.py index d0dee80..8ba56b4 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -1,9 +1,13 @@ import os import base64 import json +import time +import math +import gc +import logging +import numpy as np import torch import torchaudio -import numpy as np import whisperx from io import BytesIO from typing import List, Dict, Any, Optional @@ -11,290 +15,314 @@ from flask import Flask, request, send_from_directory, Response from flask_cors import CORS from flask_socketio import SocketIO, emit, disconnect from generator import load_csm_1b, Segment -import time -import gc from collections import deque from threading import Lock -# Add this at the top of your file, replacing your current CUDA setup +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger("sesame-server") -# CUDA setup with robust error handling -try: - # Handle CUDA issues - os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit to first GPU only +# CUDA Environment Setup +def setup_cuda_environment(): + """Set up CUDA environment with proper error handling""" + # Search for CUDA libraries in common locations + cuda_lib_dirs = [ + "/usr/local/cuda/lib64", + "/usr/lib/x86_64-linux-gnu", + "/usr/local/cuda/extras/CUPTI/lib64" + ] - # Try enabling TF32 precision - try: - torch.backends.cuda.matmul.allow_tf32 = True - torch.backends.cudnn.allow_tf32 = True - except: - pass # Ignore if not supported + # Add directories to LD_LIBRARY_PATH if they exist + current_ld_path = os.environ.get('LD_LIBRARY_PATH', '') + for cuda_dir in cuda_lib_dirs: + if os.path.exists(cuda_dir) and cuda_dir not in current_ld_path: + if current_ld_path: + os.environ['LD_LIBRARY_PATH'] = f"{current_ld_path}:{cuda_dir}" + else: + os.environ['LD_LIBRARY_PATH'] = cuda_dir + current_ld_path = os.environ['LD_LIBRARY_PATH'] - # Check if CUDA is available - if torch.cuda.is_available(): - try: - # Test CUDA functionality - x = torch.rand(10, device="cuda") - y = x + x - del x, y - device = "cuda" - compute_type = "float16" - print("CUDA is fully functional") - except Exception as cuda_error: - print(f"CUDA is available but not working correctly: {str(cuda_error)}") - device = "cpu" - compute_type = "int8" - else: - device = "cpu" - compute_type = "int8" -except Exception as e: - print(f"Error setting up CUDA: {str(e)}") + logger.info(f"LD_LIBRARY_PATH set to: {os.environ.get('LD_LIBRARY_PATH', 'not set')}") + + # Determine best compute device device = "cpu" compute_type = "int8" - -print(f"Using device: {device} with compute type: {compute_type}") - -# Initialize the Sesame CSM model with robust error handling -try: - print(f"Loading Sesame CSM model on {device}...") - generator = load_csm_1b(device=device) - print("Sesame CSM model loaded successfully") -except Exception as model_error: - print(f"Error loading Sesame CSM on {device}: {str(model_error)}") - if device == "cuda": - # Try on CPU as fallback - try: - print("Trying to load Sesame CSM on CPU instead...") - device = "cpu" # Update global device setting - generator = load_csm_1b(device="cpu") - print("Sesame CSM model loaded on CPU successfully") - except Exception as 
cpu_error: - print(f"Fatal error - could not load Sesame CSM model: {str(cpu_error)}") - raise RuntimeError("Failed to load speech synthesis model") - else: - # Already tried CPU and it failed - raise RuntimeError("Failed to load speech synthesis model on any device") - -# Replace the WhisperX model loading section - -# Initialize WhisperX for ASR with robust error handling -print("Loading WhisperX model...") -asr_model = None # Initialize to None first to avoid scope issues - -try: - # Always start with the tiny model on CPU for stability - asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") - print("WhisperX 'tiny' model loaded on CPU successfully") - # If CPU works, try CUDA if available - if device == "cuda": + try: + # Set CUDA preferences + os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit to first GPU only + + # Try enabling TF32 precision if available try: - print("Trying to load WhisperX on CUDA...") - cuda_model = whisperx.load_model("tiny", "cuda", compute_type="float16") - # Test the model to ensure it works - test_audio = torch.zeros(16000) # 1 second of silence at 16kHz - _ = cuda_model.transcribe(test_audio.numpy(), batch_size=1) - # If we get here, CUDA works - asr_model = cuda_model - print("WhisperX model moved to CUDA successfully") - - # Try to upgrade to small model on CUDA + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = True + except Exception as e: + logger.warning(f"Could not set advanced CUDA options: {e}") + + # Test if CUDA is functional + if torch.cuda.is_available(): try: - small_model = whisperx.load_model("small", "cuda", compute_type="float16") - # Test it - _ = small_model.transcribe(test_audio.numpy(), batch_size=1) - asr_model = small_model - print("WhisperX 'small' model loaded on CUDA successfully") - except Exception as upgrade_error: - print(f"Staying with 'tiny' model on CUDA: {str(upgrade_error)}") - except Exception as cuda_error: - print(f"CUDA loading failed, staying with CPU model: {str(cuda_error)}") -except Exception as e: - print(f"Error loading WhisperX model: {str(e)}") - # Create a minimal dummy model as last resort - class DummyModel: - def __init__(self): - self.device = "cpu" - def transcribe(self, *args, **kwargs): - return {"segments": [{"text": "Speech recognition currently unavailable."}]} + # Test basic CUDA operations + x = torch.rand(10, device="cuda") + y = x + x + del x, y + torch.cuda.empty_cache() + device = "cuda" + compute_type = "float16" + logger.info("CUDA is fully functional") + except Exception as e: + logger.warning(f"CUDA available but not working correctly: {e}") + device = "cpu" + else: + logger.info("CUDA is not available, using CPU") + except Exception as e: + logger.error(f"Error setting up computing environment: {e}") - asr_model = DummyModel() - print("WARNING: Using dummy transcription model - ASR functionality limited") + return device, compute_type -# Silence detection parameters -SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization -SILENCE_DURATION_SEC = 1.0 # How long silence must persist +# Set up the compute environment +device, compute_type = setup_cuda_environment() -# Define the base directory +# Constants and Configuration +SILENCE_THRESHOLD = 0.01 +SILENCE_DURATION_SEC = 0.75 +MAX_BUFFER_SIZE = 30 # Maximum chunks to buffer before processing +CHUNK_SIZE_MS = 500 # Size of audio chunks when streaming responses + +# Define the base directory and static files 
directory base_dir = os.path.dirname(os.path.abspath(__file__)) static_dir = os.path.join(base_dir, "static") os.makedirs(static_dir, exist_ok=True) -# Setup Flask +# Model Loading Functions +def load_speech_models(): + """Load all required speech models with fallbacks""" + # Load speech generation model (Sesame CSM) + try: + logger.info(f"Loading Sesame CSM model on {device}...") + generator = load_csm_1b(device=device) + logger.info("Sesame CSM model loaded successfully") + except Exception as e: + logger.error(f"Error loading Sesame CSM on {device}: {e}") + if device == "cuda": + try: + logger.info("Trying to load Sesame CSM on CPU instead...") + generator = load_csm_1b(device="cpu") + logger.info("Sesame CSM model loaded on CPU successfully") + except Exception as cpu_error: + logger.critical(f"Failed to load speech synthesis model: {cpu_error}") + raise RuntimeError("Failed to load speech synthesis model") + else: + raise RuntimeError("Failed to load speech synthesis model on any device") + + # Load ASR model (WhisperX) + try: + logger.info("Loading WhisperX model...") + # Start with the tiny model on CPU for reliable initialization + asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") + logger.info("WhisperX 'tiny' model loaded on CPU successfully") + + # Try upgrading to GPU if available + if device == "cuda": + try: + logger.info("Trying to load WhisperX on CUDA...") + # Test with a tiny model first + test_audio = torch.zeros(16000) # 1 second of silence + + cuda_model = whisperx.load_model("tiny", "cuda", compute_type="float16") + # Test the model with real inference + _ = cuda_model.transcribe(test_audio.numpy(), batch_size=1) + asr_model = cuda_model + logger.info("WhisperX model running on CUDA successfully") + + # Try to upgrade to small model + try: + small_model = whisperx.load_model("small", "cuda", compute_type="float16") + _ = small_model.transcribe(test_audio.numpy(), batch_size=1) + asr_model = small_model + logger.info("WhisperX 'small' model loaded on CUDA successfully") + except Exception as e: + logger.warning(f"Staying with 'tiny' model on CUDA: {e}") + except Exception as e: + logger.warning(f"CUDA loading failed, staying with CPU model: {e}") + except Exception as e: + logger.error(f"Error loading WhisperX model: {e}") + # Create a minimal dummy model as last resort + class DummyModel: + def __init__(self): + self.device = "cpu" + def transcribe(self, *args, **kwargs): + return {"segments": [{"text": "Speech recognition currently unavailable."}]} + + asr_model = DummyModel() + logger.warning("Using dummy transcription model - ASR functionality limited") + + return generator, asr_model + +# Load speech models +generator, asr_model = load_speech_models() + +# Set up Flask and Socket.IO app = Flask(__name__) CORS(app) socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet') # Socket connection management -thread = None thread_lock = Lock() active_clients = {} # Map client_id to client context -# Helper function to convert audio data +# Audio Utility Functions def decode_audio_data(audio_data: str) -> torch.Tensor: """Decode base64 audio data to a torch tensor with improved error handling""" try: # Skip empty audio data if not audio_data or len(audio_data) < 100: - print("Empty or too short audio data received") + logger.warning("Empty or too short audio data received") return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence # Extract the actual base64 content if ',' in audio_data: - # Handle data URL format 
(data:audio/wav;base64,...) audio_data = audio_data.split(',')[1] # Decode base64 audio data try: binary_data = base64.b64decode(audio_data) - print(f"Decoded base64 data: {len(binary_data)} bytes") + logger.debug(f"Decoded base64 data: {len(binary_data)} bytes") # Check if we have enough data for a valid WAV if len(binary_data) < 44: # WAV header is 44 bytes - print("Data too small to be a valid WAV file") + logger.warning("Data too small to be a valid WAV file") return torch.zeros(generator.sample_rate // 2) except Exception as e: - print(f"Base64 decoding error: {str(e)}") + logger.error(f"Base64 decoding error: {e}") return torch.zeros(generator.sample_rate // 2) - # Save for debugging - debug_path = os.path.join(base_dir, "debug_incoming.wav") - with open(debug_path, 'wb') as f: - f.write(binary_data) - print(f"Saved debug file: {debug_path}") + # Multiple approaches to handle audio data + audio_tensor = None + sample_rate = None - # Approach 1: Load directly with torchaudio + # Approach 1: Direct loading with torchaudio try: with BytesIO(binary_data) as temp_file: - temp_file.seek(0) # Ensure we're at the start of the buffer + temp_file.seek(0) audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav") - print(f"Direct loading success: shape={audio_tensor.shape}, rate={sample_rate}Hz") + logger.debug(f"Loaded audio: shape={audio_tensor.shape}, rate={sample_rate}Hz") - # Check if audio is valid + # Validate tensor if audio_tensor.numel() == 0 or torch.isnan(audio_tensor).any(): - raise ValueError("Empty or invalid audio tensor detected") + raise ValueError("Invalid audio tensor") except Exception as e: - print(f"Direct loading failed: {str(e)}") + logger.warning(f"Direct loading failed: {e}") - # Approach 2: Try to fix/normalize the WAV data + # Approach 2: Using wave module and numpy try: - # Sometimes WAV headers can be malformed, attempt to fix - temp_path = os.path.join(base_dir, "temp_fixing.wav") + temp_path = os.path.join(base_dir, f"temp_{time.time()}.wav") with open(temp_path, 'wb') as f: f.write(binary_data) - # Use a simpler numpy approach as backup - import numpy as np import wave + with wave.open(temp_path, 'rb') as wf: + n_channels = wf.getnchannels() + sample_width = wf.getsampwidth() + sample_rate = wf.getframerate() + n_frames = wf.getnframes() + frames = wf.readframes(n_frames) + + # Convert to numpy array + if sample_width == 2: # 16-bit audio + data = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0 + elif sample_width == 1: # 8-bit audio + data = np.frombuffer(frames, dtype=np.uint8).astype(np.float32) / 128.0 - 1.0 + else: + raise ValueError(f"Unsupported sample width: {sample_width}") + + # Convert to mono if needed + if n_channels > 1: + data = data.reshape(-1, n_channels) + data = data.mean(axis=1) + + # Convert to torch tensor + audio_tensor = torch.from_numpy(data) + logger.info(f"Loaded audio using wave: shape={audio_tensor.shape}") - try: - with wave.open(temp_path, 'rb') as wf: - n_channels = wf.getnchannels() - sample_width = wf.getsampwidth() - sample_rate = wf.getframerate() - n_frames = wf.getnframes() - - # Read the frames - frames = wf.readframes(n_frames) - print(f"Wave reading: channels={n_channels}, rate={sample_rate}Hz, frames={n_frames}") - - # Convert to numpy and then to torch - if sample_width == 2: # 16-bit audio - data = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0 - elif sample_width == 1: # 8-bit audio - data = np.frombuffer(frames, dtype=np.uint8).astype(np.float32) / 128.0 - 1.0 - else: 
- raise ValueError(f"Unsupported sample width: {sample_width}") - - # Convert to mono if needed - if n_channels > 1: - data = data.reshape(-1, n_channels) - data = data.mean(axis=1) - - # Convert to torch tensor - audio_tensor = torch.from_numpy(data) - print(f"Successfully converted with numpy: shape={audio_tensor.shape}") - except Exception as wave_error: - print(f"Wave processing failed: {str(wave_error)}") - # Try with torchaudio as last resort - audio_tensor, sample_rate = torchaudio.load(temp_path, format="wav") - - # Clean up + # Clean up temp file if os.path.exists(temp_path): os.remove(temp_path) + except Exception as e2: - print(f"All WAV loading methods failed: {str(e2)}") - print("Returning silence as fallback") + logger.error(f"All audio loading methods failed: {e2}") return torch.zeros(generator.sample_rate // 2) - # Ensure audio is the right shape (mono) + # Format corrections + if audio_tensor is None: + return torch.zeros(generator.sample_rate // 2) + + # Ensure audio is mono if len(audio_tensor.shape) > 1 and audio_tensor.shape[0] > 1: audio_tensor = torch.mean(audio_tensor, dim=0) - # Ensure we have a 1D tensor + # Ensure 1D tensor audio_tensor = audio_tensor.squeeze() # Resample if needed if sample_rate != generator.sample_rate: try: - print(f"Resampling from {sample_rate}Hz to {generator.sample_rate}Hz") + logger.debug(f"Resampling from {sample_rate}Hz to {generator.sample_rate}Hz") resampler = torchaudio.transforms.Resample( orig_freq=sample_rate, new_freq=generator.sample_rate ) audio_tensor = resampler(audio_tensor) except Exception as e: - print(f"Resampling error: {str(e)}") - # If resampling fails, just return the original audio - # The model can often handle different sample rates + logger.warning(f"Resampling error: {e}") # Normalize audio to avoid issues if torch.abs(audio_tensor).max() > 0: audio_tensor = audio_tensor / torch.abs(audio_tensor).max() - print(f"Final audio tensor: shape={audio_tensor.shape}, min={audio_tensor.min().item():.4f}, max={audio_tensor.max().item():.4f}") return audio_tensor except Exception as e: - print(f"Unhandled error in decode_audio_data: {str(e)}") - # Return a small silent audio segment as fallback - return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence - + logger.error(f"Unhandled error in decode_audio_data: {e}") + return torch.zeros(generator.sample_rate // 2) def encode_audio_data(audio_tensor: torch.Tensor) -> str: """Encode torch tensor audio to base64 string""" - buf = BytesIO() - torchaudio.save(buf, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav") - buf.seek(0) - audio_base64 = base64.b64encode(buf.read()).decode('utf-8') - return f"data:audio/wav;base64,{audio_base64}" - + try: + buf = BytesIO() + torchaudio.save(buf, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav") + buf.seek(0) + audio_base64 = base64.b64encode(buf.read()).decode('utf-8') + return f"data:audio/wav;base64,{audio_base64}" + except Exception as e: + logger.error(f"Error encoding audio: {e}") + # Return a minimal silent audio file + silence = torch.zeros(generator.sample_rate // 2).unsqueeze(0) + buf = BytesIO() + torchaudio.save(buf, silence, generator.sample_rate, format="wav") + buf.seek(0) + return f"data:audio/wav;base64,{base64.b64encode(buf.read()).decode('utf-8')}" def transcribe_audio(audio_tensor: torch.Tensor) -> str: """Transcribe audio using WhisperX with robust error handling""" - global asr_model # Declare global at the beginning of the function + global asr_model try: # Save 
the tensor to a temporary file - temp_path = os.path.join(base_dir, "temp_audio.wav") + temp_path = os.path.join(base_dir, f"temp_audio_{time.time()}.wav") torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate) - print(f"Transcribing audio file: {temp_path} (size: {os.path.getsize(temp_path)} bytes)") + logger.info(f"Transcribing audio file: {os.path.getsize(temp_path)} bytes") - # Load the audio file using whisperx's function + # Load the audio for WhisperX try: audio = whisperx.load_audio(temp_path) - except Exception as audio_load_error: - print(f"WhisperX load_audio failed: {str(audio_load_error)}") + except Exception as e: + logger.warning(f"WhisperX load_audio failed: {e}") # Fall back to manual loading import soundfile as sf audio, sr = sf.read(temp_path) @@ -302,59 +330,55 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str: from scipy import signal audio = signal.resample(audio, int(len(audio) * 16000 / sr)) - # Transcribe with error handling for CUDA issues + # Transcribe with error handling try: - # Try with original device - result = asr_model.transcribe(audio, batch_size=8) - except RuntimeError as cuda_error: - if "CUDA" in str(cuda_error) or "libcudnn" in str(cuda_error): - print(f"CUDA error in transcription, falling back to CPU: {str(cuda_error)}") - - # Try to load a CPU model as fallback + result = asr_model.transcribe(audio, batch_size=4) + except RuntimeError as e: + if "CUDA" in str(e) or "libcudnn" in str(e): + logger.warning(f"CUDA error in transcription, falling back to CPU: {e}") try: - # Move model to CPU and try again - asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") - result = asr_model.transcribe(audio, batch_size=1) - except Exception as e: - print(f"CPU fallback also failed: {str(e)}") + # Try CPU model + cpu_model = whisperx.load_model("tiny", "cpu", compute_type="int8") + result = cpu_model.transcribe(audio, batch_size=1) + # Update the global model if the original one is broken + asr_model = cpu_model + except Exception as cpu_e: + logger.error(f"CPU fallback failed: {cpu_e}") return "I'm having trouble processing audio right now." else: - # Re-raise if it's not a CUDA error raise + finally: + # Clean up + if os.path.exists(temp_path): + os.remove(temp_path) - # Clean up + # Extract text from segments + if result["segments"] and len(result["segments"]) > 0: + transcription = " ".join([segment["text"] for segment in result["segments"]]) + logger.info(f"Transcription: '{transcription.strip()}'") + return transcription.strip() + + return "" + except Exception as e: + logger.error(f"Error in transcription: {e}") if os.path.exists(temp_path): os.remove(temp_path) - - # Get the transcription text - if result["segments"] and len(result["segments"]) > 0: - # Combine all segments - transcription = " ".join([segment["text"] for segment in result["segments"]]) - print(f"Transcription successful: '{transcription.strip()}'") - return transcription.strip() - else: - print("Transcription returned no segments") - return "" - except Exception as e: - print(f"Error in transcription: {str(e)}") - import traceback - traceback.print_exc() - if os.path.exists("temp_audio.wav"): - os.remove("temp_audio.wav") return "I heard something but couldn't understand it." 
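# A quick smoke test for the audio helpers above, assuming this file is importable
# as `server` without starting the web server (model loading still runs at import
# time, so this is slow).
import torch
import server

silence = torch.zeros(server.generator.sample_rate)   # 1 second of silence
data_url = server.encode_audio_data(silence)          # tensor -> base64 WAV data URL
roundtrip = server.decode_audio_data(data_url)        # data URL -> 1-D tensor
assert roundtrip.dim() == 1 and roundtrip.numel() > 0
print(repr(server.transcribe_audio(silence)))         # likely "" (or a fallback message) for silence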
- def generate_response(text: str, conversation_history: List[Segment]) -> str: """Generate a contextual response based on the transcribed text""" - # Simple response logic - can be replaced with a more sophisticated LLM in the future + # Simple response logic - can be replaced with a more sophisticated LLM responses = { - "hello": "Hello there! How are you doing today?", + "hello": "Hello there! How can I help you today?", + "hi": "Hi there! What can I do for you?", "how are you": "I'm doing well, thanks for asking! How about you?", "what is your name": "I'm Sesame, your voice assistant. How can I help you?", + "who are you": "I'm Sesame, an AI voice assistant. I'm here to chat with you!", "bye": "Goodbye! It was nice chatting with you.", "thank you": "You're welcome! Is there anything else I can help with?", "weather": "I don't have real-time weather data, but I hope it's nice where you are!", "help": "I can chat with you using natural voice. Just speak normally and I'll respond.", + "what can you do": "I can have a conversation with you, answer questions, and provide assistance with various topics.", } text_lower = text.lower() @@ -372,7 +396,7 @@ def generate_response(text: str, conversation_history: List[Segment]) -> str: else: return f"I understand you said '{text}'. That's interesting! Can you tell me more about that?" -# Flask routes for serving static content +# Flask Routes @app.route('/') def index(): return send_from_directory(base_dir, 'index.html') @@ -391,11 +415,11 @@ def voice_chat_js(): def serve_static(path): return send_from_directory(static_dir, path) -# Socket.IO event handlers +# Socket.IO Event Handlers @socketio.on('connect') def handle_connect(): client_id = request.sid - print(f"Client connected: {client_id}") + logger.info(f"Client connected: {client_id}") # Initialize client context active_clients[client_id] = { @@ -414,7 +438,7 @@ def handle_disconnect(): client_id = request.sid if client_id in active_clients: del active_clients[client_id] - print(f"Client disconnected: {client_id}") + logger.info(f"Client disconnected: {client_id}") @socketio.on('generate') def handle_generate(data): @@ -427,7 +451,7 @@ def handle_generate(data): text = data.get('text', '') speaker_id = data.get('speaker', 0) - print(f"Generating audio for: '{text}' with speaker {speaker_id}") + logger.info(f"Generating audio for: '{text}' with speaker {speaker_id}") # Generate audio response audio_tensor = generator.generate( @@ -446,11 +470,12 @@ def handle_generate(data): audio_base64 = encode_audio_data(audio_tensor) emit('audio_response', { 'type': 'audio_response', - 'audio': audio_base64 + 'audio': audio_base64, + 'text': text }) except Exception as e: - print(f"Error generating audio: {str(e)}") + logger.error(f"Error generating audio: {e}") emit('error', { 'type': 'error', 'message': f"Error generating audio: {str(e)}" @@ -482,7 +507,7 @@ def handle_add_to_context(data): }) except Exception as e: - print(f"Error adding to context: {str(e)}") + logger.error(f"Error adding to context: {e}") emit('error', { 'type': 'error', 'message': f"Error processing audio: {str(e)}" @@ -512,6 +537,11 @@ def handle_stream_audio(data): speaker_id = data.get('speaker', 0) audio_data = data.get('audio', '') + # Skip if no audio data (might be just a connection test) + if not audio_data: + logger.debug("Empty audio data received, ignoring") + return + # Convert received audio to tensor audio_chunk = decode_audio_data(audio_data) @@ -522,7 +552,7 @@ def handle_stream_audio(data): 
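On the wire, the 'generate' handler above is just a Socket.IO event carrying text and a speaker id, answered by an 'audio_response' event whose audio field is the data URL built by encode_audio_data. A minimal test client sketch, assuming the python-socketio client package and the default port 5000:

import base64

import socketio  # pip install "python-socketio[client]"

sio = socketio.Client()

@sio.on("audio_response")
def on_audio_response(data):
    # data["audio"] is a "data:audio/wav;base64,..." URL
    wav_bytes = base64.b64decode(data["audio"].split(",", 1)[1])
    with open("reply.wav", "wb") as f:
        f.write(wav_bytes)
    print(f"Received {len(wav_bytes)} bytes for: {data.get('text')!r}")
    sio.disconnect()

@sio.on("error")
def on_error(data):
    print("Server error:", data.get("message"))
    sio.disconnect()

sio.connect("http://localhost:5000")
sio.emit("generate", {"text": "Hello from the test client", "speaker": 0})
sio.wait()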
client['energy_window'].clear() client['is_silence'] = False client['last_active_time'] = time.time() - print(f"[{client_id}] Streaming started with speaker ID: {speaker_id}") + logger.info(f"[{client_id[:8]}] Streaming started with speaker ID: {speaker_id}") emit('streaming_status', { 'type': 'streaming_status', 'status': 'started' @@ -553,52 +583,74 @@ def handle_stream_audio(data): if client['is_silence'] and silence_elapsed >= SILENCE_DURATION_SEC and len(client['streaming_buffer']) > 0: # User has stopped talking - process the collected audio - print(f"[{client_id}] Processing audio after {silence_elapsed:.2f}s of silence") + logger.info(f"[{client_id[:8]}] Processing audio after {silence_elapsed:.2f}s of silence") + process_complete_utterance(client_id, client, speaker_id) + + # If buffer gets too large without silence, process it anyway + elif len(client['streaming_buffer']) >= MAX_BUFFER_SIZE: + logger.info(f"[{client_id[:8]}] Processing long audio segment without silence") + process_complete_utterance(client_id, client, speaker_id, is_incomplete=True) - full_audio = torch.cat(client['streaming_buffer'], dim=0) + # Keep half of the buffer for context (sliding window approach) + half_point = len(client['streaming_buffer']) // 2 + client['streaming_buffer'] = client['streaming_buffer'][half_point:] - # Process with WhisperX speech-to-text - print(f"[{client_id}] Starting transcription with WhisperX...") - transcribed_text = transcribe_audio(full_audio) + except Exception as e: + import traceback + traceback.print_exc() + logger.error(f"Error processing streaming audio: {e}") + emit('error', { + 'type': 'error', + 'message': f"Error processing streaming audio: {str(e)}" + }) + +def process_complete_utterance(client_id, client, speaker_id, is_incomplete=False): + """Process a complete utterance (after silence or buffer limit)""" + try: + # Combine audio chunks + full_audio = torch.cat(client['streaming_buffer'], dim=0) + + # Process with speech-to-text + logger.info(f"[{client_id[:8]}] Starting transcription...") + transcribed_text = transcribe_audio(full_audio) + + # Add suffix for incomplete utterances + if is_incomplete: + transcribed_text += " (processing continued speech...)" + + # Log the transcription + logger.info(f"[{client_id[:8]}] Transcribed: '{transcribed_text}'") + + # Handle the transcription result + if transcribed_text: + # Add user message to context + user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) + client['context_segments'].append(user_segment) - # Log the transcription - print(f"[{client_id}] Transcribed text: '{transcribed_text}'") + # Send the transcribed text to client + emit('transcription', { + 'type': 'transcription', + 'text': transcribed_text + }, room=client_id) - # Handle the transcription result - if transcribed_text: - # Add user message to context - user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) - client['context_segments'].append(user_segment) - - # Send the transcribed text to client - emit('transcription', { - 'type': 'transcription', - 'text': transcribed_text - }) - + # Only generate a response if this is a complete utterance + if not is_incomplete: # Generate a contextual response response_text = generate_response(transcribed_text, client['context_segments']) - print(f"[{client_id}] Generating audio response: '{response_text}'") + logger.info(f"[{client_id[:8]}] Generating response: '{response_text}'") # Let the client know we're processing emit('processing_status', { 
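The silence-based endpointing driven by SILENCE_THRESHOLD and SILENCE_DURATION_SEC reads as a small state machine over chunk energies; a sketch with illustrative thresholds (the real constants sit near the top of server.py):

import time
from collections import deque

import torch

class UtteranceEndpointer:
    """Track recent chunk energies and report when the speaker has gone quiet."""

    def __init__(self, silence_threshold=0.01, silence_duration_sec=1.0, window=10):
        self.silence_threshold = silence_threshold
        self.silence_duration_sec = silence_duration_sec
        self.energy_window = deque(maxlen=window)
        self.last_active_time = time.time()

    def feed(self, chunk: torch.Tensor) -> bool:
        """Add one streamed chunk; return True once the utterance looks finished."""
        self.energy_window.append(torch.mean(torch.abs(chunk)).item())
        avg_energy = sum(self.energy_window) / len(self.energy_window)
        if avg_energy > self.silence_threshold:
            self.last_active_time = time.time()      # still talking
            return False
        # Quiet chunk: finished only after a sustained stretch of silence
        return (time.time() - self.last_active_time) >= self.silence_duration_sec

Each streamed chunk would be appended to the buffer and fed in; once feed() returns True, the buffer is concatenated and handed to process_complete_utterance.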
'type': 'processing_status', 'status': 'generating_audio', 'message': 'Generating audio response...' - }) + }, room=client_id) # Generate audio for the response try: # Use a different speaker than the user ai_speaker_id = 1 if speaker_id == 0 else 0 - # Start audio generation with streaming (chunk by chunk) - audio_chunks = [] - - # This version tries to stream the audio generation in smaller chunks - # Note: CSM model doesn't natively support incremental generation, - # so we're simulating it here for a more responsive UI experience - # Generate the full response audio_tensor = generator.generate( text=response_text, @@ -621,60 +673,37 @@ def handle_stream_audio(data): 'type': 'audio_response', 'text': response_text, 'audio': audio_base64 - }) + }, room=client_id) - print(f"[{client_id}] Audio response sent: {len(audio_base64)} bytes") + logger.info(f"[{client_id[:8]}] Audio response sent") - except Exception as gen_error: - print(f"Error generating audio response: {str(gen_error)}") + except Exception as e: + logger.error(f"Error generating audio response: {e}") emit('error', { 'type': 'error', 'message': "Sorry, there was an error generating the audio response." - }) - else: - # If transcription failed, send a generic response - emit('error', { - 'type': 'error', - 'message': "Sorry, I couldn't understand what you said. Could you try again?" - }) - - # Clear buffer and reset silence detection + }, room=client_id) + else: + # If transcription failed, send a notification + emit('error', { + 'type': 'error', + 'message': "Sorry, I couldn't understand what you said. Could you try again?" + }, room=client_id) + + # Only clear buffer for complete utterances + if not is_incomplete: + # Reset state client['streaming_buffer'] = [] client['energy_window'].clear() client['is_silence'] = False client['last_active_time'] = time.time() - - # If buffer gets too large without silence, process it anyway - elif len(client['streaming_buffer']) >= 30: # ~6 seconds of audio at 5 chunks/sec - print(f"[{client_id}] Processing long audio segment without silence") - full_audio = torch.cat(client['streaming_buffer'], dim=0) - - # Process with WhisperX speech-to-text - transcribed_text = transcribe_audio(full_audio) - - if transcribed_text: - client['context_segments'].append( - Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) - ) - - # Send the transcribed text to client - emit('transcription', { - 'type': 'transcription', - 'text': transcribed_text + " (processing continued speech...)" - }) - - # Keep half of the buffer for context (sliding window approach) - half_point = len(client['streaming_buffer']) // 2 - client['streaming_buffer'] = client['streaming_buffer'][half_point:] except Exception as e: - import traceback - traceback.print_exc() - print(f"Error processing streaming audio: {str(e)}") + logger.error(f"Error processing utterance: {e}") emit('error', { 'type': 'error', - 'message': f"Error processing streaming audio: {str(e)}" - }) + 'message': f"Error processing audio: {str(e)}" + }, room=client_id) @socketio.on('stop_streaming') def handle_stop_streaming(data): @@ -687,21 +716,8 @@ def handle_stop_streaming(data): if client['streaming_buffer'] and len(client['streaming_buffer']) > 5: # Process any remaining audio in the buffer - full_audio = torch.cat(client['streaming_buffer'], dim=0) - - # Process with WhisperX speech-to-text - transcribed_text = transcribe_audio(full_audio) - - if transcribed_text: - client['context_segments'].append( - Segment(text=transcribed_text, 
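Stripped of the Socket.IO plumbing, the response path is a single generator.generate call followed by the same WAV encoding as above. A minimal offline sketch using the keyword arguments visible in the handlers (max_audio_length_ms is an assumed knob, not confirmed by this diff):

import torch
import torchaudio

from generator import load_csm_1b, Segment  # Segment is what a non-empty context holds

device = "cuda" if torch.cuda.is_available() else "cpu"
generator = load_csm_1b(device=device)

audio = generator.generate(
    text="Hello there! How can I help you today?",
    speaker=1,                    # a different speaker id than the user
    context=[],                   # real calls pass the accumulated Segment list
    max_audio_length_ms=10_000,   # assumed parameter, kept short for a quick test
)

torchaudio.save("reply.wav", audio.unsqueeze(0).cpu(), generator.sample_rate)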
speaker=data.get("speaker", 0), audio=full_audio) - ) - - # Send the transcribed text to client - emit('transcription', { - 'type': 'transcription', - 'text': transcribed_text - }) + logger.info(f"[{client_id[:8]}] Processing final audio buffer on stop") + process_complete_utterance(client_id, client, data.get("speaker", 0)) client['streaming_buffer'] = [] emit('streaming_status', { @@ -709,18 +725,18 @@ def handle_stop_streaming(data): 'status': 'stopped' }) -def stream_audio_to_client(client_id, audio_tensor, text, speaker_id, chunk_size_ms=500): +def stream_audio_to_client(client_id, audio_tensor, text, speaker_id, chunk_size_ms=CHUNK_SIZE_MS): """Stream audio to client in chunks to simulate real-time generation""" try: if client_id not in active_clients: - print(f"Client {client_id} not found for streaming") + logger.warning(f"Client {client_id} not found for streaming") return # Calculate chunk size in samples chunk_size = int(generator.sample_rate * chunk_size_ms / 1000) total_chunks = math.ceil(audio_tensor.size(0) / chunk_size) - print(f"Streaming audio in {total_chunks} chunks of {chunk_size_ms}ms each") + logger.info(f"Streaming audio in {total_chunks} chunks of {chunk_size_ms}ms each") # Send initial response with text but no audio yet socketio.emit('audio_response_start', { @@ -758,29 +774,24 @@ def stream_audio_to_client(client_id, audio_tensor, text, speaker_id, chunk_size 'text': text }, room=client_id) - print(f"Audio streaming complete: {total_chunks} chunks sent") + logger.info(f"Audio streaming complete: {total_chunks} chunks sent") except Exception as e: - print(f"Error streaming audio to client: {str(e)}") + logger.error(f"Error streaming audio to client: {e}") import traceback traceback.print_exc() +# Main server start if __name__ == "__main__": print(f"\n{'='*60}") - print(f"🔊 Sesame AI Voice Chat Server (Flask Implementation)") + print(f"🔊 Sesame AI Voice Chat Server") print(f"{'='*60}") print(f"📡 Server Information:") print(f" - Local URL: http://localhost:5000") print(f" - Network URL: http://:5000") - print(f" - WebSocket: ws://:5000/socket.io") - print(f"{'='*60}") - print(f"💡 To make this server public:") - print(f" 1. Ensure port 5000 is open in your firewall") - print(f" 2. Set up port forwarding on your router to port 5000") - print(f" 3. Or use a service like ngrok with: ngrok http 5000") print(f"{'='*60}") print(f"🌐 Device: {device.upper()}") - print(f"🧠 Models loaded: Sesame CSM + WhisperX ({asr_model.device})") + print(f"🧠 Models: Sesame CSM + WhisperX ASR") print(f"🔧 Serving from: {os.path.join(base_dir, 'index.html')}") print(f"{'='*60}") print(f"Ready to receive connections! 
Press Ctrl+C to stop the server.\n") From 8592257cdc5c073bdf88b2dd0311ad0c2af0b957 Mon Sep 17 00:00:00 2001 From: GamerBoss101 Date: Sun, 30 Mar 2025 00:33:14 -0400 Subject: [PATCH 16/16] Demo Update 7 --- Backend/server.py | 254 ++++++++++++++-------------------------------- 1 file changed, 77 insertions(+), 177 deletions(-) diff --git a/Backend/server.py b/Backend/server.py index 8ba56b4..8f4e278 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -8,7 +8,6 @@ import logging import numpy as np import torch import torchaudio -import whisperx from io import BytesIO from typing import List, Dict, Any, Optional from flask import Flask, request, send_from_directory, Response @@ -25,68 +24,24 @@ logging.basicConfig( ) logger = logging.getLogger("sesame-server") -# CUDA Environment Setup -def setup_cuda_environment(): - """Set up CUDA environment with proper error handling""" - # Search for CUDA libraries in common locations - cuda_lib_dirs = [ - "/usr/local/cuda/lib64", - "/usr/lib/x86_64-linux-gnu", - "/usr/local/cuda/extras/CUPTI/lib64" - ] - - # Add directories to LD_LIBRARY_PATH if they exist - current_ld_path = os.environ.get('LD_LIBRARY_PATH', '') - for cuda_dir in cuda_lib_dirs: - if os.path.exists(cuda_dir) and cuda_dir not in current_ld_path: - if current_ld_path: - os.environ['LD_LIBRARY_PATH'] = f"{current_ld_path}:{cuda_dir}" - else: - os.environ['LD_LIBRARY_PATH'] = cuda_dir - current_ld_path = os.environ['LD_LIBRARY_PATH'] - - logger.info(f"LD_LIBRARY_PATH set to: {os.environ.get('LD_LIBRARY_PATH', 'not set')}") - - # Determine best compute device - device = "cpu" - compute_type = "int8" - +# Determine best compute device +if torch.backends.mps.is_available(): + device = "mps" +elif torch.cuda.is_available(): try: - # Set CUDA preferences - os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit to first GPU only - - # Try enabling TF32 precision if available - try: - torch.backends.cuda.matmul.allow_tf32 = True - torch.backends.cudnn.allow_tf32 = True - torch.backends.cudnn.enabled = True - torch.backends.cudnn.benchmark = True - except Exception as e: - logger.warning(f"Could not set advanced CUDA options: {e}") - - # Test if CUDA is functional - if torch.cuda.is_available(): - try: - # Test basic CUDA operations - x = torch.rand(10, device="cuda") - y = x + x - del x, y - torch.cuda.empty_cache() - device = "cuda" - compute_type = "float16" - logger.info("CUDA is fully functional") - except Exception as e: - logger.warning(f"CUDA available but not working correctly: {e}") - device = "cpu" - else: - logger.info("CUDA is not available, using CPU") + # Test CUDA functionality + torch.rand(10, device="cuda") + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + torch.backends.cudnn.benchmark = True + device = "cuda" + logger.info("CUDA is fully functional") except Exception as e: - logger.error(f"Error setting up computing environment: {e}") - - return device, compute_type - -# Set up the compute environment -device, compute_type = setup_cuda_environment() + logger.warning(f"CUDA available but not working correctly: {e}") + device = "cpu" +else: + device = "cpu" + logger.info("Using CPU") # Constants and Configuration SILENCE_THRESHOLD = 0.01 @@ -99,9 +54,37 @@ base_dir = os.path.dirname(os.path.abspath(__file__)) static_dir = os.path.join(base_dir, "static") os.makedirs(static_dir, exist_ok=True) +# Define a simple energy-based speech detector +class SpeechDetector: + def __init__(self): + self.min_speech_energy = 0.01 + self.speech_window = 0.2 
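The device probe above matters because torch.cuda.is_available() can report True on machines whose driver or cudnn install still fails at kernel launch; factored into a helper, the same check looks roughly like this:

import logging

import torch

logger = logging.getLogger("sesame-server")

def pick_device() -> str:
    """Prefer MPS, then CUDA (only if a real kernel launch succeeds), else CPU."""
    if torch.backends.mps.is_available():
        return "mps"
    if torch.cuda.is_available():
        try:
            x = torch.rand(10, device="cuda")
            _ = (x + x).sum().item()      # force a kernel launch and a device sync
            torch.cuda.empty_cache()
            return "cuda"
        except Exception as e:
            logger.warning("CUDA advertised but unusable, falling back to CPU: %s", e)
    return "cpu"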
# seconds + + def detect_speech(self, audio_tensor, sample_rate): + # Calculate frame size based on window size + frame_size = int(sample_rate * self.speech_window) + + # If audio is shorter than frame size, use the entire audio + if audio_tensor.shape[0] < frame_size: + frames = [audio_tensor] + else: + # Split audio into frames + frames = [audio_tensor[i:i+frame_size] for i in range(0, len(audio_tensor), frame_size)] + + # Calculate energy per frame + energies = [torch.mean(frame**2).item() for frame in frames] + + # Determine if there's speech based on energy threshold + has_speech = any(e > self.min_speech_energy for e in energies) + + return has_speech + +speech_detector = SpeechDetector() +logger.info("Initialized simple speech detector") + # Model Loading Functions def load_speech_models(): - """Load all required speech models with fallbacks""" + """Load speech generation model""" # Load speech generation model (Sesame CSM) try: logger.info(f"Loading Sesame CSM model on {device}...") @@ -120,52 +103,10 @@ def load_speech_models(): else: raise RuntimeError("Failed to load speech synthesis model on any device") - # Load ASR model (WhisperX) - try: - logger.info("Loading WhisperX model...") - # Start with the tiny model on CPU for reliable initialization - asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8") - logger.info("WhisperX 'tiny' model loaded on CPU successfully") - - # Try upgrading to GPU if available - if device == "cuda": - try: - logger.info("Trying to load WhisperX on CUDA...") - # Test with a tiny model first - test_audio = torch.zeros(16000) # 1 second of silence - - cuda_model = whisperx.load_model("tiny", "cuda", compute_type="float16") - # Test the model with real inference - _ = cuda_model.transcribe(test_audio.numpy(), batch_size=1) - asr_model = cuda_model - logger.info("WhisperX model running on CUDA successfully") - - # Try to upgrade to small model - try: - small_model = whisperx.load_model("small", "cuda", compute_type="float16") - _ = small_model.transcribe(test_audio.numpy(), batch_size=1) - asr_model = small_model - logger.info("WhisperX 'small' model loaded on CUDA successfully") - except Exception as e: - logger.warning(f"Staying with 'tiny' model on CUDA: {e}") - except Exception as e: - logger.warning(f"CUDA loading failed, staying with CPU model: {e}") - except Exception as e: - logger.error(f"Error loading WhisperX model: {e}") - # Create a minimal dummy model as last resort - class DummyModel: - def __init__(self): - self.device = "cpu" - def transcribe(self, *args, **kwargs): - return {"segments": [{"text": "Speech recognition currently unavailable."}]} - - asr_model = DummyModel() - logger.warning("Using dummy transcription model - ASR functionality limited") - - return generator, asr_model + return generator -# Load speech models -generator, asr_model = load_speech_models() +# Load speech model +generator = load_speech_models() # Set up Flask and Socket.IO app = Flask(__name__) @@ -307,63 +248,23 @@ def encode_audio_data(audio_tensor: torch.Tensor) -> str: buf.seek(0) return f"data:audio/wav;base64,{base64.b64encode(buf.read()).decode('utf-8')}" -def transcribe_audio(audio_tensor: torch.Tensor) -> str: - """Transcribe audio using WhisperX with robust error handling""" - global asr_model +def process_speech(audio_tensor: torch.Tensor, client_id: str) -> str: + """Process speech and return a simple response""" + # In this simplified version, we'll just check if there's sound + # and provide basic responses instead of doing actual 
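A quick sanity check of the detector is to feed it pure silence and a synthetic tone whose per-frame energy clears the 0.01 threshold; a sketch that assumes the SpeechDetector class defined above is importable or pasted alongside:

import math

import torch

sr = 24_000                                            # stand-in for generator.sample_rate
silence = torch.zeros(sr)                              # 1 s of silence
t = torch.arange(sr, dtype=torch.float32) / sr
tone = 0.2 * torch.sin(2 * math.pi * 220.0 * t)        # mean frame energy ~0.02

detector = SpeechDetector()
print(detector.detect_speech(silence, sr))  # False: every frame energy is 0.0
print(detector.detect_speech(tone, sr))     # True: 0.02 > min_speech_energy (0.01)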
speech recognition - try: - # Save the tensor to a temporary file - temp_path = os.path.join(base_dir, f"temp_audio_{time.time()}.wav") - torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate) + if speech_detector and speech_detector.detect_speech(audio_tensor, generator.sample_rate): + # Generate a response based on audio energy + energy = torch.mean(torch.abs(audio_tensor)).item() - logger.info(f"Transcribing audio file: {os.path.getsize(temp_path)} bytes") - - # Load the audio for WhisperX - try: - audio = whisperx.load_audio(temp_path) - except Exception as e: - logger.warning(f"WhisperX load_audio failed: {e}") - # Fall back to manual loading - import soundfile as sf - audio, sr = sf.read(temp_path) - if sr != 16000: # WhisperX expects 16kHz audio - from scipy import signal - audio = signal.resample(audio, int(len(audio) * 16000 / sr)) - - # Transcribe with error handling - try: - result = asr_model.transcribe(audio, batch_size=4) - except RuntimeError as e: - if "CUDA" in str(e) or "libcudnn" in str(e): - logger.warning(f"CUDA error in transcription, falling back to CPU: {e}") - try: - # Try CPU model - cpu_model = whisperx.load_model("tiny", "cpu", compute_type="int8") - result = cpu_model.transcribe(audio, batch_size=1) - # Update the global model if the original one is broken - asr_model = cpu_model - except Exception as cpu_e: - logger.error(f"CPU fallback failed: {cpu_e}") - return "I'm having trouble processing audio right now." - else: - raise - finally: - # Clean up - if os.path.exists(temp_path): - os.remove(temp_path) - - # Extract text from segments - if result["segments"] and len(result["segments"]) > 0: - transcription = " ".join([segment["text"] for segment in result["segments"]]) - logger.info(f"Transcription: '{transcription.strip()}'") - return transcription.strip() - - return "" - except Exception as e: - logger.error(f"Error in transcription: {e}") - if os.path.exists(temp_path): - os.remove(temp_path) - return "I heard something but couldn't understand it." + if energy > 0.1: # Louder speech + return "I heard you speaking clearly. How can I help you today?" + elif energy > 0.05: # Moderate speech + return "I heard you say something. Could you please repeat that?" + else: # Soft speech + return "I detected some speech, but it was quite soft. Could you speak up a bit?" + else: + return "I didn't detect any speech. Could you please try again?" def generate_response(text: str, conversation_history: List[Segment]) -> str: """Generate a contextual response based on the transcribed text""" @@ -394,7 +295,7 @@ def generate_response(text: str, conversation_history: List[Segment]) -> str: elif len(text) < 10: return "Thanks for your message. Could you elaborate a bit more?" else: - return f"I understand you said '{text}'. That's interesting! Can you tell me more about that?" + return f"I heard you speaking. That's interesting! Can you tell me more about that?" 
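generate_response stays a plain keyword lookup over the lowercased text, which keeps it trivial to unit-test or later swap for an LLM call; a trimmed, self-contained version of that matcher:

RESPONSES = {
    "hello": "Hello there! How can I help you today?",
    "bye": "Goodbye! It was nice chatting with you.",
    "help": "I can chat with you using natural voice. Just speak normally and I'll respond.",
}

def keyword_response(text: str) -> str:
    text_lower = text.lower()
    for key, response in RESPONSES.items():
        if key in text_lower:       # first matching keyword wins
            return response
    if not text:
        return "I didn't catch that. Could you please repeat?"
    if len(text) < 10:
        return "Thanks for your message. Could you elaborate a bit more?"
    return "I heard you speaking. That's interesting! Can you tell me more about that?"

assert keyword_response("Well, hello there") == RESPONSES["hello"]
assert keyword_response("") == "I didn't catch that. Could you please repeat?"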
# Flask Routes @app.route('/') @@ -610,33 +511,32 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=Fals # Combine audio chunks full_audio = torch.cat(client['streaming_buffer'], dim=0) - # Process with speech-to-text - logger.info(f"[{client_id[:8]}] Starting transcription...") - transcribed_text = transcribe_audio(full_audio) + # Process audio to generate a response (no speech recognition) + generated_text = process_speech(full_audio, client_id) # Add suffix for incomplete utterances if is_incomplete: - transcribed_text += " (processing continued speech...)" + generated_text += " (processing continued speech...)" - # Log the transcription - logger.info(f"[{client_id[:8]}] Transcribed: '{transcribed_text}'") + # Log the generated text + logger.info(f"[{client_id[:8]}] Generated text: '{generated_text}'") - # Handle the transcription result - if transcribed_text: + # Handle the result + if generated_text: # Add user message to context - user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio) + user_segment = Segment(text=generated_text, speaker=speaker_id, audio=full_audio) client['context_segments'].append(user_segment) - # Send the transcribed text to client + # Send the text to client emit('transcription', { 'type': 'transcription', - 'text': transcribed_text + 'text': generated_text }, room=client_id) # Only generate a response if this is a complete utterance if not is_incomplete: # Generate a contextual response - response_text = generate_response(transcribed_text, client['context_segments']) + response_text = generate_response(generated_text, client['context_segments']) logger.info(f"[{client_id[:8]}] Generating response: '{response_text}'") # Let the client know we're processing @@ -684,7 +584,7 @@ def process_complete_utterance(client_id, client, speaker_id, is_incomplete=Fals 'message': "Sorry, there was an error generating the audio response." }, room=client_id) else: - # If transcription failed, send a notification + # If processing failed, send a notification emit('error', { 'type': 'error', 'message': "Sorry, I couldn't understand what you said. Could you try again?" @@ -791,7 +691,7 @@ if __name__ == "__main__": print(f" - Network URL: http://:5000") print(f"{'='*60}") print(f"🌐 Device: {device.upper()}") - print(f"🧠 Models: Sesame CSM + WhisperX ASR") + print(f"🧠 Models: Sesame CSM (TTS only)") print(f"🔧 Serving from: {os.path.join(base_dir, 'index.html')}") print(f"{'='*60}") print(f"Ready to receive connections! Press Ctrl+C to stop the server.\n")
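Conversation state throughout these handlers is just a growing list of Segment(text, speaker, audio) objects passed back into generator.generate as context. A compact sketch of that bookkeeping, assuming the same Segment constructor and generate keywords used above (the canned reply text is illustrative):

import torch

from generator import load_csm_1b, Segment

generator = load_csm_1b(device="cpu")
context = []                              # grows across turns; the real server caps it

def ai_turn(user_text: str, user_audio: torch.Tensor, user_speaker: int = 0) -> torch.Tensor:
    # Record the user's turn first so the model hears it as context
    context.append(Segment(text=user_text, speaker=user_speaker, audio=user_audio))
    ai_speaker = 1 if user_speaker == 0 else 0
    reply_text = "Thanks for your message. Could you elaborate a bit more?"
    reply_audio = generator.generate(text=reply_text, speaker=ai_speaker, context=context)
    context.append(Segment(text=reply_text, speaker=ai_speaker, audio=reply_audio))
    return reply_audio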