Surya Vemulapalli
2025-03-30 00:20:29 -04:00
3 changed files with 1955 additions and 958 deletions



@@ -1,99 +1,276 @@
import os
import base64
import json
import asyncio
import torch
import torchaudio
import numpy as np
import io
import whisperx
from io import BytesIO
from typing import List, Dict, Any, Optional
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from flask import Flask, request, send_from_directory, Response
from flask_cors import CORS
from flask_socketio import SocketIO, emit, disconnect
from generator import load_csm_1b, Segment
import uvicorn
import time
import gc
from collections import deque
from threading import Lock
import math  # used below by stream_audio_to_client for chunk counts
# Select device: CUDA setup with robust error handling
try:
# Handle CUDA issues
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit to first GPU only
# Try enabling TF32 precision
try:
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
except:
pass # Ignore if not supported
# Check if CUDA is available
if torch.cuda.is_available():
try:
# Test CUDA functionality
x = torch.rand(10, device="cuda")
y = x + x
del x, y
device = "cuda"
compute_type = "float16"
print("CUDA is fully functional")
except Exception as cuda_error:
print(f"CUDA is available but not working correctly: {str(cuda_error)}")
device = "cpu"
compute_type = "int8"
else:
device = "cpu"
print(f"Using device: {device}")
compute_type = "int8"
except Exception as e:
print(f"Error setting up CUDA: {str(e)}")
device = "cpu"
compute_type = "int8"
# Initialize the model
print(f"Using device: {device} with compute type: {compute_type}")
# Initialize the Sesame CSM model with robust error handling
try:
print(f"Loading Sesame CSM model on {device}...")
generator = load_csm_1b(device=device)
print("Sesame CSM model loaded successfully")
except Exception as model_error:
print(f"Error loading Sesame CSM on {device}: {str(model_error)}")
if device == "cuda":
# Try on CPU as fallback
try:
print("Trying to load Sesame CSM on CPU instead...")
device = "cpu" # Update global device setting
generator = load_csm_1b(device="cpu")
print("Sesame CSM model loaded on CPU successfully")
except Exception as cpu_error:
print(f"Fatal error - could not load Sesame CSM model: {str(cpu_error)}")
raise RuntimeError("Failed to load speech synthesis model")
else:
# Already tried CPU and it failed
raise RuntimeError("Failed to load speech synthesis model on any device")
# Initialize WhisperX for ASR with robust error handling
print("Loading WhisperX model...")
asr_model = None # Initialize to None first to avoid scope issues
try:
# Always start with the tiny model on CPU for stability
asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8")
print("WhisperX 'tiny' model loaded on CPU successfully")
# If CPU works, try CUDA if available
if device == "cuda":
try:
print("Trying to load WhisperX on CUDA...")
cuda_model = whisperx.load_model("tiny", "cuda", compute_type="float16")
# Test the model to ensure it works
test_audio = torch.zeros(16000) # 1 second of silence at 16kHz
_ = cuda_model.transcribe(test_audio.numpy(), batch_size=1)
# If we get here, CUDA works
asr_model = cuda_model
print("WhisperX model moved to CUDA successfully")
# Try to upgrade to small model on CUDA
try:
small_model = whisperx.load_model("small", "cuda", compute_type="float16")
# Test it
_ = small_model.transcribe(test_audio.numpy(), batch_size=1)
asr_model = small_model
print("WhisperX 'small' model loaded on CUDA successfully")
except Exception as upgrade_error:
print(f"Staying with 'tiny' model on CUDA: {str(upgrade_error)}")
except Exception as cuda_error:
print(f"CUDA loading failed, staying with CPU model: {str(cuda_error)}")
except Exception as e:
print(f"Error loading WhisperX model: {str(e)}")
# Create a minimal dummy model as last resort
class DummyModel:
def __init__(self):
self.device = "cpu"
def transcribe(self, *args, **kwargs):
return {"segments": [{"text": "Speech recognition currently unavailable."}]}
asr_model = DummyModel()
print("WARNING: Using dummy transcription model - ASR functionality limited")
# Silence detection parameters
SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization
SILENCE_DURATION_SEC = 1.0 # How long silence must persist to be considered "stopped talking"
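# A minimal sketch (not wired into the handlers below; the function name is
# illustrative) of how these two constants drive end-of-utterance detection:
# a stream is treated as finished once the rolling average chunk energy stays
# below SILENCE_THRESHOLD for at least SILENCE_DURATION_SEC.
def _is_end_of_utterance(energy_window, silence_started_at, now):
    if not energy_window:
        return False
    avg_energy = sum(energy_window) / len(energy_window)
    if avg_energy >= SILENCE_THRESHOLD:
        return False  # still speaking
    return (now - silence_started_at) >= SILENCE_DURATION_SEC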
# Define the base directory
base_dir = os.path.dirname(os.path.abspath(__file__))
static_dir = os.path.join(base_dir, "static")
os.makedirs(static_dir, exist_ok=True)
# Setup Flask
app = Flask(__name__)
CORS(app)
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')
# Socket connection management
thread = None
thread_lock = Lock()
active_clients = {} # Map client_id to client context
# Helper function to convert audio data
def decode_audio_data(audio_data: str) -> torch.Tensor:
"""Decode base64 audio data to a torch tensor with improved error handling"""
try:
# Skip empty audio data
if not audio_data or len(audio_data) < 100:
print("Empty or too short audio data received")
return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence
# Extract the actual base64 content
if ',' in audio_data:
# Handle data URL format (data:audio/wav;base64,...)
audio_data = audio_data.split(',')[1]
# Decode base64 audio data
try:
binary_data = base64.b64decode(audio_data)
print(f"Decoded base64 data: {len(binary_data)} bytes")
# Check if we have enough data for a valid WAV
if len(binary_data) < 44: # WAV header is 44 bytes
print("Data too small to be a valid WAV file")
return torch.zeros(generator.sample_rate // 2)
except Exception as e:
print(f"Base64 decoding error: {str(e)}")
return torch.zeros(generator.sample_rate // 2)
# Load audio from binary data, explicitly specifying the format
# Save for debugging
debug_path = os.path.join(base_dir, "debug_incoming.wav")
with open(debug_path, 'wb') as f:
f.write(binary_data)
print(f"Saved debug file: {debug_path}")
# Approach 1: Load directly with torchaudio
try:
with BytesIO(binary_data) as temp_file:
temp_file.seek(0) # Ensure we're at the start of the buffer
audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")
print(f"Direct loading success: shape={audio_tensor.shape}, rate={sample_rate}Hz")
# Check if audio is valid
if audio_tensor.numel() == 0 or torch.isnan(audio_tensor).any():
raise ValueError("Empty or invalid audio tensor detected")
except Exception as e:
print(f"Direct loading failed: {str(e)}")
# Approach 2: Try to fix/normalize the WAV data
try:
# Sometimes WAV headers can be malformed, attempt to fix
temp_path = os.path.join(base_dir, "temp_fixing.wav")
with open(temp_path, 'wb') as f:
f.write(binary_data)
# Use a simpler numpy approach as backup
import numpy as np
import wave
try:
with wave.open(temp_path, 'rb') as wf:
n_channels = wf.getnchannels()
sample_width = wf.getsampwidth()
sample_rate = wf.getframerate()
n_frames = wf.getnframes()
# Read the frames
frames = wf.readframes(n_frames)
print(f"Wave reading: channels={n_channels}, rate={sample_rate}Hz, frames={n_frames}")
# Convert to numpy and then to torch
if sample_width == 2: # 16-bit audio
data = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0
elif sample_width == 1: # 8-bit audio
data = np.frombuffer(frames, dtype=np.uint8).astype(np.float32) / 128.0 - 1.0
else:
raise ValueError(f"Unsupported sample width: {sample_width}")
# Convert to mono if needed
if n_channels > 1:
data = data.reshape(-1, n_channels)
data = data.mean(axis=1)
# Convert to torch tensor
audio_tensor = torch.from_numpy(data)
print(f"Successfully converted with numpy: shape={audio_tensor.shape}")
except Exception as wave_error:
print(f"Wave processing failed: {str(wave_error)}")
# Try with torchaudio as last resort
audio_tensor, sample_rate = torchaudio.load(temp_path, format="wav")
# Clean up
if os.path.exists(temp_path):
os.remove(temp_path)
except Exception as e2:
print(f"All WAV loading methods failed: {str(e2)}")
print("Returning silence as fallback")
return torch.zeros(generator.sample_rate // 2)
# Ensure audio is the right shape (mono)
if len(audio_tensor.shape) > 1 and audio_tensor.shape[0] > 1:
audio_tensor = torch.mean(audio_tensor, dim=0)
# Ensure we have a 1D tensor
audio_tensor = audio_tensor.squeeze()
# Resample if needed
if sample_rate != generator.sample_rate:
try:
print(f"Resampling from {sample_rate}Hz to {generator.sample_rate}Hz")
resampler = torchaudio.transforms.Resample(
orig_freq=sample_rate,
new_freq=generator.sample_rate
)
audio_tensor = resampler(audio_tensor)
except Exception as e:
print(f"Resampling error: {str(e)}")
# If resampling fails, just return the original audio
# The model can often handle different sample rates
# Normalize audio to avoid issues
if torch.abs(audio_tensor).max() > 0:
audio_tensor = audio_tensor / torch.abs(audio_tensor).max()
print(f"Final audio tensor: shape={audio_tensor.shape}, min={audio_tensor.min().item():.4f}, max={audio_tensor.max().item():.4f}")
return audio_tensor
except Exception as e:
print(f"Error decoding audio: {str(e)}")
print(f"Unhandled error in decode_audio_data: {str(e)}")
# Return a small silent audio segment as fallback
return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence
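# Quick round-trip sketch for the decoder above (illustrative; nothing calls it
# at runtime): encode one second of synthetic audio the way the browser client
# does and check that it survives decoding.
def _selftest_decode_audio():
    test_signal = torch.rand(generator.sample_rate) * 2 - 1  # 1s of noise in [-1, 1]
    buf = BytesIO()
    torchaudio.save(buf, test_signal.unsqueeze(0), generator.sample_rate, format="wav")
    payload = "data:audio/wav;base64," + base64.b64encode(buf.getvalue()).decode("utf-8")
    decoded = decode_audio_data(payload)
    print(f"Round trip produced {decoded.shape[0]} samples at {generator.sample_rate}Hz")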
def encode_audio_data(audio_tensor: torch.Tensor) -> str:
"""Encode torch tensor audio to base64 string"""
buf = BytesIO()
torchaudio.save(buf, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav")
@@ -102,40 +279,72 @@ async def encode_audio_data(audio_tensor: torch.Tensor) -> str:
return f"data:audio/wav;base64,{audio_base64}"
def transcribe_audio(audio_tensor: torch.Tensor) -> str:
"""Transcribe audio using WhisperX with robust error handling"""
global asr_model # Declare global at the beginning of the function
try:
# Save the tensor to a temporary WAV file on disk (WhisperX requires a file path)
temp_path = os.path.join(base_dir, "temp_audio.wav")
torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate)
print(f"Transcribing audio file: {temp_path} (size: {os.path.getsize(temp_path)} bytes)")
# Load the audio file using whisperx's function
try:
audio = whisperx.load_audio(temp_path)
except Exception as audio_load_error:
print(f"WhisperX load_audio failed: {str(audio_load_error)}")
# Fall back to manual loading
import soundfile as sf
audio, sr = sf.read(temp_path)
if sr != 16000: # WhisperX expects 16kHz audio
from scipy import signal
audio = signal.resample(audio, int(len(audio) * 16000 / sr))
# Transcribe with error handling for CUDA issues
try:
# Try with original device
result = asr_model.transcribe(audio, batch_size=8)
except RuntimeError as cuda_error:
if "CUDA" in str(cuda_error) or "libcudnn" in str(cuda_error):
print(f"CUDA error in transcription, falling back to CPU: {str(cuda_error)}")
# Try to load a CPU model as fallback
try:
# Move model to CPU and try again
asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8")
result = asr_model.transcribe(audio, batch_size=1)
except Exception as e:
print(f"CPU fallback also failed: {str(e)}")
return "I'm having trouble processing audio right now."
else:
# Re-raise if it's not a CUDA error
raise
# Clean up
if os.path.exists(temp_path):
os.remove(temp_path)
# Get the transcription text
if result["segments"] and len(result["segments"]) > 0:
# Combine all segments
transcription = " ".join([segment["text"] for segment in result["segments"]])
print(f"Transcription: {transcription}")
print(f"Transcription successful: '{transcription.strip()}'")
return transcription.strip()
else:
print("Transcription returned no segments")
return ""
except Exception as e:
print(f"Error in transcription: {str(e)}")
return ""
import traceback
traceback.print_exc()
if os.path.exists("temp_audio.wav"):
os.remove("temp_audio.wav")
return "I heard something but couldn't understand it."
def generate_response(text: str, conversation_history: List[Segment]) -> str:
"""Generate a contextual response based on the transcribed text"""
# Simple response logic - can be replaced with a more sophisticated LLM in the future
responses = {
@@ -163,255 +372,417 @@ async def generate_response(text: str, conversation_history: List[Segment]) -> s
else:
return f"I understand you said '{text}'. That's interesting! Can you tell me more about that?"
# Flask routes for serving static content
@app.route('/')
def index():
return send_from_directory(base_dir, 'index.html')
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
await manager.connect(websocket)
context_segments = [] # Store conversation context
streaming_buffer = [] # Buffer for streaming audio chunks
is_streaming = False
@app.route('/favicon.ico')
def favicon():
if os.path.exists(os.path.join(static_dir, 'favicon.ico')):
return send_from_directory(static_dir, 'favicon.ico')
return Response(status=204)
@app.route('/voice-chat.js')
def voice_chat_js():
return send_from_directory(base_dir, 'voice-chat.js')
@app.route('/static/<path:path>')
def serve_static(path):
return send_from_directory(static_dir, path)
# Socket.IO event handlers
@socketio.on('connect')
def handle_connect():
client_id = request.sid
print(f"Client connected: {client_id}")
# Initialize client context
active_clients[client_id] = {
'context_segments': [],
'streaming_buffer': [],
'is_streaming': False,
'is_silence': False,
'last_active_time': time.time(),
'energy_window': deque(maxlen=10)
}
emit('status', {'type': 'connected', 'message': 'Connected to server'})
@socketio.on('disconnect')
def handle_disconnect():
client_id = request.sid
if client_id in active_clients:
del active_clients[client_id]
print(f"Client disconnected: {client_id}")
@socketio.on('generate')
def handle_generate(data):
client_id = request.sid
if client_id not in active_clients:
emit('error', {'message': 'Client not registered'})
return
try:
while True:
# Receive JSON data from client
data = await websocket.receive_text()
request = json.loads(data)
text = data.get('text', '')
speaker_id = data.get('speaker', 0)
action = request.get("action")
if action == "generate":
try:
text = request.get("text", "")
speaker_id = request.get("speaker", 0)
print(f"Generating audio for: '{text}' with speaker {speaker_id}")
# Generate audio response
print(f"Generating audio for: '{text}' with speaker {speaker_id}")
audio_tensor = generator.generate(
text=text,
speaker=speaker_id,
context=context_segments,
context=active_clients[client_id]['context_segments'],
max_audio_length_ms=10_000,
)
# Add to conversation context
context_segments.append(Segment(text=text, speaker=speaker_id, audio=audio_tensor))
active_clients[client_id]['context_segments'].append(
Segment(text=text, speaker=speaker_id, audio=audio_tensor)
)
# Convert audio to base64 and send back to client
audio_base64 = await encode_audio_data(audio_tensor)
await websocket.send_json({
"type": "audio_response",
"audio": audio_base64
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'audio': audio_base64
})
except Exception as e:
print(f"Error generating audio: {str(e)}")
await websocket.send_json({
"type": "error",
"message": f"Error generating audio: {str(e)}"
emit('error', {
'type': 'error',
'message': f"Error generating audio: {str(e)}"
})
elif action == "add_to_context":
@socketio.on('add_to_context')
def handle_add_to_context(data):
client_id = request.sid
if client_id not in active_clients:
emit('error', {'message': 'Client not registered'})
return
try:
text = request.get("text", "")
speaker_id = request.get("speaker", 0)
audio_data = request.get("audio", "")
text = data.get('text', '')
speaker_id = data.get('speaker', 0)
audio_data = data.get('audio', '')
# Convert received audio to tensor
audio_tensor = await decode_audio_data(audio_data)
audio_tensor = decode_audio_data(audio_data)
# Add to conversation context
context_segments.append(Segment(text=text, speaker=speaker_id, audio=audio_tensor))
active_clients[client_id]['context_segments'].append(
Segment(text=text, speaker=speaker_id, audio=audio_tensor)
)
await websocket.send_json({
"type": "context_updated",
"message": "Audio added to context"
emit('context_updated', {
'type': 'context_updated',
'message': 'Audio added to context'
})
except Exception as e:
print(f"Error adding to context: {str(e)}")
await websocket.send_json({
"type": "error",
"message": f"Error processing audio: {str(e)}"
emit('error', {
'type': 'error',
'message': f"Error processing audio: {str(e)}"
})
elif action == "clear_context":
context_segments = []
await websocket.send_json({
"type": "context_updated",
"message": "Context cleared"
@socketio.on('clear_context')
def handle_clear_context():
client_id = request.sid
if client_id in active_clients:
active_clients[client_id]['context_segments'] = []
emit('context_updated', {
'type': 'context_updated',
'message': 'Context cleared'
})
elif action == "stream_audio":
@socketio.on('stream_audio')
def handle_stream_audio(data):
client_id = request.sid
if client_id not in active_clients:
emit('error', {'message': 'Client not registered'})
return
client = active_clients[client_id]
try:
speaker_id = request.get("speaker", 0)
audio_data = request.get("audio", "")
speaker_id = data.get('speaker', 0)
audio_data = data.get('audio', '')
# Convert received audio to tensor
audio_chunk = await decode_audio_data(audio_data)
audio_chunk = decode_audio_data(audio_data)
# Start streaming mode if not already started
if not is_streaming:
is_streaming = True
streaming_buffer = []
energy_window.clear()
is_silence = False
last_active_time = time.time()
await websocket.send_json({
"type": "streaming_status",
"status": "started"
if not client['is_streaming']:
client['is_streaming'] = True
client['streaming_buffer'] = []
client['energy_window'].clear()
client['is_silence'] = False
client['last_active_time'] = time.time()
print(f"[{client_id}] Streaming started with speaker ID: {speaker_id}")
emit('streaming_status', {
'type': 'streaming_status',
'status': 'started'
})
# Calculate audio energy for silence detection
chunk_energy = torch.mean(torch.abs(audio_chunk)).item()
energy_window.append(chunk_energy)
avg_energy = sum(energy_window) / len(energy_window)
client['energy_window'].append(chunk_energy)
avg_energy = sum(client['energy_window']) / len(client['energy_window'])
# Check if audio is silent
current_silence = avg_energy < SILENCE_THRESHOLD
# Track silence transition
if not is_silence and current_silence:
if not client['is_silence'] and current_silence:
# Transition to silence
is_silence = True
last_active_time = time.time()
elif is_silence and not current_silence:
client['is_silence'] = True
client['last_active_time'] = time.time()
elif client['is_silence'] and not current_silence:
# User started talking again
is_silence = False
client['is_silence'] = False
# Add chunk to buffer regardless of silence state
streaming_buffer.append(audio_chunk)
client['streaming_buffer'].append(audio_chunk)
# Check if silence has persisted long enough to consider "stopped talking"
silence_elapsed = time.time() - last_active_time
silence_elapsed = time.time() - client['last_active_time']
if is_silence and silence_elapsed >= SILENCE_DURATION_SEC and len(streaming_buffer) > 0:
if client['is_silence'] and silence_elapsed >= SILENCE_DURATION_SEC and len(client['streaming_buffer']) > 0:
# User has stopped talking - process the collected audio
full_audio = torch.cat(streaming_buffer, dim=0)
print(f"[{client_id}] Processing audio after {silence_elapsed:.2f}s of silence")
full_audio = torch.cat(client['streaming_buffer'], dim=0)
# Process with WhisperX speech-to-text
transcribed_text = await transcribe_audio(full_audio)
print(f"[{client_id}] Starting transcription with WhisperX...")
transcribed_text = transcribe_audio(full_audio)
# Log the transcription
print(f"Transcribed text: '{transcribed_text}'")
print(f"[{client_id}] Transcribed text: '{transcribed_text}'")
# Add to conversation context
# Handle the transcription result
if transcribed_text:
# Add user message to context
user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
context_segments.append(user_segment)
# Generate a contextual response
response_text = await generate_response(transcribed_text, context_segments)
client['context_segments'].append(user_segment)
# Send the transcribed text to client
await websocket.send_json({
"type": "transcription",
"text": transcribed_text
emit('transcription', {
'type': 'transcription',
'text': transcribed_text
})
# Generate a contextual response
response_text = generate_response(transcribed_text, client['context_segments'])
print(f"[{client_id}] Generating audio response: '{response_text}'")
# Let the client know we're processing
emit('processing_status', {
'type': 'processing_status',
'status': 'generating_audio',
'message': 'Generating audio response...'
})
# Generate audio for the response
try:
# Use a different speaker than the user
ai_speaker_id = 1 if speaker_id == 0 else 0
# Start audio generation with streaming (chunk by chunk)
audio_chunks = []
# This version tries to stream the audio generation in smaller chunks
# Note: CSM model doesn't natively support incremental generation,
# so we're simulating it here for a more responsive UI experience
# Generate the full response
audio_tensor = generator.generate(
text=response_text,
speaker=1 if speaker_id == 0 else 0, # Use opposite speaker
context=context_segments,
speaker=ai_speaker_id,
context=client['context_segments'],
max_audio_length_ms=10_000,
)
# Add response to context
ai_segment = Segment(
text=response_text,
speaker=1 if speaker_id == 0 else 0,
speaker=ai_speaker_id,
audio=audio_tensor
)
context_segments.append(ai_segment)
client['context_segments'].append(ai_segment)
# Convert audio to base64 and send back to client
audio_base64 = await encode_audio_data(audio_tensor)
await websocket.send_json({
"type": "audio_response",
"text": response_text,
"audio": audio_base64
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'text': response_text,
'audio': audio_base64
})
print(f"[{client_id}] Audio response sent: {len(audio_base64)} bytes")
except Exception as gen_error:
print(f"Error generating audio response: {str(gen_error)}")
emit('error', {
'type': 'error',
'message': "Sorry, there was an error generating the audio response."
})
else:
# If transcription failed, send a generic response
await websocket.send_json({
"type": "error",
"message": "Sorry, I couldn't understand what you said. Could you try again?"
emit('error', {
'type': 'error',
'message': "Sorry, I couldn't understand what you said. Could you try again?"
})
# Clear buffer and reset silence detection
streaming_buffer = []
energy_window.clear()
is_silence = False
last_active_time = time.time()
client['streaming_buffer'] = []
client['energy_window'].clear()
client['is_silence'] = False
client['last_active_time'] = time.time()
# If buffer gets too large without silence, process it anyway
# This prevents memory issues with very long streams
elif len(streaming_buffer) >= 30: # ~6 seconds of audio at 5 chunks/sec
print("Buffer limit reached, processing audio")
full_audio = torch.cat(streaming_buffer, dim=0)
elif len(client['streaming_buffer']) >= 30: # ~6 seconds of audio at 5 chunks/sec
print(f"[{client_id}] Processing long audio segment without silence")
full_audio = torch.cat(client['streaming_buffer'], dim=0)
# Process with WhisperX speech-to-text
transcribed_text = await transcribe_audio(full_audio)
transcribed_text = transcribe_audio(full_audio)
if transcribed_text:
context_segments.append(Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio))
client['context_segments'].append(
Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
)
# Send the transcribed text to client
await websocket.send_json({
"type": "transcription",
"text": transcribed_text + " (processing continued speech...)"
emit('transcription', {
'type': 'transcription',
'text': transcribed_text + " (processing continued speech...)"
})
streaming_buffer = []
# Keep half of the buffer for context (sliding window approach)
half_point = len(client['streaming_buffer']) // 2
client['streaming_buffer'] = client['streaming_buffer'][half_point:]
except Exception as e:
import traceback
traceback.print_exc()
print(f"Error processing streaming audio: {str(e)}")
await websocket.send_json({
"type": "error",
"message": f"Error processing streaming audio: {str(e)}"
emit('error', {
'type': 'error',
'message': f"Error processing streaming audio: {str(e)}"
})
elif action == "stop_streaming":
is_streaming = False
if streaming_buffer and len(streaming_buffer) > 5: # Only process if there's meaningful audio
@socketio.on('stop_streaming')
def handle_stop_streaming(data):
client_id = request.sid
if client_id not in active_clients:
return
client = active_clients[client_id]
client['is_streaming'] = False
if client['streaming_buffer'] and len(client['streaming_buffer']) > 5:
# Process any remaining audio in the buffer
full_audio = torch.cat(streaming_buffer, dim=0)
full_audio = torch.cat(client['streaming_buffer'], dim=0)
# Process with WhisperX speech-to-text
transcribed_text = await transcribe_audio(full_audio)
transcribed_text = transcribe_audio(full_audio)
if transcribed_text:
context_segments.append(Segment(text=transcribed_text, speaker=request.get("speaker", 0), audio=full_audio))
client['context_segments'].append(
Segment(text=transcribed_text, speaker=data.get("speaker", 0), audio=full_audio)
)
# Send the transcribed text to client
await websocket.send_json({
"type": "transcription",
"text": transcribed_text
emit('transcription', {
'type': 'transcription',
'text': transcribed_text
})
streaming_buffer = []
await websocket.send_json({
"type": "streaming_status",
"status": "stopped"
client['streaming_buffer'] = []
emit('streaming_status', {
'type': 'streaming_status',
'status': 'stopped'
})
def stream_audio_to_client(client_id, audio_tensor, text, speaker_id, chunk_size_ms=500):
"""Stream audio to client in chunks to simulate real-time generation"""
try:
if client_id not in active_clients:
print(f"Client {client_id} not found for streaming")
return
# Calculate chunk size in samples
chunk_size = int(generator.sample_rate * chunk_size_ms / 1000)
total_chunks = math.ceil(audio_tensor.size(0) / chunk_size)
print(f"Streaming audio in {total_chunks} chunks of {chunk_size_ms}ms each")
# Send initial response with text but no audio yet
socketio.emit('audio_response_start', {
'type': 'audio_response_start',
'text': text,
'total_chunks': total_chunks
}, room=client_id)
# Stream each chunk
for i in range(total_chunks):
start_idx = i * chunk_size
end_idx = min(start_idx + chunk_size, audio_tensor.size(0))
# Extract chunk
chunk = audio_tensor[start_idx:end_idx]
# Encode chunk
chunk_base64 = encode_audio_data(chunk)
# Send chunk
socketio.emit('audio_response_chunk', {
'type': 'audio_response_chunk',
'chunk_index': i,
'total_chunks': total_chunks,
'audio': chunk_base64,
'is_last': i == total_chunks - 1
}, room=client_id)
# Brief pause between chunks to simulate streaming
time.sleep(0.1)
# Send completion message
socketio.emit('audio_response_complete', {
'type': 'audio_response_complete',
'text': text
}, room=client_id)
print(f"Audio streaming complete: {total_chunks} chunks sent")
except Exception as e:
print(f"Error streaming audio to client: {str(e)}")
import traceback
traceback.print_exc()
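# Note: stream_audio_to_client above is not invoked by any handler in this file.
# A hedged sketch of wiring it in from handle_stream_audio, instead of emitting
# the whole clip in a single 'audio_response' event (illustrative only):
#
#     socketio.start_background_task(
#         stream_audio_to_client,
#         client_id, audio_tensor, response_text, ai_speaker_id,
#     )
#
# start_background_task runs the chunk loop outside the handler, so the per-chunk
# time.sleep(0.1) pauses do not block the Socket.IO event handler.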
if __name__ == "__main__":
print(f"\n{'='*60}")
print(f"🔊 Sesame AI Voice Chat Server (Flask Implementation)")
print(f"{'='*60}")
print(f"📡 Server Information:")
print(f" - Local URL: http://localhost:5000")
print(f" - Network URL: http://<your-ip-address>:5000")
print(f" - WebSocket: ws://<your-ip-address>:5000/socket.io")
print(f"{'='*60}")
print(f"💡 To make this server public:")
print(f" 1. Ensure port 5000 is open in your firewall")
print(f" 2. Set up port forwarding on your router to port 5000")
print(f" 3. Or use a service like ngrok with: ngrok http 5000")
print(f"{'='*60}")
print(f"🌐 Device: {device.upper()}")
print(f"🧠 Models loaded: Sesame CSM + WhisperX ({asr_model.device})")
print(f"🔧 Serving from: {os.path.join(base_dir, 'index.html')}")
print(f"{'='*60}")
print(f"Ready to receive connections! Press Ctrl+C to stop the server.\n")
socketio.run(app, host="0.0.0.0", port=5000, debug=False)

Backend/voice-chat.js Normal file

@@ -0,0 +1,852 @@
/**
* Sesame AI Voice Chat Client
*
* A web client that connects to a Sesame AI voice chat server and enables
* real-time voice conversation with an AI assistant.
*/
// Configuration constants
const SERVER_URL = window.location.hostname === 'localhost' ?
'http://localhost:5000' : window.location.origin;
const ENERGY_WINDOW_SIZE = 15;
const CLIENT_SILENCE_DURATION_MS = 750;
// DOM elements
const elements = {
conversation: null,
streamButton: null,
clearButton: null,
thresholdSlider: null,
thresholdValue: null,
visualizerCanvas: null,
visualizerLabel: null,
volumeLevel: null,
statusDot: null,
statusText: null,
speakerSelection: null,
autoPlayResponses: null,
showVisualizer: null
};
// Application state
const state = {
socket: null,
audioContext: null,
analyser: null,
microphone: null,
streamProcessor: null,
isStreaming: false,
isSpeaking: false,
silenceThreshold: 0.01,
energyWindow: [],
silenceTimer: null,
volumeUpdateInterval: null,
visualizerAnimationFrame: null,
currentSpeaker: 0
};
// Visualizer variables
let canvasContext = null;
let visualizerBufferLength = 0;
let visualizerDataArray = null;
// Initialize the application
function initializeApp() {
// Initialize the UI elements
initializeUIElements();
// Initialize socket.io connection
setupSocketConnection();
// Setup event listeners
setupEventListeners();
// Initialize visualizer
setupVisualizer();
// Show welcome message
addSystemMessage('Welcome to Sesame AI Voice Chat! Click "Start Conversation" to begin.');
}
// Initialize UI elements
function initializeUIElements() {
// Store references to UI elements
elements.conversation = document.getElementById('conversation');
elements.streamButton = document.getElementById('streamButton');
elements.clearButton = document.getElementById('clearButton');
elements.thresholdSlider = document.getElementById('thresholdSlider');
elements.thresholdValue = document.getElementById('thresholdValue');
elements.visualizerCanvas = document.getElementById('audioVisualizer');
elements.visualizerLabel = document.getElementById('visualizerLabel');
elements.volumeLevel = document.getElementById('volumeLevel');
elements.statusDot = document.getElementById('statusDot');
elements.statusText = document.getElementById('statusText');
elements.speakerSelection = document.getElementById('speakerSelect'); // Changed to match HTML
elements.autoPlayResponses = document.getElementById('autoPlayResponses');
elements.showVisualizer = document.getElementById('showVisualizer');
}
// Setup Socket.IO connection
function setupSocketConnection() {
state.socket = io(SERVER_URL);
// Connection events
state.socket.on('connect', () => {
console.log('Connected to server');
updateConnectionStatus(true);
});
state.socket.on('disconnect', () => {
console.log('Disconnected from server');
updateConnectionStatus(false);
// Stop streaming if active
if (state.isStreaming) {
stopStreaming(false);
}
});
state.socket.on('error', (data) => {
console.error('Socket error:', data.message);
addSystemMessage(`Error: ${data.message}`);
});
// Register message handlers
state.socket.on('audio_response', handleAudioResponse);
state.socket.on('transcription', handleTranscription);
state.socket.on('context_updated', handleContextUpdate);
state.socket.on('streaming_status', handleStreamingStatus);
}
// Setup event listeners
function setupEventListeners() {
// Stream button
elements.streamButton.addEventListener('click', toggleStreaming);
// Clear button
elements.clearButton.addEventListener('click', clearConversation);
// Threshold slider
elements.thresholdSlider.addEventListener('input', updateThreshold);
// Speaker selection
elements.speakerSelection.addEventListener('change', () => {
state.currentSpeaker = parseInt(elements.speakerSelection.value, 10);
});
// Visualizer toggle
elements.showVisualizer.addEventListener('change', toggleVisualizerVisibility);
}
// Setup audio visualizer
function setupVisualizer() {
if (!elements.visualizerCanvas) return;
canvasContext = elements.visualizerCanvas.getContext('2d');
// Set canvas dimensions
elements.visualizerCanvas.width = elements.visualizerCanvas.offsetWidth;
elements.visualizerCanvas.height = elements.visualizerCanvas.offsetHeight;
// Initialize the visualizer
drawVisualizer();
}
// Update connection status UI
function updateConnectionStatus(isConnected) {
elements.statusDot.classList.toggle('active', isConnected);
elements.statusText.textContent = isConnected ? 'Connected' : 'Disconnected';
}
// Toggle streaming state
function toggleStreaming() {
if (state.isStreaming) {
stopStreaming(true);
} else {
startStreaming();
}
}
// Start streaming audio to the server
function startStreaming() {
if (state.isStreaming) return;
// Request microphone access
navigator.mediaDevices.getUserMedia({ audio: true, video: false })
.then(stream => {
// Show processing state while setting up
elements.streamButton.innerHTML = '<i class="fas fa-spinner fa-spin"></i> Initializing...';
// Create audio context
state.audioContext = new (window.AudioContext || window.webkitAudioContext)();
// Create microphone source
state.microphone = state.audioContext.createMediaStreamSource(stream);
// Create analyser for visualizer
state.analyser = state.audioContext.createAnalyser();
state.analyser.fftSize = 256;
visualizerBufferLength = state.analyser.frequencyBinCount;
visualizerDataArray = new Uint8Array(visualizerBufferLength);
// Connect microphone to analyser
state.microphone.connect(state.analyser);
// Create script processor for audio processing
const bufferSize = 4096;
state.streamProcessor = state.audioContext.createScriptProcessor(bufferSize, 1, 1);
// Set up audio processing callback
state.streamProcessor.onaudioprocess = handleAudioProcess;
// Connect the processors
state.analyser.connect(state.streamProcessor);
state.streamProcessor.connect(state.audioContext.destination);
// Update UI
state.isStreaming = true;
elements.streamButton.innerHTML = '<i class="fas fa-microphone"></i> Listening...';
elements.streamButton.classList.add('recording');
// Initialize energy window
state.energyWindow = [];
// Start volume meter updates
state.volumeUpdateInterval = setInterval(updateVolumeMeter, 100);
// Start visualizer if enabled
if (elements.showVisualizer.checked && !state.visualizerAnimationFrame) {
drawVisualizer();
}
// Show starting message
addSystemMessage('Listening... Speak clearly into your microphone.');
// Notify the server that we're starting
state.socket.emit('stream_audio', {
audio: '',
speaker: state.currentSpeaker
});
})
.catch(err => {
console.error('Error accessing microphone:', err);
addSystemMessage(`Error: ${err.message}. Please make sure your microphone is connected and you've granted permission.`);
elements.streamButton.innerHTML = '<i class="fas fa-microphone"></i> Start Conversation';
});
}
// Stop streaming audio
function stopStreaming(notifyServer = true) {
if (!state.isStreaming) return;
// Update UI first
elements.streamButton.innerHTML = '<i class="fas fa-microphone"></i> Start Conversation';
elements.streamButton.classList.remove('recording');
elements.streamButton.classList.remove('processing');
// Stop volume meter updates
if (state.volumeUpdateInterval) {
clearInterval(state.volumeUpdateInterval);
state.volumeUpdateInterval = null;
}
// Stop all audio processing
if (state.streamProcessor) {
state.streamProcessor.disconnect();
state.streamProcessor = null;
}
if (state.analyser) {
state.analyser.disconnect();
}
if (state.microphone) {
state.microphone.disconnect();
}
// Close audio context
if (state.audioContext && state.audioContext.state !== 'closed') {
state.audioContext.close().catch(err => console.warn('Error closing audio context:', err));
}
// Cleanup animation frames
if (state.visualizerAnimationFrame) {
cancelAnimationFrame(state.visualizerAnimationFrame);
state.visualizerAnimationFrame = null;
}
// Reset state
state.isStreaming = false;
state.isSpeaking = false;
// Notify the server
if (notifyServer && state.socket && state.socket.connected) {
state.socket.emit('stop_streaming', {
speaker: state.currentSpeaker
});
}
// Show message
addSystemMessage('Conversation paused. Click "Start Conversation" to resume.');
}
// Handle audio processing
function handleAudioProcess(event) {
const inputData = event.inputBuffer.getChannelData(0);
// Log audio buffer statistics
console.log(`Audio buffer: length=${inputData.length}, sample rate=${state.audioContext.sampleRate}Hz`);
// Calculate audio energy (volume level)
const energy = calculateAudioEnergy(inputData);
console.log(`Energy: ${energy.toFixed(6)}, threshold: ${state.silenceThreshold}`);
// Update energy window for averaging
updateEnergyWindow(energy);
// Calculate average energy
const avgEnergy = calculateAverageEnergy();
// Determine if audio is silent
const isSilent = avgEnergy < state.silenceThreshold;
console.log(`Silent: ${isSilent ? 'Yes' : 'No'}, avg energy: ${avgEnergy.toFixed(6)}`);
// Handle speech state based on silence
handleSpeechState(isSilent);
// Only send audio chunk if we detect speech
if (!isSilent) {
// Create a resampled version at 24kHz for the server
// Most WebRTC audio is 48kHz, but we want 24kHz for the model
const resampledData = downsampleBuffer(inputData, state.audioContext.sampleRate, 24000);
console.log(`Resampled audio: ${state.audioContext.sampleRate}Hz → 24000Hz, new length: ${resampledData.length}`);
// Send the audio chunk to the server
sendAudioChunk(resampledData, state.currentSpeaker);
}
}
// Cleanup audio resources when done
function cleanupAudioResources() {
// Stop all audio processing
if (state.streamProcessor) {
state.streamProcessor.disconnect();
state.streamProcessor = null;
}
if (state.analyser) {
state.analyser.disconnect();
state.analyser = null;
}
if (state.microphone) {
state.microphone.disconnect();
state.microphone = null;
}
// Close audio context
if (state.audioContext && state.audioContext.state !== 'closed') {
state.audioContext.close().catch(err => console.warn('Error closing audio context:', err));
}
// Cancel all timers and animation frames
if (state.volumeUpdateInterval) {
clearInterval(state.volumeUpdateInterval);
state.volumeUpdateInterval = null;
}
if (state.visualizerAnimationFrame) {
cancelAnimationFrame(state.visualizerAnimationFrame);
state.visualizerAnimationFrame = null;
}
if (state.silenceTimer) {
clearTimeout(state.silenceTimer);
state.silenceTimer = null;
}
}
// Clear conversation history
function clearConversation() {
if (elements.conversation) {
elements.conversation.innerHTML = '';
addSystemMessage('Conversation cleared.');
// Notify server to clear context
if (state.socket && state.socket.connected) {
state.socket.emit('clear_context');
}
}
}
// Calculate audio energy (volume)
function calculateAudioEnergy(buffer) {
let sum = 0;
for (let i = 0; i < buffer.length; i++) {
sum += buffer[i] * buffer[i];
}
return Math.sqrt(sum / buffer.length);
}
// Update energy window for averaging
function updateEnergyWindow(energy) {
state.energyWindow.push(energy);
if (state.energyWindow.length > ENERGY_WINDOW_SIZE) {
state.energyWindow.shift();
}
}
// Calculate average energy from window
function calculateAverageEnergy() {
if (state.energyWindow.length === 0) return 0;
const sum = state.energyWindow.reduce((a, b) => a + b, 0);
return sum / state.energyWindow.length;
}
// Update the threshold from the slider
function updateThreshold() {
state.silenceThreshold = parseFloat(elements.thresholdSlider.value);
elements.thresholdValue.textContent = state.silenceThreshold.toFixed(3);
}
// Update the volume meter display
function updateVolumeMeter() {
if (!state.isStreaming || !state.energyWindow.length) return;
const avgEnergy = calculateAverageEnergy();
// Scale energy to percentage (0-100)
// Typically, energy values will be very small (e.g., 0.001 to 0.1)
// So we multiply by a factor to make it more visible
const scaleFactor = 1000;
const percentage = Math.min(100, Math.max(0, avgEnergy * scaleFactor));
// Update volume meter width
elements.volumeLevel.style.width = `${percentage}%`;
// Change color based on level
if (percentage > 70) {
elements.volumeLevel.style.backgroundColor = '#ff5252';
} else if (percentage > 30) {
elements.volumeLevel.style.backgroundColor = '#4CAF50';
} else {
elements.volumeLevel.style.backgroundColor = '#4c84ff';
}
}
// Handle speech/silence state transitions
function handleSpeechState(isSilent) {
if (state.isSpeaking && isSilent) {
// Transition from speaking to silence
if (!state.silenceTimer) {
state.silenceTimer = setTimeout(() => {
// Only consider it a real silence after a certain duration
// This prevents detecting brief pauses as the end of speech
state.isSpeaking = false;
state.silenceTimer = null;
}, CLIENT_SILENCE_DURATION_MS);
}
} else if (state.silenceTimer && !isSilent) {
// User started speaking again, cancel the silence timer
clearTimeout(state.silenceTimer);
state.silenceTimer = null;
}
// Update speaking state for non-silent audio
if (!isSilent) {
state.isSpeaking = true;
}
}
// Send audio chunk to server
function sendAudioChunk(audioData, speaker) {
if (!state.socket || !state.socket.connected) {
console.warn('Socket not connected');
return;
}
console.log(`Preparing audio chunk: length=${audioData.length}, speaker=${speaker}`);
// Check for NaN or invalid values
let hasInvalidValues = false;
for (let i = 0; i < audioData.length; i++) {
if (isNaN(audioData[i]) || !isFinite(audioData[i])) {
hasInvalidValues = true;
console.warn(`Invalid audio value at index ${i}: ${audioData[i]}`);
break;
}
}
if (hasInvalidValues) {
console.warn('Audio data contains invalid values. Creating silent audio.');
audioData = new Float32Array(audioData.length).fill(0);
}
try {
// Create WAV blob
const wavData = createWavBlob(audioData, 24000);
console.log(`WAV blob created: ${wavData.size} bytes`);
const reader = new FileReader();
reader.onloadend = function() {
try {
// Get base64 data
const base64data = reader.result;
console.log(`Base64 data created: ${base64data.length} bytes`);
// Send to server
state.socket.emit('stream_audio', {
audio: base64data,
speaker: speaker
});
console.log('Audio chunk sent to server');
} catch (err) {
console.error('Error preparing audio data:', err);
}
};
reader.onerror = function() {
console.error('Error reading audio data as base64');
};
reader.readAsDataURL(wavData);
} catch (err) {
console.error('Error creating WAV data:', err);
}
}
// Create WAV blob from audio data with improved error handling
function createWavBlob(audioData, sampleRate) {
// Validate input
if (!audioData || audioData.length === 0) {
console.warn('Empty audio data provided to createWavBlob');
audioData = new Float32Array(1024).fill(0); // Create 1024 samples of silence
}
// Function to convert Float32Array to Int16Array for WAV format
function floatTo16BitPCM(output, offset, input) {
for (let i = 0; i < input.length; i++, offset += 2) {
// Ensure values are in -1 to 1 range
const s = Math.max(-1, Math.min(1, input[i]));
// Convert to 16-bit PCM
output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
}
}
// Create WAV header
function writeString(view, offset, string) {
for (let i = 0; i < string.length; i++) {
view.setUint8(offset + i, string.charCodeAt(i));
}
}
try {
// Create WAV file with header - careful with buffer sizes
const buffer = new ArrayBuffer(44 + audioData.length * 2);
const view = new DataView(buffer);
// RIFF identifier
writeString(view, 0, 'RIFF');
// RIFF chunk size (total file length minus the 8-byte RIFF header)
view.setUint32(4, 36 + audioData.length * 2, true);
// WAVE identifier
writeString(view, 8, 'WAVE');
// fmt chunk identifier
writeString(view, 12, 'fmt ');
// fmt chunk length
view.setUint32(16, 16, true);
// Sample format (1 is PCM)
view.setUint16(20, 1, true);
// Mono channel
view.setUint16(22, 1, true);
// Sample rate
view.setUint32(24, sampleRate, true);
// Byte rate (sample rate * block align)
view.setUint32(28, sampleRate * 2, true);
// Block align (channels * bytes per sample)
view.setUint16(32, 2, true);
// Bits per sample
view.setUint16(34, 16, true);
// data chunk identifier
writeString(view, 36, 'data');
// data chunk length
view.setUint32(40, audioData.length * 2, true);
// Write the PCM samples
floatTo16BitPCM(view, 44, audioData);
// Create and return blob
return new Blob([view], { type: 'audio/wav' });
} catch (err) {
console.error('Error in createWavBlob:', err);
// Create a minimal valid WAV file with silence as fallback
const fallbackSamples = new Float32Array(1024).fill(0);
const fallbackBuffer = new ArrayBuffer(44 + fallbackSamples.length * 2);
const fallbackView = new DataView(fallbackBuffer);
writeString(fallbackView, 0, 'RIFF');
fallbackView.setUint32(4, 36 + fallbackSamples.length * 2, true);
writeString(fallbackView, 8, 'WAVE');
writeString(fallbackView, 12, 'fmt ');
fallbackView.setUint32(16, 16, true);
fallbackView.setUint16(20, 1, true);
fallbackView.setUint16(22, 1, true);
fallbackView.setUint32(24, sampleRate, true);
fallbackView.setUint32(28, sampleRate * 2, true);
fallbackView.setUint16(32, 2, true);
fallbackView.setUint16(34, 16, true);
writeString(fallbackView, 36, 'data');
fallbackView.setUint32(40, fallbackSamples.length * 2, true);
floatTo16BitPCM(fallbackView, 44, fallbackSamples);
return new Blob([fallbackView], { type: 'audio/wav' });
}
}
// Draw audio visualizer
function drawVisualizer() {
if (!canvasContext) {
return;
}
state.visualizerAnimationFrame = requestAnimationFrame(drawVisualizer);
// Skip drawing if visualizer is hidden
if (!elements.showVisualizer.checked) {
if (elements.visualizerCanvas.style.opacity !== '0') {
elements.visualizerCanvas.style.opacity = '0';
}
return;
} else if (elements.visualizerCanvas.style.opacity !== '1') {
elements.visualizerCanvas.style.opacity = '1';
}
// Get frequency data if available
if (state.isStreaming && state.analyser) {
try {
state.analyser.getByteFrequencyData(visualizerDataArray);
} catch (e) {
console.warn('Error getting frequency data:', e);
}
} else if (visualizerDataArray) {
// Fade out when not streaming
for (let i = 0; i < visualizerDataArray.length; i++) {
visualizerDataArray[i] = Math.max(0, visualizerDataArray[i] - 5);
}
}
// Clear canvas
canvasContext.fillStyle = 'rgb(0, 0, 0)';
canvasContext.fillRect(0, 0, elements.visualizerCanvas.width, elements.visualizerCanvas.height);
// Draw gradient bars
const width = elements.visualizerCanvas.width;
const height = elements.visualizerCanvas.height;
const barCount = Math.min(visualizerBufferLength, 64);
const barWidth = width / barCount - 1;
for (let i = 0; i < barCount; i++) {
const index = Math.floor(i * visualizerBufferLength / barCount);
const value = visualizerDataArray[index];
// Use logarithmic scale for better audio visualization
// This makes low values more visible while still maintaining full range
const logFactor = 20;
const scaledValue = Math.log(1 + (value / 255) * logFactor) / Math.log(1 + logFactor);
const barHeight = scaledValue * height;
// Position bars
const x = i * (barWidth + 1);
const y = height - barHeight;
// Create color gradient based on frequency and amplitude
const hue = i / barCount * 360; // Full color spectrum
const saturation = 80 + (value / 255 * 20); // Higher values more saturated
const lightness = 40 + (value / 255 * 20); // Dynamic brightness based on amplitude
// Draw main bar
canvasContext.fillStyle = `hsl(${hue}, ${saturation}%, ${lightness}%)`;
canvasContext.fillRect(x, y, barWidth, barHeight);
// Add reflection effect
if (barHeight > 5) {
const gradient = canvasContext.createLinearGradient(
x, y,
x, y + barHeight * 0.5
);
gradient.addColorStop(0, `hsla(${hue}, ${saturation}%, ${lightness + 20}%, 0.4)`);
gradient.addColorStop(1, `hsla(${hue}, ${saturation}%, ${lightness}%, 0)`);
canvasContext.fillStyle = gradient;
canvasContext.fillRect(x, y, barWidth, barHeight * 0.5);
// Add highlight on top of the bar for better 3D effect
canvasContext.fillStyle = `hsla(${hue}, ${saturation - 20}%, ${lightness + 30}%, 0.7)`;
canvasContext.fillRect(x, y, barWidth, 2);
}
}
// Show/hide the label
elements.visualizerLabel.style.opacity = (state.isStreaming) ? '0' : '0.7';
}
// Toggle visualizer visibility
function toggleVisualizerVisibility() {
const isVisible = elements.showVisualizer.checked;
elements.visualizerCanvas.style.opacity = isVisible ? '1' : '0';
if (isVisible && state.isStreaming && !state.visualizerAnimationFrame) {
drawVisualizer();
}
}
// Handle audio response from server
function handleAudioResponse(data) {
console.log('Received audio response');
// Create message container
const messageElement = document.createElement('div');
messageElement.className = 'message ai';
// Add text content if available
if (data.text) {
const textElement = document.createElement('p');
textElement.textContent = data.text;
messageElement.appendChild(textElement);
}
// Create and configure audio element
const audioElement = document.createElement('audio');
audioElement.controls = true;
audioElement.className = 'audio-player';
// Set audio source
const audioSource = document.createElement('source');
audioSource.src = data.audio;
audioSource.type = 'audio/wav';
// Add fallback text
audioElement.textContent = 'Your browser does not support the audio element.';
// Assemble audio element
audioElement.appendChild(audioSource);
messageElement.appendChild(audioElement);
// Add timestamp
const timeElement = document.createElement('span');
timeElement.className = 'message-time';
timeElement.textContent = new Date().toLocaleTimeString();
messageElement.appendChild(timeElement);
// Add to conversation
elements.conversation.appendChild(messageElement);
// Auto-scroll to bottom
elements.conversation.scrollTop = elements.conversation.scrollHeight;
// Auto-play if enabled
if (elements.autoPlayResponses.checked) {
audioElement.play()
.catch(err => {
console.warn('Auto-play failed:', err);
addSystemMessage('Auto-play failed. Please click play to hear the response.');
});
}
// Re-enable stream button after processing is complete
if (state.isStreaming) {
elements.streamButton.innerHTML = '<i class="fas fa-microphone"></i> Listening...';
elements.streamButton.classList.add('recording');
elements.streamButton.classList.remove('processing');
}
}
// Handle transcription response from server
function handleTranscription(data) {
console.log('Received transcription:', data.text);
// Create message element
const messageElement = document.createElement('div');
messageElement.className = 'message user';
// Add text content
const textElement = document.createElement('p');
textElement.textContent = data.text;
messageElement.appendChild(textElement);
// Add timestamp
const timeElement = document.createElement('span');
timeElement.className = 'message-time';
timeElement.textContent = new Date().toLocaleTimeString();
messageElement.appendChild(timeElement);
// Add to conversation
elements.conversation.appendChild(messageElement);
// Auto-scroll to bottom
elements.conversation.scrollTop = elements.conversation.scrollHeight;
}
// Handle context update from server
function handleContextUpdate(data) {
console.log('Context updated:', data.message);
}
// Handle streaming status updates from server
function handleStreamingStatus(data) {
console.log('Streaming status:', data.status);
if (data.status === 'stopped') {
// Reset UI if needed
if (state.isStreaming) {
stopStreaming(false); // Don't send to server since this came from server
}
}
}
// Add a system message to the conversation
function addSystemMessage(message) {
const messageElement = document.createElement('div');
messageElement.className = 'message system';
messageElement.textContent = message;
elements.conversation.appendChild(messageElement);
// Auto-scroll to bottom
elements.conversation.scrollTop = elements.conversation.scrollHeight;
}
// Downsample audio buffer to target sample rate
function downsampleBuffer(buffer, originalSampleRate, targetSampleRate) {
if (originalSampleRate === targetSampleRate) {
return buffer;
}
const ratio = originalSampleRate / targetSampleRate;
const newLength = Math.round(buffer.length / ratio);
const result = new Float32Array(newLength);
for (let i = 0; i < newLength; i++) {
const pos = Math.round(i * ratio);
result[i] = buffer[pos];
}
return result;
}
// Initialize the application when DOM is fully loaded
document.addEventListener('DOMContentLoaded', initializeApp);