From fdb92ff0613a65ff8d6a4d180ad317a51a1f81f9 Mon Sep 17 00:00:00 2001
From: GamerBoss101
Date: Sun, 30 Mar 2025 03:03:14 -0400
Subject: [PATCH 1/3] Demo Fixes 6

---
 Backend/server.py | 67 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 43 insertions(+), 24 deletions(-)

diff --git a/Backend/server.py b/Backend/server.py
index c4d38e0..563534c 100644
--- a/Backend/server.py
+++ b/Backend/server.py
@@ -112,6 +112,15 @@ def load_models():
             torch_dtype=torch.bfloat16
         )
         models.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
+
+        # Configure all special tokens
+        models.tokenizer.pad_token = models.tokenizer.eos_token
+        models.tokenizer.padding_side = "left"  # For causal language modeling
+
+        # Inform the model about the pad token
+        if hasattr(models.llm.config, "pad_token_id") and models.llm.config.pad_token_id is None:
+            models.llm.config.pad_token_id = models.tokenizer.pad_token_id
+
         logger.info("Llama 3.2 model loaded successfully")
         socketio.emit('model_status', {'model': 'llm', 'status': 'loaded'})
         progress = 100
@@ -392,31 +401,41 @@ def process_audio_and_respond(session_id, data):
         prompt = f"{conversation_history}Assistant: "
 
         # Generate response with Llama
-        input_tokens = models.tokenizer(
-            prompt,
-            return_tensors="pt",
-            padding=True,
-            return_attention_mask=True
-        )
-        input_ids = input_tokens.input_ids.to(DEVICE)
-        attention_mask = input_tokens.attention_mask.to(DEVICE)
-
-        with torch.no_grad():
-            generated_ids = models.llm.generate(
-                input_ids,
-                attention_mask=attention_mask,
-                max_new_tokens=100,
-                temperature=0.7,
-                top_p=0.9,
-                do_sample=True,
-                pad_token_id=models.tokenizer.eos_token_id
-            )
-
-        # Decode the response
-        response_text = models.tokenizer.decode(
-            generated_ids[0][input_ids.shape[1]:],
-            skip_special_tokens=True
-        ).strip()
+        try:
+            # Ensure pad token is set
+            if models.tokenizer.pad_token is None:
+                models.tokenizer.pad_token = models.tokenizer.eos_token
+
+            input_tokens = models.tokenizer(
+                prompt,
+                return_tensors="pt",
+                padding=True,
+                return_attention_mask=True
+            )
+            input_ids = input_tokens.input_ids.to(DEVICE)
+            attention_mask = input_tokens.attention_mask.to(DEVICE)
+
+            with torch.no_grad():
+                generated_ids = models.llm.generate(
+                    input_ids,
+                    attention_mask=attention_mask,
+                    max_new_tokens=100,
+                    temperature=0.7,
+                    top_p=0.9,
+                    do_sample=True,
+                    pad_token_id=models.tokenizer.eos_token_id
+                )
+
+            # Decode the response
+            response_text = models.tokenizer.decode(
+                generated_ids[0][input_ids.shape[1]:],
+                skip_special_tokens=True
+            ).strip()
+        except Exception as e:
+            logger.error(f"Error generating response: {str(e)}")
+            import traceback
+            logger.error(traceback.format_exc())
+            response_text = "I'm sorry, I encountered an error while processing your request."
 
         # Synthesize speech
         with app.app_context():
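Note on the pad-token changes above: Llama tokenizers ship without a pad token, so any call that pads a batch raises an error, and right padding would leave pad tokens between the prompt and the generated continuation. Left padding keeps the prompt flush against the first generated token. A minimal standalone sketch of the same setup (illustrative only; assumes torch, transformers, and access to meta-llama/Llama-3.2-1B):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.2-1B", torch_dtype=torch.bfloat16
    )

    # Llama has no pad token by default; reuse EOS and pad on the left so
    # every prompt ends exactly where generation begins.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    model.config.pad_token_id = tokenizer.pad_token_id

    batch = tokenizer(["Hello", "A longer prompt"], return_tensors="pt", padding=True)
    out = model.generate(**batch, max_new_tokens=20, pad_token_id=tokenizer.pad_token_id)
    # Slice off the prompt tokens, exactly as the patch does with input_ids.shape[1]
    print(tokenizer.batch_decode(out[:, batch.input_ids.shape[1]:], skip_special_tokens=True))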
From 284dd509727b082396404e1d24443d8581195420 Mon Sep 17 00:00:00 2001
From: GamerBoss101
Date: Sun, 30 Mar 2025 03:09:57 -0400
Subject: [PATCH 2/3] Demo Fixes 7

---
 Backend/server.py     | 89 ++++++++++++++++++++++++++++++++++---------
 Backend/voice-chat.js | 14 ++++++-
 2 files changed, 83 insertions(+), 20 deletions(-)

diff --git a/Backend/server.py b/Backend/server.py
index 563534c..8145ab0 100644
--- a/Backend/server.py
+++ b/Backend/server.py
@@ -25,6 +25,10 @@ import whisperx
 from generator import load_csm_1b, Segment
 from dataclasses import dataclass
 
+# Add these imports at the top
+import psutil
+import gc
+
 # Configure logging
 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
@@ -68,13 +72,13 @@ def load_models():
         socketio.emit('model_status', {'model': 'overall', 'status': 'loading', 'progress': 0})
 
-    logger.info("Loading CSM 1B model...")
+    # CSM 1B loading
     try:
+        socketio.emit('model_status', {'model': 'overall', 'status': 'loading', 'progress': 10, 'message': 'Loading CSM voice model'})
         models.generator = load_csm_1b(device=DEVICE)
         logger.info("CSM 1B model loaded successfully")
         socketio.emit('model_status', {'model': 'csm', 'status': 'loaded'})
-        progress = 33
-        socketio.emit('model_status', {'model': 'overall', 'status': 'loading', 'progress': progress})
+        socketio.emit('model_status', {'model': 'overall', 'status': 'loading', 'progress': 33})
         if DEVICE == "cuda":
             torch.cuda.empty_cache()
     except Exception as e:
@@ -83,8 +87,9 @@ def load_models():
         logger.error(f"Error loading CSM 1B model: {str(e)}\n{error_details}")
         socketio.emit('model_status', {'model': 'csm', 'status': 'error', 'message': str(e)})
 
-    logger.info("Loading Whisper ASR model...")
+    # Whisper loading
     try:
+        socketio.emit('model_status', {'model': 'overall', 'status': 'loading', 'progress': 40, 'message': 'Loading speech recognition model'})
         # Use regular Whisper instead of WhisperX to avoid compatibility issues
         from transformers import WhisperProcessor, WhisperForConditionalGeneration
@@ -96,16 +101,16 @@ def load_models():
         logger.info("Whisper ASR model loaded successfully")
         socketio.emit('model_status', {'model': 'asr', 'status': 'loaded'})
-        progress = 66
-        socketio.emit('model_status', {'model': 'overall', 'status': 'loading', 'progress': progress})
+        socketio.emit('model_status', {'model': 'overall', 'status': 'loading', 'progress': 66})
         if DEVICE == "cuda":
             torch.cuda.empty_cache()
     except Exception as e:
         logger.error(f"Error loading ASR model: {str(e)}")
         socketio.emit('model_status', {'model': 'asr', 'status': 'error', 'message': str(e)})
 
-    logger.info("Loading Llama 3.2 model...")
+    # Llama loading
     try:
+        socketio.emit('model_status', {'model': 'overall', 'status': 'loading', 'progress': 70, 'message': 'Loading language model'})
         models.llm = AutoModelForCausalLM.from_pretrained(
             "meta-llama/Llama-3.2-1B",
             device_map=DEVICE,
             torch_dtype=torch.bfloat16
@@ -123,8 +128,8 @@ def load_models():
 
         logger.info("Llama 3.2 model loaded successfully")
         socketio.emit('model_status', {'model': 'llm', 'status': 'loaded'})
-        progress = 100
-        socketio.emit('model_status', {'model': 'overall', 'status': 'loaded', 'progress': progress})
+        socketio.emit('model_status', {'model': 'overall', 'status': 'loading', 'progress': 100, 'message': 'All models loaded successfully'})
+        socketio.emit('model_status', {'model': 'overall', 'status': 'loaded'})
     except Exception as e:
         logger.error(f"Error loading Llama 3.2 model: {str(e)}")
         socketio.emit('model_status',
                       {'model': 'llm', 'status': 'error', 'message': str(e)})
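Note on the progress reporting above: each loader stage now emits its own milestone and catches its own exceptions, so one failing model no longer aborts the rest of the startup. One way to factor that repeated pattern; a hedged sketch, not the server's actual code, with stage names and milestone numbers mirroring the patch but the helper itself invented for illustration:

    import logging

    from flask import Flask
    from flask_socketio import SocketIO

    app = Flask(__name__)
    socketio = SocketIO(app, cors_allowed_origins="*")
    logger = logging.getLogger(__name__)

    def load_stage(name, progress, message, loader):
        """Emit a milestone, run one model loader, and isolate its failure."""
        socketio.emit('model_status', {'model': 'overall', 'status': 'loading',
                                       'progress': progress, 'message': message})
        try:
            loader()
            socketio.emit('model_status', {'model': name, 'status': 'loaded'})
        except Exception as e:
            logger.error(f"Error loading {name}: {e}")
            socketio.emit('model_status', {'model': name, 'status': 'error',
                                           'message': str(e)})

    # Usage mirroring the patch (the loader callables are placeholders):
    # load_stage('csm', 10, 'Loading CSM voice model', load_csm)
    # load_stage('asr', 40, 'Loading speech recognition model', load_asr)
    # load_stage('llm', 70, 'Loading language model', load_llm)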
@@ -184,6 +189,39 @@ def system_status():
         }
     })
 
+# Add a new endpoint to check system resources
+@app.route('/api/system_resources')
+def system_resources():
+    # Get CPU usage
+    cpu_percent = psutil.cpu_percent(interval=0.1)
+
+    # Get memory usage
+    memory = psutil.virtual_memory()
+    memory_used_gb = memory.used / (1024 ** 3)
+    memory_total_gb = memory.total / (1024 ** 3)
+    memory_percent = memory.percent
+
+    # Get GPU memory if available
+    gpu_memory = {}
+    if torch.cuda.is_available():
+        for i in range(torch.cuda.device_count()):
+            gpu_memory[f"gpu_{i}"] = {
+                "allocated": torch.cuda.memory_allocated(i) / (1024 ** 3),
+                "reserved": torch.cuda.memory_reserved(i) / (1024 ** 3),
+                "max_allocated": torch.cuda.max_memory_allocated(i) / (1024 ** 3)
+            }
+
+    return jsonify({
+        "cpu_percent": cpu_percent,
+        "memory": {
+            "used_gb": memory_used_gb,
+            "total_gb": memory_total_gb,
+            "percent": memory_percent
+        },
+        "gpu_memory": gpu_memory,
+        "active_sessions": len(active_conversations)
+    })
+
 # Socket event handlers
 @socketio.on('connect')
 def handle_connect(auth=None):
@@ -331,18 +369,33 @@ def process_audio_and_respond(session_id, data):
         speech_array, sampling_rate = librosa.load(temp_path, sr=16000)
 
         # Convert to required format
-        input_features = models.asr_processor(
+        processor_output = models.asr_processor(
             speech_array,
             sampling_rate=sampling_rate,
-            return_tensors="pt"
-        ).input_features.to(DEVICE)
-
-        # Generate token ids
-        predicted_ids = models.asr_model.generate(
-            input_features,
-            language="en",
-            task="transcribe"
+            return_tensors="pt",
+            padding=True,  # Add padding
+            return_attention_mask=True  # Request attention mask
         )
+        input_features = processor_output.input_features.to(DEVICE)
+        attention_mask = processor_output.get('attention_mask', None)
+
+        if attention_mask is not None:
+            attention_mask = attention_mask.to(DEVICE)
+
+            # Generate token ids with attention mask
+            predicted_ids = models.asr_model.generate(
+                input_features,
+                attention_mask=attention_mask,
+                language="en",
+                task="transcribe"
+            )
+        else:
+            # Fallback if attention mask is not available
+            predicted_ids = models.asr_model.generate(
+                input_features,
+                language="en",
+                task="transcribe"
+            )
 
         # Decode the predicted ids to text
         user_text = models.asr_processor.batch_decode(
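Note on /api/system_resources: the endpoint returns a plain JSON snapshot, so it can be polled from any HTTP client while a demo runs. A hypothetical poller, not part of the patch; it assumes the requests package and a server on localhost:5000, and reads the exact field names the endpoint emits:

    import time

    import requests

    BASE = "http://localhost:5000"  # assumed dev address

    while True:
        stats = requests.get(f"{BASE}/api/system_resources", timeout=5).json()
        gpu = stats.get("gpu_memory", {})
        print(f"CPU {stats['cpu_percent']:.0f}% | "
              f"RAM {stats['memory']['used_gb']:.1f}/{stats['memory']['total_gb']:.1f} GB | "
              f"sessions {stats['active_sessions']} | "
              + " ".join(f"{k}: {v['allocated']:.1f} GB" for k, v in gpu.items()))
        time.sleep(10)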
diff --git a/Backend/voice-chat.js b/Backend/voice-chat.js
index 2efd76d..dc2db04 100644
--- a/Backend/voice-chat.js
+++ b/Backend/voice-chat.js
@@ -43,7 +43,9 @@ const state = {
     volumeUpdateInterval: null,
     visualizerAnimationFrame: null,
     currentSpeaker: 0,
-    aiSpeakerId: 1 // Define the AI's speaker ID to match server.py
+    aiSpeakerId: 1, // Define the AI's speaker ID to match server.py
+    transcriptionRetries: 0,
+    maxTranscriptionRetries: 3
 };
 
 // Visualizer variables
@@ -429,7 +431,15 @@ function handleSpeechState(isSilent) {
 
     if (!hasAudioContent) {
         console.warn('Audio buffer appears to be empty or very quiet');
-        addSystemMessage('No speech detected. Please try again and speak clearly.');
+
+        if (state.transcriptionRetries < state.maxTranscriptionRetries) {
+            state.transcriptionRetries++;
+            const retryMessage = `No speech detected (attempt ${state.transcriptionRetries}/${state.maxTranscriptionRetries}). Please speak louder and try again.`;
+            addSystemMessage(retryMessage);
+        } else {
+            state.transcriptionRetries = 0;
+            addSystemMessage('Multiple attempts failed to detect speech. Please check your microphone and try again.');
+        }
         return;
     }
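Note on the retry logic above: the client counts consecutive empty buffers and resets after maxTranscriptionRetries, so a user with a muted microphone gets one actionable message instead of an endless loop. The server could apply a similar guard before spending GPU time on transcription; a hypothetical sketch, not in the patch, using an RMS threshold over float PCM:

    import numpy as np

    def has_speech(audio: np.ndarray, rms_threshold: float = 0.01) -> bool:
        """Rough server-side counterpart to the client's hasAudioContent check.

        audio: float32 PCM in [-1, 1]. The threshold is an illustrative guess
        and would need tuning against real microphone input.
        """
        if audio.size == 0:
            return False
        rms = float(np.sqrt(np.mean(np.square(audio))))
        return rms >= rms_threshold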
From c8551f90b361c8aa73392670a9a8259268b71 Mon Sep 17 00:00:00 2001
From: GamerBoss101
Date: Sun, 30 Mar 2025 03:19:23 -0400
Subject: [PATCH 3/3] Demo Fixes 8

---
 Backend/server.py | 131 ++++++++++++++++++++++++++++++------------------
 1 file changed, 81 insertions(+), 50 deletions(-)

diff --git a/Backend/server.py b/Backend/server.py
index 8145ab0..e912a9d 100644
--- a/Backend/server.py
+++ b/Backend/server.py
@@ -60,8 +60,11 @@ class AppModels:
     generator = None
     tokenizer = None
     llm = None
-    asr_model = None
-    asr_processor = None
+    whisperx_model = None
+    whisperx_align_model = None
+    whisperx_align_metadata = None
+    diarize_model = None
+    last_language = None
 
 # Initialize the models object
 models = AppModels()
@@ -87,25 +90,27 @@ def load_models():
         logger.error(f"Error loading CSM 1B model: {str(e)}\n{error_details}")
         socketio.emit('model_status', {'model': 'csm', 'status': 'error', 'message': str(e)})
 
-    # Whisper loading
+    # WhisperX loading
     try:
         socketio.emit('model_status', {'model': 'overall', 'status': 'loading', 'progress': 40, 'message': 'Loading speech recognition model'})
-        # Use regular Whisper instead of WhisperX to avoid compatibility issues
-        from transformers import WhisperProcessor, WhisperForConditionalGeneration
+        # Use WhisperX for better transcription with timestamps
+        import whisperx
 
-        # Use a smaller model for faster processing
-        model_id = "openai/whisper-small"
+        # Use compute_type based on device
+        compute_type = "float16" if DEVICE == "cuda" else "float32"
 
-        models.asr_processor = WhisperProcessor.from_pretrained(model_id)
-        models.asr_model = WhisperForConditionalGeneration.from_pretrained(model_id).to(DEVICE)
+        # Load the WhisperX model (smaller model for faster processing)
+        models.whisperx_model = whisperx.load_model("small", DEVICE, compute_type=compute_type)
 
-        logger.info("Whisper ASR model loaded successfully")
+        logger.info("WhisperX model loaded successfully")
         socketio.emit('model_status', {'model': 'asr', 'status': 'loaded'})
         socketio.emit('model_status', {'model': 'overall', 'status': 'loading', 'progress': 66})
         if DEVICE == "cuda":
             torch.cuda.empty_cache()
     except Exception as e:
-        logger.error(f"Error loading ASR model: {str(e)}")
+        import traceback
+        error_details = traceback.format_exc()
+        logger.error(f"Error loading WhisperX model: {str(e)}\n{error_details}")
         socketio.emit('model_status', {'model': 'asr', 'status': 'error', 'message': str(e)})
 
     # Llama loading
@@ -184,7 +189,7 @@ def system_status():
         "device": DEVICE,
         "models": {
             "generator": models.generator is not None,
-            "asr": models.asr_model is not None,
+            "asr": models.whisperx_model is not None,  # Use the correct model name
             "llm": models.llm is not None
         }
     })
@@ -327,8 +332,8 @@ def process_audio_queue(session_id, q):
             del user_queues[session_id]
 
 def process_audio_and_respond(session_id, data):
-    """Process audio data and generate a response using standard Whisper"""
-    if models.generator is None or models.asr_model is None or models.llm is None:
+    """Process audio data and generate a response using WhisperX"""
+    if models.generator is None or models.whisperx_model is None or models.llm is None:
         logger.warning("Models not yet loaded!")
         with app.app_context():
             socketio.emit('error', {'message': 'Models still loading, please wait'}, room=session_id)
@@ -364,44 +369,69 @@ def process_audio_and_respond(session_id, data):
         with app.app_context():
             socketio.emit('processing_status', {'status': 'transcribing'},
                           room=session_id)
 
-        # Load audio for ASR processing
-        import librosa
-        speech_array, sampling_rate = librosa.load(temp_path, sr=16000)
+        # Load audio using WhisperX
+        import whisperx
+        audio = whisperx.load_audio(temp_path)
 
-        # Convert to required format
-        processor_output = models.asr_processor(
-            speech_array,
-            sampling_rate=sampling_rate,
-            return_tensors="pt",
-            padding=True,  # Add padding
-            return_attention_mask=True  # Request attention mask
-        )
-        input_features = processor_output.input_features.to(DEVICE)
-        attention_mask = processor_output.get('attention_mask', None)
-
-        if attention_mask is not None:
-            attention_mask = attention_mask.to(DEVICE)
+        # Check audio length and add a warning for short clips
+        audio_length = len(audio) / 16000  # assuming 16kHz sample rate
+        if audio_length < 1.0:
+            logger.warning(f"Audio is very short ({audio_length:.2f}s), may affect transcription quality")
+
+        # Transcribe using WhisperX
+        batch_size = 16  # adjust based on your GPU memory
+        logger.info("Running WhisperX transcription...")
+
+        # Handle the warning about audio being shorter than 30s by suppressing it
+        import warnings
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message="audio is shorter than 30s")
+            result = models.whisperx_model.transcribe(audio, batch_size=batch_size)
+
+        # Get the detected language
+        language_code = result["language"]
+        logger.info(f"Detected language: {language_code}")
+
+        # Check if alignment model needs to be loaded or updated
+        if models.whisperx_align_model is None or language_code != models.last_language:
+            # Clean up old models if they exist
+            if models.whisperx_align_model is not None:
+                del models.whisperx_align_model
+                del models.whisperx_align_metadata
+                if DEVICE == "cuda":
+                    gc.collect()
+                    torch.cuda.empty_cache()
 
-            # Generate token ids with attention mask
-            predicted_ids = models.asr_model.generate(
-                input_features,
-                attention_mask=attention_mask,
-                language="en",
-                task="transcribe"
-            )
-        else:
-            # Fallback if attention mask is not available
-            predicted_ids = models.asr_model.generate(
-                input_features,
-                language="en",
-                task="transcribe"
+            # Load new alignment model for the detected language
+            logger.info(f"Loading alignment model for language: {language_code}")
+            models.whisperx_align_model, models.whisperx_align_metadata = whisperx.load_align_model(
+                language_code=language_code, device=DEVICE
             )
+            models.last_language = language_code
 
-        # Decode the predicted ids to text
-        user_text = models.asr_processor.batch_decode(
-            predicted_ids,
-            skip_special_tokens=True
-        )[0]
+        # Align the transcript to get word-level timestamps
+        if result["segments"] and len(result["segments"]) > 0:
+            logger.info("Aligning transcript...")
+            result = whisperx.align(
+                result["segments"],
+                models.whisperx_align_model,
+                models.whisperx_align_metadata,
+                audio,
+                DEVICE,
+                return_char_alignments=False
+            )
+
+        # Process the segments for better output
+        for segment in result["segments"]:
+            # Round timestamps for better display
+            segment["start"] = round(segment["start"], 2)
+            segment["end"] = round(segment["end"], 2)
+            # Add a confidence score if not present
+            if "confidence" not in segment:
+                segment["confidence"] = 1.0  # Default confidence
+
+        # Extract the full text from all segments
+        user_text = ' '.join([segment['text'] for segment in result['segments']])
 
         # If no text was recognized, don't process further
         if not user_text or len(user_text.strip()) == 0:
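Note on the WhisperX pipeline above: transcription and word-level alignment are two separate passes, and the alignment model is language specific, which is why the patch caches it on AppModels and reloads it only when the detected language changes. A condensed standalone sketch of the same flow; the file name, device, and batch size are assumptions, not values from the patch:

    import whisperx

    device = "cuda"  # assumed; the server derives this from DEVICE
    model = whisperx.load_model("small", device, compute_type="float16")

    audio = whisperx.load_audio("sample.wav")  # hypothetical input file
    result = model.transcribe(audio, batch_size=16)
    print(result["language"])

    # Second pass: a language-specific alignment model adds word-level timestamps.
    align_model, metadata = whisperx.load_align_model(
        language_code=result["language"], device=device
    )
    aligned = whisperx.align(result["segments"], align_model, metadata, audio, device,
                             return_char_alignments=False)
    for seg in aligned["segments"]:
        print(f"[{seg['start']:.2f}-{seg['end']:.2f}] {seg['text']}")

Caching the alignment model per language avoids reloading it on every utterance while still handling a speaker who switches languages mid-session.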
@@ -433,11 +463,12 @@ def process_audio_and_respond(session_id, data):
                 audio=waveform.squeeze()
             )
 
-        # Send transcription to client
+        # Send transcription to client with detailed segments
         with app.app_context():
             socketio.emit('transcription', {
                 'text': user_text,
-                'speaker': speaker_id
+                'speaker': speaker_id,
+                'segments': result['segments']  # Include the detailed segments with timestamps
            }, room=session_id)
 
         # Generate AI response using Llama
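Note on the transcription event above: the payload now carries the aligned segments alongside the flat text, so a client can render per-word or per-segment timing. A minimal consumer sketch; it assumes the python-socketio client package and a server on localhost:5000, and reads only the fields the emit above actually sends:

    import socketio

    sio = socketio.Client()

    @sio.on('transcription')
    def on_transcription(payload):
        # Payload shape mirrors the emit above: text, speaker, timestamped segments.
        print(f"speaker {payload['speaker']}: {payload['text']}")
        for seg in payload.get('segments', []):
            print(f"  [{seg['start']:.2f}-{seg['end']:.2f}] {seg['text']}")

    sio.connect('http://localhost:5000')  # assumed dev address
    sio.wait()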