From a4f282fbcca0332ce5afcc1048ef3b91bc8d3467 Mon Sep 17 00:00:00 2001
From: GamerBoss101
Date: Sun, 30 Mar 2025 07:18:14 -0400
Subject: [PATCH] Demo Update 21

---
 Backend/index.html | 346 ++++++++++++++++++++++++++++++++++++++-------
 Backend/req.txt    |   1 +
 Backend/server.py  | 256 +++++++++++++++++++++++++++++++++
 3 files changed, 552 insertions(+), 51 deletions(-)
 create mode 100644 Backend/req.txt
 create mode 100644 Backend/server.py

diff --git a/Backend/index.html b/Backend/index.html
index e1f5f94..359ed41 100644
--- a/Backend/index.html
+++ b/Backend/index.html
@@ -1,86 +1,225 @@
[The markup in this hunk was lost when the patch was captured; only fragments of text content survive. They show the page title changing from "Audio Conversation Bot" to "Voice Assistant - CSM & Whisper", the main heading becoming "Voice Assistant with CSM & Whisper", and the connection-status text changing from "Not connected" to "Connecting to server...".]
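In lieu of the lost client code, the sketch below is a rough Python stand-in (not the actual index.html) that drives the same Socket.IO events the new server.py handles. It assumes the server from this patch is running on localhost:5000, that audio is sent as base64-encoded float32 PCM at 44.1 kHz (matching the server's np.frombuffer and torchaudio.save calls), and that the python-socketio client package is available; the 440 Hz test tone and the bot_reply.wav filename are placeholders.

# Illustrative client sketch; the real client is the index.html above.
# Requires: pip install "python-socketio[client]" numpy
import base64
import time

import numpy as np
import socketio

SAMPLE_RATE = 44100  # the server assumes 44.1 kHz when it writes the temp WAV

sio = socketio.Client()


@sio.on('ready')
def on_ready(data):
    print("Server ready:", data['message'])


@sio.on('transcription')
def on_transcription(data):
    print("Transcribed:", data['text'])


@sio.on('audio_response')
def on_audio_response(data):
    # The server sends a base64-encoded WAV file plus the bot's text.
    with open("bot_reply.wav", "wb") as f:
        f.write(base64.b64decode(data['audio']))
    print("Bot said:", data['text'])


def encode_chunk(samples):
    # Base64-encode float32 samples the way handle_audio_chunk expects them.
    return base64.b64encode(samples.astype(np.float32).tobytes()).decode('utf-8')


if __name__ == '__main__':
    sio.connect('http://localhost:5000')
    sio.emit('start_speaking')

    # One second of a 440 Hz tone, split into ten chunks, stands in for microphone audio.
    t = np.linspace(0, 1, SAMPLE_RATE, endpoint=False, dtype=np.float32)
    tone = 0.2 * np.sin(2 * np.pi * 440.0 * t)
    for chunk in np.array_split(tone, 10):
        sio.emit('audio_chunk', {'audio': encode_chunk(chunk)})
        time.sleep(0.1)

    sio.emit('stop_speaking')
    time.sleep(30)  # crude wait for the transcription and generated audio to arrive
    sio.disconnect()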
diff --git a/Backend/req.txt b/Backend/req.txt
new file mode 100644
index 0000000..a3edbdc
--- /dev/null
+++ b/Backend/req.txt
@@ -0,0 +1 @@
+pip install faster-whisper
\ No newline at end of file
diff --git a/Backend/server.py b/Backend/server.py
new file mode 100644
index 0000000..978b87c
--- /dev/null
+++ b/Backend/server.py
@@ -0,0 +1,256 @@
+import os
+import io
+import base64
+import time
+import torch
+import torchaudio
+import numpy as np
+from flask import Flask, render_template, request
+from flask_socketio import SocketIO, emit
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from faster_whisper import WhisperModel
+from generator import load_csm_1b, Segment
+from collections import deque
+
+app = Flask(__name__)
+app.config['SECRET_KEY'] = 'your-secret-key'
+socketio = SocketIO(app, cors_allowed_origins="*")
+
+# Select the best available device
+if torch.cuda.is_available():
+    device = "cuda"
+    whisper_compute_type = "float16"
+elif torch.backends.mps.is_available():
+    device = "mps"
+    whisper_compute_type = "float32"
+else:
+    device = "cpu"
+    whisper_compute_type = "int8"
+
+print(f"Using device: {device}")
+
+# Initialize Faster-Whisper for transcription
+print("Loading Whisper model...")
+whisper_model = WhisperModel("base", device=device, compute_type=whisper_compute_type)
+
+# Initialize CSM model for audio generation
+print("Loading CSM model...")
+csm_generator = load_csm_1b(device=device)
+
+# Initialize Llama 3.2 model for response generation
+print("Loading Llama 3.2 model...")
+llm_model_id = "meta-llama/Llama-3.2-1B"  # Choose appropriate size based on resources
+llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
+llm_model = AutoModelForCausalLM.from_pretrained(
+    llm_model_id,
+    torch_dtype=torch.bfloat16,
+    device_map=device
+)
+
+# Store conversation context
+conversation_context = {}  # session_id -> context
+
+@app.route('/')
+def index():
+    return render_template('index.html')
+
+@socketio.on('connect')
+def handle_connect():
+    print(f"Client connected: {request.sid}")
+    conversation_context[request.sid] = {
+        'segments': [],
+        'speakers': [0, 1],  # 0 = user, 1 = bot
+        'audio_buffer': deque(maxlen=10),  # Store recent audio chunks
+        'is_speaking': False,
+        'silence_start': None
+    }
+    emit('ready', {'message': 'Connection established'})
+
+@socketio.on('disconnect')
+def handle_disconnect():
+    print(f"Client disconnected: {request.sid}")
+    if request.sid in conversation_context:
+        del conversation_context[request.sid]
+
+@socketio.on('start_speaking')
+def handle_start_speaking():
+    if request.sid in conversation_context:
+        conversation_context[request.sid]['is_speaking'] = True
+        conversation_context[request.sid]['audio_buffer'].clear()
+        print(f"User {request.sid} started speaking")
+
+@socketio.on('audio_chunk')
+def handle_audio_chunk(data):
+    if request.sid not in conversation_context:
+        return
+
+    context = conversation_context[request.sid]
+
+    # Decode audio data
+    audio_data = base64.b64decode(data['audio'])
+    audio_numpy = np.frombuffer(audio_data, dtype=np.float32)
+    audio_tensor = torch.tensor(audio_numpy)
+
+    # Add to buffer
+    context['audio_buffer'].append(audio_tensor)
+
+    # Check for silence to detect end of speech
+    if context['is_speaking'] and is_silence(audio_tensor):
+        if context['silence_start'] is None:
+            context['silence_start'] = time.time()
+        elif time.time() - context['silence_start'] > 1.0:  # 1 second of silence
+            # Process the complete utterance
+            process_user_utterance(request.sid)
+    else:
+        context['silence_start'] = None
+
+@socketio.on('stop_speaking')
+def handle_stop_speaking():
+    if request.sid in conversation_context:
+        conversation_context[request.sid]['is_speaking'] = False
+        process_user_utterance(request.sid)
+        print(f"User {request.sid} stopped speaking")
+
+def is_silence(audio_tensor, threshold=0.02):
+    """Check if an audio chunk is silence based on amplitude threshold"""
+    return torch.mean(torch.abs(audio_tensor)) < threshold
+
+def process_user_utterance(session_id):
+    """Process completed user utterance, generate response and send audio back"""
+    context = conversation_context[session_id]
+
+    if not context['audio_buffer']:
+        return
+
+    # Combine audio chunks
+    full_audio = torch.cat(list(context['audio_buffer']), dim=0)
+    context['audio_buffer'].clear()
+    context['is_speaking'] = False
+    context['silence_start'] = None
+
+    # Save audio to temporary WAV file for Whisper transcription
+    temp_audio_path = f"temp_audio_{session_id}.wav"
+    torchaudio.save(
+        temp_audio_path,
+        full_audio.unsqueeze(0),
+        44100  # Assuming 44.1kHz from client
+    )
+
+    # Transcribe speech using Faster-Whisper
+    try:
+        segments, info = whisper_model.transcribe(temp_audio_path, beam_size=5)
+
+        # Collect all text from segments
+        user_text = ""
+        for segment in segments:
+            segment_text = segment.text.strip()
+            print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment_text}")
+            user_text += segment_text + " "
+
+        user_text = user_text.strip()
+
+        # Cleanup temp file
+        if os.path.exists(temp_audio_path):
+            os.remove(temp_audio_path)
+
+        if not user_text:
+            print("No speech detected.")
+            return
+
+        print(f"Transcribed: {user_text}")
+
+        # Add to conversation segments
+        user_segment = Segment(
+            text=user_text,
+            speaker=0,  # User is speaker 0
+            audio=full_audio
+        )
+        context['segments'].append(user_segment)
+
+        # Generate bot response
+        bot_response = generate_llm_response(user_text, context['segments'])
+        print(f"Bot response: {bot_response}")
+
+        # Convert to audio using CSM
+        bot_audio = generate_audio_response(bot_response, context['segments'])
+
+        # Convert audio to base64 for sending over websocket
+        audio_bytes = io.BytesIO()
+        torchaudio.save(audio_bytes, bot_audio.unsqueeze(0).cpu(), csm_generator.sample_rate, format="wav")
+        audio_bytes.seek(0)
+        audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8')
+
+        # Add bot response to conversation history
+        bot_segment = Segment(
+            text=bot_response,
+            speaker=1,  # Bot is speaker 1
+            audio=bot_audio
+        )
+        context['segments'].append(bot_segment)
+
+        # Send transcribed text to client
+        emit('transcription', {'text': user_text}, room=session_id)
+
+        # Send audio response to client
+        emit('audio_response', {
+            'audio': audio_b64,
+            'text': bot_response
+        }, room=session_id)
+
+    except Exception as e:
+        print(f"Error processing speech: {e}")
+        emit('error', {'message': f'Error processing speech: {str(e)}'}, room=session_id)
+        # Cleanup temp file in case of error
+        if os.path.exists(temp_audio_path):
+            os.remove(temp_audio_path)
+
+def generate_llm_response(user_text, conversation_segments):
+    """Generate text response using Llama 3.2"""
+    # Format conversation history for the LLM
+    conversation_history = ""
+    for segment in conversation_segments[-5:]:  # Use last 5 utterances for context
+        speaker_name = "User" if segment.speaker == 0 else "Assistant"
+        conversation_history += f"{speaker_name}: {segment.text}\n"
+
+    # Add the current user query
+    conversation_history += f"User: {user_text}\nAssistant:"
+
+    # Generate response
+    inputs = llm_tokenizer(conversation_history, return_tensors="pt").to(device)
+    output = llm_model.generate(
+        inputs.input_ids,
+        max_new_tokens=150,
+        temperature=0.7,
+        top_p=0.9,
+        do_sample=True
+    )
+
+    response = llm_tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+    return response.strip()
+
+def generate_audio_response(text, conversation_segments):
+    """Generate audio response using CSM"""
+    # Use the last few conversation segments as context
+    context_segments = conversation_segments[-4:] if len(conversation_segments) > 4 else conversation_segments
+
+    # Generate audio for bot response
+    audio = csm_generator.generate(
+        text=text,
+        speaker=1,  # Bot is speaker 1
+        context=context_segments,
+        max_audio_length_ms=10000,  # 10 seconds max
+        temperature=0.9,
+        topk=50
+    )
+
+    return audio
+
+if __name__ == '__main__':
+    # Ensure the existing index.html file is in the correct location
+    if not os.path.exists('templates'):
+        os.makedirs('templates')
+
+    if os.path.exists('index.html') and not os.path.exists('templates/index.html'):
+        os.rename('index.html', 'templates/index.html')
+
+    socketio.run(app, host='0.0.0.0', port=5000, debug=False)
\ No newline at end of file
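The generated speech reaches the client as a base64-encoded WAV blob inside the audio_response event. The snippet below is a small, model-free sanity check of that encoding path: the 24 kHz sample rate is a stand-in for csm_generator.sample_rate, the random tensor is a stand-in for CSM output, and torchaudio.load here is only a proxy for whatever the browser does with the decoded bytes.

# Round-trip check of the audio_response encoding used in process_user_utterance:
# tensor -> in-memory WAV -> base64 -> bytes -> tensor.
import base64
import io

import torch
import torchaudio

sample_rate = 24000  # placeholder; the server uses csm_generator.sample_rate

# Fake "bot audio" standing in for CSM output: half a second of low-level noise.
bot_audio = 0.01 * torch.randn(int(0.5 * sample_rate))

# Encode the same way the server does before emitting audio_response.
buf = io.BytesIO()
torchaudio.save(buf, bot_audio.unsqueeze(0), sample_rate, format="wav")
buf.seek(0)
audio_b64 = base64.b64encode(buf.read()).decode('utf-8')

# Decode the way a Python client could before playback or saving.
decoded, sr = torchaudio.load(io.BytesIO(base64.b64decode(audio_b64)))
print(decoded.shape, sr)  # expect a (1, samples) tensor at the original rate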