diff --git a/Backend/.gitignore b/Backend/.gitignore deleted file mode 100644 index 4b7fc9d..0000000 --- a/Backend/.gitignore +++ /dev/null @@ -1,46 +0,0 @@ -# Python -__pycache__/ -*.py[cod] -*$py.class -*.so -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg - -# Virtual Environment -.env -.venv -env/ -venv/ -ENV/ - -# IDE -.idea/ -.vscode/ -*.swp -*.swo - -# Project specific -.python-version -*.wav -output_*/ -basic_audio.wav -full_conversation.wav -context_audio.wav - -# Model files -*.pt -*.ckpt \ No newline at end of file diff --git a/Backend/README.md b/Backend/README.md deleted file mode 100644 index 44cab4d..0000000 --- a/Backend/README.md +++ /dev/null @@ -1,154 +0,0 @@ -# CSM - -**2025/03/13** - We are releasing the 1B CSM variant. The checkpoint is [hosted on Hugging Face](https://huggingface.co/sesame/csm_1b). - ---- - -CSM (Conversational Speech Model) is a speech generation model from [Sesame](https://www.sesame.com) that generates RVQ audio codes from text and audio inputs. The model architecture employs a [Llama](https://www.llama.com/) backbone and a smaller audio decoder that produces [Mimi](https://huggingface.co/kyutai/mimi) audio codes. - -A fine-tuned variant of CSM powers the [interactive voice demo](https://www.sesame.com/voicedemo) shown in our [blog post](https://www.sesame.com/research/crossing_the_uncanny_valley_of_voice). - -A hosted [Hugging Face space](https://huggingface.co/spaces/sesame/csm-1b) is also available for testing audio generation. - -## Requirements - -* A CUDA-compatible GPU -* The code has been tested on CUDA 12.4 and 12.6, but it may also work on other versions -* Similarly, Python 3.10 is recommended, but newer versions may be fine -* For some audio operations, `ffmpeg` may be required -* Access to the following Hugging Face models: - * [Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) - * [CSM-1B](https://huggingface.co/sesame/csm-1b) - -### Setup - -```bash -git clone git@github.com:SesameAILabs/csm.git -cd csm -python3.10 -m venv .venv -source .venv/bin/activate -pip install -r requirements.txt - -# Disable lazy compilation in Mimi -export NO_TORCH_COMPILE=1 - -# You will need access to CSM-1B and Llama-3.2-1B -huggingface-cli login -``` - -### Windows Setup - -The `triton` package cannot be installed in Windows. Instead use `pip install triton-windows`. - -## Quickstart - -This script will generate a conversation between 2 characters, using a prompt for each character. - -```bash -python run_csm.py -``` - -## Usage - -If you want to write your own applications with CSM, the following examples show basic usage. - -#### Generate a sentence - -This will use a random speaker identity, as no prompt or context is provided. - -```python -from generator import load_csm_1b -import torchaudio -import torch - -if torch.backends.mps.is_available(): - device = "mps" -elif torch.cuda.is_available(): - device = "cuda" -else: - device = "cpu" - -generator = load_csm_1b(device=device) - -audio = generator.generate( - text="Hello from Sesame.", - speaker=0, - context=[], - max_audio_length_ms=10_000, -) - -torchaudio.save("audio.wav", audio.unsqueeze(0).cpu(), generator.sample_rate) -``` - -#### Generate with context - -CSM sounds best when provided with context. You can prompt or provide context to the model using a `Segment` for each speaker's utterance. - -NOTE: The following example is instructional and the audio files do not exist. 
It is intended as an example for using context with CSM. - -```python -from generator import Segment - -speakers = [0, 1, 0, 0] -transcripts = [ - "Hey how are you doing.", - "Pretty good, pretty good.", - "I'm great.", - "So happy to be speaking to you.", -] -audio_paths = [ - "utterance_0.wav", - "utterance_1.wav", - "utterance_2.wav", - "utterance_3.wav", -] - -def load_audio(audio_path): - audio_tensor, sample_rate = torchaudio.load(audio_path) - audio_tensor = torchaudio.functional.resample( - audio_tensor.squeeze(0), orig_freq=sample_rate, new_freq=generator.sample_rate - ) - return audio_tensor - -segments = [ - Segment(text=transcript, speaker=speaker, audio=load_audio(audio_path)) - for transcript, speaker, audio_path in zip(transcripts, speakers, audio_paths) -] -audio = generator.generate( - text="Me too, this is some cool stuff huh?", - speaker=1, - context=segments, - max_audio_length_ms=10_000, -) - -torchaudio.save("audio.wav", audio.unsqueeze(0).cpu(), generator.sample_rate) -``` - -## FAQ - -**Does this model come with any voices?** - -The model open-sourced here is a base generation model. It is capable of producing a variety of voices, but it has not been fine-tuned on any specific voice. - -**Can I converse with the model?** - -CSM is trained to be an audio generation model and not a general-purpose multimodal LLM. It cannot generate text. We suggest using a separate LLM for text generation. - -**Does it support other languages?** - -The model has some capacity for non-English languages due to data contamination in the training data, but it likely won't do well. - -## Misuse and abuse ⚠️ - -This project provides a high-quality speech generation model for research and educational purposes. While we encourage responsible and ethical use, we **explicitly prohibit** the following: - -- **Impersonation or Fraud**: Do not use this model to generate speech that mimics real individuals without their explicit consent. -- **Misinformation or Deception**: Do not use this model to create deceptive or misleading content, such as fake news or fraudulent calls. -- **Illegal or Harmful Activities**: Do not use this model for any illegal, harmful, or malicious purposes. - -By using this model, you agree to comply with all applicable laws and ethical guidelines. We are **not responsible** for any misuse, and we strongly condemn unethical applications of this technology. - ---- - -## Authors -Johan Schalkwyk, Ankit Kumar, Dan Lyth, Sefik Emre Eskimez, Zack Hodari, Cinjon Resnick, Ramon Sanabria, Raven Jiang, and the Sesame team. 
diff --git a/Backend/app.py b/Backend/app.py new file mode 100644 index 0000000..091de8e --- /dev/null +++ b/Backend/app.py @@ -0,0 +1,229 @@ +import os +import io +import base64 +import time +import torch +import torchaudio +import numpy as np +from flask import Flask, render_template, request +from flask_socketio import SocketIO, emit +from transformers import AutoModelForCausalLM, AutoTokenizer +import speech_recognition as sr +from generator import load_csm_1b, Segment +from collections import deque + +app = Flask(__name__) +app.config['SECRET_KEY'] = 'your-secret-key' +socketio = SocketIO(app, cors_allowed_origins="*") + +# Select the best available device +if torch.cuda.is_available(): + device = "cuda" +elif torch.backends.mps.is_available(): + device = "mps" +else: + device = "cpu" +print(f"Using device: {device}") + +# Initialize CSM model for audio generation +print("Loading CSM model...") +csm_generator = load_csm_1b(device=device) + +# Initialize Llama 3.2 model for response generation +print("Loading Llama 3.2 model...") +llm_model_id = "meta-llama/Llama-3.2-1B" # Choose appropriate size based on resources +llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id) +llm_model = AutoModelForCausalLM.from_pretrained( + llm_model_id, + torch_dtype=torch.bfloat16, + device_map=device +) + +# Initialize speech recognition +recognizer = sr.Recognizer() + +# Store conversation context +conversation_context = {} # session_id -> context + +@app.route('/') +def index(): + return render_template('index.html') + +@socketio.on('connect') +def handle_connect(): + print(f"Client connected: {request.sid}") + conversation_context[request.sid] = { + 'segments': [], + 'speakers': [0, 1], # 0 = user, 1 = bot + 'audio_buffer': deque(maxlen=10), # Store recent audio chunks + 'is_speaking': False, + 'silence_start': None + } + emit('ready', {'message': 'Connection established'}) + +@socketio.on('disconnect') +def handle_disconnect(): + print(f"Client disconnected: {request.sid}") + if request.sid in conversation_context: + del conversation_context[request.sid] + +@socketio.on('start_speaking') +def handle_start_speaking(): + if request.sid in conversation_context: + conversation_context[request.sid]['is_speaking'] = True + conversation_context[request.sid]['audio_buffer'].clear() + print(f"User {request.sid} started speaking") + +@socketio.on('audio_chunk') +def handle_audio_chunk(data): + if request.sid not in conversation_context: + return + + context = conversation_context[request.sid] + + # Decode audio data + audio_data = base64.b64decode(data['audio']) + audio_numpy = np.frombuffer(audio_data, dtype=np.float32) + audio_tensor = torch.tensor(audio_numpy) + + # Add to buffer + context['audio_buffer'].append(audio_tensor) + + # Check for silence to detect end of speech + if context['is_speaking'] and is_silence(audio_tensor): + if context['silence_start'] is None: + context['silence_start'] = time.time() + elif time.time() - context['silence_start'] > 1.0: # 1 second of silence + # Process the complete utterance + process_user_utterance(request.sid) + else: + context['silence_start'] = None + +@socketio.on('stop_speaking') +def handle_stop_speaking(): + if request.sid in conversation_context: + conversation_context[request.sid]['is_speaking'] = False + process_user_utterance(request.sid) + print(f"User {request.sid} stopped speaking") + +def is_silence(audio_tensor, threshold=0.02): + """Check if an audio chunk is silence based on amplitude threshold""" + return 
torch.mean(torch.abs(audio_tensor)) < threshold + +def process_user_utterance(session_id): + """Process completed user utterance, generate response and send audio back""" + context = conversation_context[session_id] + + if not context['audio_buffer']: + return + + # Combine audio chunks + full_audio = torch.cat(list(context['audio_buffer']), dim=0) + context['audio_buffer'].clear() + context['is_speaking'] = False + context['silence_start'] = None + + # Convert audio to 16kHz for speech recognition + audio_16k = torchaudio.functional.resample( + full_audio, + orig_freq=44100, # Assuming 44.1kHz from client + new_freq=16000 + ) + + # Transcribe speech + try: + # Convert to wav format for speech_recognition + audio_data = io.BytesIO() + torchaudio.save(audio_data, audio_16k.unsqueeze(0), 16000, format="wav") + audio_data.seek(0) + + with sr.AudioFile(audio_data) as source: + audio = recognizer.record(source) + user_text = recognizer.recognize_google(audio) + print(f"Transcribed: {user_text}") + + # Add to conversation segments + user_segment = Segment( + text=user_text, + speaker=0, # User is speaker 0 + audio=full_audio + ) + context['segments'].append(user_segment) + + # Generate bot response + bot_response = generate_llm_response(user_text, context['segments']) + print(f"Bot response: {bot_response}") + + # Convert to audio using CSM + bot_audio = generate_audio_response(bot_response, context['segments']) + + # Convert audio to base64 for sending over websocket + audio_bytes = io.BytesIO() + torchaudio.save(audio_bytes, bot_audio.unsqueeze(0).cpu(), csm_generator.sample_rate, format="wav") + audio_bytes.seek(0) + audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8') + + # Add bot response to conversation history + bot_segment = Segment( + text=bot_response, + speaker=1, # Bot is speaker 1 + audio=bot_audio + ) + context['segments'].append(bot_segment) + + # Send transcribed text to client + emit('transcription', {'text': user_text}, room=session_id) + + # Send audio response to client + emit('audio_response', { + 'audio': audio_b64, + 'text': bot_response + }, room=session_id) + + except Exception as e: + print(f"Error processing speech: {e}") + emit('error', {'message': f'Error processing speech: {str(e)}'}, room=session_id) + +def generate_llm_response(user_text, conversation_segments): + """Generate text response using Llama 3.2""" + # Format conversation history for the LLM + conversation_history = "" + for segment in conversation_segments[-5:]: # Use last 5 utterances for context + speaker_name = "User" if segment.speaker == 0 else "Assistant" + conversation_history += f"{speaker_name}: {segment.text}\n" + + # Add the current user query + conversation_history += f"User: {user_text}\nAssistant:" + + # Generate response + inputs = llm_tokenizer(conversation_history, return_tensors="pt").to(device) + output = llm_model.generate( + inputs.input_ids, + max_new_tokens=150, + temperature=0.7, + top_p=0.9, + do_sample=True + ) + + response = llm_tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) + return response.strip() + +def generate_audio_response(text, conversation_segments): + """Generate audio response using CSM""" + # Use the last few conversation segments as context + context_segments = conversation_segments[-4:] if len(conversation_segments) > 4 else conversation_segments + + # Generate audio for bot response + audio = csm_generator.generate( + text=text, + speaker=1, # Bot is speaker 1 + context=context_segments, + 
max_audio_length_ms=10000, # 10 seconds max + temperature=0.9, + topk=50 + ) + + return audio + +if __name__ == '__main__': + socketio.run(app, host='0.0.0.0', port=5000, debug=True) \ No newline at end of file diff --git a/Backend/index.html b/Backend/index.html index 5ea925c..359ed41 100644 --- a/Backend/index.html +++ b/Backend/index.html @@ -3,490 +3,454 @@ - Sesame AI Voice Chat - - + Voice Assistant - CSM & Whisper -
[index.html hunk body not recoverable — the markup was stripped during extraction. The visible text shows the old "Sesame AI Voice Chat" page (conversation panel, audio visualizer, voice settings with a 0.01 silence threshold, conversation controls, auto-play and show-visualizer toggles, "Not connected" status indicator) being replaced by a simpler "Voice Assistant with CSM & Whisper" page whose status reads "Connecting to server...".]
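Since the old browser client (voice-chat.js, deleted further down) is removed and the new index.html markup is not recoverable above, here is a minimal sketch of a client exercising the Socket.IO protocol that the new app.py/server.py define. The event names and payload shapes ('start_speaking', 'audio_chunk' carrying base64-encoded float32 samples at an assumed 44.1 kHz, 'stop_speaking', and the 'ready'/'transcription'/'audio_response'/'text_response'/'error' replies) come from the handlers in this diff; the python-socketio dependency, the file name, and the silent placeholder chunk are illustrative assumptions, not part of the change.

```python
# sketch_client.py — hypothetical test client, not part of this change.
# Assumes: pip install "python-socketio[client]" numpy, and the server running on localhost:5000.
import base64
import time

import numpy as np
import socketio

sio = socketio.Client()

@sio.on('ready')
def on_ready(data):
    print("Server ready:", data['message'])

@sio.on('transcription')
def on_transcription(data):
    print("Transcribed:", data['text'])

@sio.on('audio_response')
def on_audio_response(data):
    # 'audio' is a base64-encoded WAV rendered by CSM at the generator's sample rate
    with open("response.wav", "wb") as f:
        f.write(base64.b64decode(data['audio']))
    print("Assistant said:", data['text'])

@sio.on('text_response')
def on_text_response(data):
    # server.py falls back to text-only replies when CSM is unavailable
    print("Assistant (text only):", data['text'])

@sio.on('error')
def on_error(data):
    print("Server error:", data['message'])

sio.connect("http://localhost:5000")

# The server-side handlers expect raw float32 samples, base64-encoded, assumed to be 44.1 kHz.
# A real client would stream microphone chunks; one second of silence stands in here.
sio.emit('start_speaking')
chunk = np.zeros(44100, dtype=np.float32)
sio.emit('audio_chunk', {'audio': base64.b64encode(chunk.tobytes()).decode('utf-8')})
sio.emit('stop_speaking')  # triggers process_user_utterance on the server

time.sleep(15)  # give the models time to respond before disconnecting
sio.disconnect()
```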
- - - - + \ No newline at end of file diff --git a/Backend/req.txt b/Backend/req.txt new file mode 100644 index 0000000..a3edbdc --- /dev/null +++ b/Backend/req.txt @@ -0,0 +1 @@ +pip install faster-whisper \ No newline at end of file diff --git a/Backend/requirements.txt b/Backend/requirements.txt deleted file mode 100644 index ba8a04f..0000000 --- a/Backend/requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -torch==2.4.0 -torchaudio==2.4.0 -tokenizers==0.21.0 -transformers==4.49.0 -huggingface_hub==0.28.1 -moshi==0.2.2 -torchtune==0.4.0 -torchao==0.9.0 -silentcipher @ git+https://github.com/SesameAILabs/silentcipher@master \ No newline at end of file diff --git a/Backend/server.py b/Backend/server.py index 2cf721e..9e98d60 100644 --- a/Backend/server.py +++ b/Backend/server.py @@ -1,904 +1,388 @@ import os +import io import base64 -import json import time -import math -import gc -import logging -import numpy as np import torch import torchaudio -from io import BytesIO -from typing import List, Dict, Any, Optional -from flask import Flask, request, send_from_directory, Response -from flask_cors import CORS -from flask_socketio import SocketIO, emit, disconnect -from generator import load_csm_1b, Segment +import numpy as np +from flask import Flask, render_template, request +from flask_socketio import SocketIO, emit +from transformers import AutoModelForCausalLM, AutoTokenizer from collections import deque -from threading import Lock -from transformers import pipeline -from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline +import requests +import huggingface_hub +from generator import load_csm_1b, Segment -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger("sesame-server") +# Configure environment with longer timeouts +os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600" # 10 minutes timeout for downloads +requests.adapters.DEFAULT_TIMEOUT = 60 # Increase default requests timeout -# Determine best compute device -if torch.backends.mps.is_available(): - device = "mps" -elif torch.cuda.is_available(): - try: - # Test CUDA functionality - torch.rand(10, device="cuda") - torch.backends.cuda.matmul.allow_tf32 = True - torch.backends.cudnn.allow_tf32 = True - torch.backends.cudnn.benchmark = True - device = "cuda" - logger.info("CUDA is fully functional") - except Exception as e: - logger.warning(f"CUDA available but not working correctly: {e}") - device = "cpu" -else: - device = "cpu" - logger.info("Using CPU") +# Create a models directory for caching +os.makedirs("models", exist_ok=True) -# Constants and Configuration -SILENCE_THRESHOLD = 0.01 -SILENCE_DURATION_SEC = 0.75 -MAX_BUFFER_SIZE = 30 # Maximum chunks to buffer before processing -CHUNK_SIZE_MS = 500 # Size of audio chunks when streaming responses - -# Define the base directory and static files directory -base_dir = os.path.dirname(os.path.abspath(__file__)) -static_dir = os.path.join(base_dir, "static") -os.makedirs(static_dir, exist_ok=True) - -# Define a simple energy-based speech detector -class SpeechDetector: - def __init__(self): - self.min_speech_energy = 0.01 - self.speech_window = 0.2 # seconds - - def detect_speech(self, audio_tensor, sample_rate): - # Calculate frame size based on window size - frame_size = int(sample_rate * self.speech_window) - - # If audio is shorter than frame size, use the entire audio - if audio_tensor.shape[0] < frame_size: - frames = [audio_tensor] - else: - # Split audio into frames - 
frames = [audio_tensor[i:i+frame_size] for i in range(0, len(audio_tensor), frame_size)] - - # Calculate energy per frame - energies = [torch.mean(frame**2).item() for frame in frames] - - # Determine if there's speech based on energy threshold - has_speech = any(e > self.min_speech_energy for e in energies) - - return has_speech - -speech_detector = SpeechDetector() -logger.info("Initialized simple speech detector") - -# Model Loading Functions -def load_speech_models(): - """Load speech generation and recognition models""" - # Load CSM (existing code) - generator = load_csm_1b(device=device) - - # Load Whisper model for speech recognition - try: - logger.info(f"Loading speech recognition model on {device}...") - - # Try with newer API first - try: - from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline - - model_id = "openai/whisper-small" - - # Load model and processor - model = AutoModelForSpeechSeq2Seq.from_pretrained( - model_id, - torch_dtype=torch.float16 if device == "cuda" else torch.float32, - device_map=device, - ) - processor = AutoProcessor.from_pretrained(model_id) - - # Create pipeline with specific parameters - speech_recognizer = pipeline( - "automatic-speech-recognition", - model=model, - tokenizer=processor.tokenizer, - feature_extractor=processor.feature_extractor, - max_new_tokens=128, - chunk_length_s=30, - batch_size=16, - device=device, - ) - - except Exception as api_error: - logger.warning(f"Newer API loading failed: {api_error}, trying simpler approach") - - # Fallback to simpler API - speech_recognizer = pipeline( - "automatic-speech-recognition", - model="openai/whisper-small", - device=device - ) - - logger.info("Speech recognition model loaded successfully") - return generator, speech_recognizer - - except Exception as e: - logger.error(f"Error loading speech recognition model: {e}") - return generator, None - -# Unpack both models -generator, speech_recognizer = load_speech_models() - -# Initialize Llama 3.2 model for conversation responses -def load_llm_model(): - """Load Llama 3.2 model for generating text responses""" - try: - logger.info("Loading Llama 3.2 model for conversational responses...") - model_id = "meta-llama/Llama-3.2-1B-Instruct" - tokenizer = AutoTokenizer.from_pretrained(model_id) - - # Determine compute device for LLM - llm_device = "cpu" # Default to CPU for LLM - - # Use CUDA if available and there's enough VRAM - if device == "cuda" and torch.cuda.is_available(): - try: - free_mem = torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated(0) - # If we have at least 2GB free, use CUDA for LLM - if free_mem > 2 * 1024 * 1024 * 1024: - llm_device = "cuda" - except: - pass - - logger.info(f"Using {llm_device} for Llama 3.2 model") - - # Load the model with lower precision for efficiency - model = AutoModelForCausalLM.from_pretrained( - model_id, - torch_dtype=torch.float16 if llm_device == "cuda" else torch.float32, - device_map=llm_device - ) - - # Create a pipeline for easier inference - llm = pipeline( - "text-generation", - model=model, - tokenizer=tokenizer, - max_length=512, - do_sample=True, - temperature=0.7, - top_p=0.9, - repetition_penalty=1.1 - ) - - logger.info("Llama 3.2 model loaded successfully") - return llm - except Exception as e: - logger.error(f"Error loading Llama 3.2 model: {e}") - return None - -# Load the LLM model -llm = load_llm_model() - -# Set up Flask and Socket.IO app = Flask(__name__) -CORS(app) -socketio = SocketIO(app, cors_allowed_origins="*", 
async_mode='eventlet') +app.config['SECRET_KEY'] = 'your-secret-key' +socketio = SocketIO(app, cors_allowed_origins="*") -# Socket connection management -thread_lock = Lock() -active_clients = {} # Map client_id to client context - -# Audio Utility Functions -def decode_audio_data(audio_data: str) -> torch.Tensor: - """Decode base64 audio data to a torch tensor with improved error handling""" - try: - # Skip empty audio data - if not audio_data or len(audio_data) < 100: - logger.warning("Empty or too short audio data received") - return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence - - # Extract the actual base64 content - if ',' in audio_data: - audio_data = audio_data.split(',')[1] - - # Decode base64 audio data - try: - binary_data = base64.b64decode(audio_data) - logger.debug(f"Decoded base64 data: {len(binary_data)} bytes") - - # Check if we have enough data for a valid WAV - if len(binary_data) < 44: # WAV header is 44 bytes - logger.warning("Data too small to be a valid WAV file") - return torch.zeros(generator.sample_rate // 2) - except Exception as e: - logger.error(f"Base64 decoding error: {e}") - return torch.zeros(generator.sample_rate // 2) - - # Multiple approaches to handle audio data - audio_tensor = None - sample_rate = None - - # Approach 1: Direct loading with torchaudio - try: - with BytesIO(binary_data) as temp_file: - temp_file.seek(0) - audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav") - logger.debug(f"Loaded audio: shape={audio_tensor.shape}, rate={sample_rate}Hz") - - # Validate tensor - if audio_tensor.numel() == 0 or torch.isnan(audio_tensor).any(): - raise ValueError("Invalid audio tensor") - except Exception as e: - logger.warning(f"Direct loading failed: {e}") - - # Approach 2: Using wave module and numpy - try: - temp_path = os.path.join(base_dir, f"temp_{time.time()}.wav") - with open(temp_path, 'wb') as f: - f.write(binary_data) - - import wave - with wave.open(temp_path, 'rb') as wf: - n_channels = wf.getnchannels() - sample_width = wf.getsampwidth() - sample_rate = wf.getframerate() - n_frames = wf.getnframes() - frames = wf.readframes(n_frames) - - # Convert to numpy array - if sample_width == 2: # 16-bit audio - data = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0 - elif sample_width == 1: # 8-bit audio - data = np.frombuffer(frames, dtype=np.uint8).astype(np.float32) / 128.0 - 1.0 - else: - raise ValueError(f"Unsupported sample width: {sample_width}") - - # Convert to mono if needed - if n_channels > 1: - data = data.reshape(-1, n_channels) - data = data.mean(axis=1) - - # Convert to torch tensor - audio_tensor = torch.from_numpy(data) - logger.info(f"Loaded audio using wave: shape={audio_tensor.shape}") - - # Clean up temp file - if os.path.exists(temp_path): - os.remove(temp_path) - - except Exception as e2: - logger.error(f"All audio loading methods failed: {e2}") - return torch.zeros(generator.sample_rate // 2) - - # Format corrections - if audio_tensor is None: - return torch.zeros(generator.sample_rate // 2) - - # Ensure audio is mono - if len(audio_tensor.shape) > 1 and audio_tensor.shape[0] > 1: - audio_tensor = torch.mean(audio_tensor, dim=0) - - # Ensure 1D tensor - audio_tensor = audio_tensor.squeeze() - - # Resample if needed - if sample_rate != generator.sample_rate: - try: - logger.debug(f"Resampling from {sample_rate}Hz to {generator.sample_rate}Hz") - resampler = torchaudio.transforms.Resample( - orig_freq=sample_rate, - new_freq=generator.sample_rate - ) - audio_tensor = 
resampler(audio_tensor) - except Exception as e: - logger.warning(f"Resampling error: {e}") - - # Normalize audio to avoid issues - if torch.abs(audio_tensor).max() > 0: - audio_tensor = audio_tensor / torch.abs(audio_tensor).max() - - return audio_tensor - except Exception as e: - logger.error(f"Unhandled error in decode_audio_data: {e}") - return torch.zeros(generator.sample_rate // 2) - -def encode_audio_data(audio_tensor: torch.Tensor) -> str: - """Encode torch tensor audio to base64 string""" - try: - buf = BytesIO() - torchaudio.save(buf, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate, format="wav") - buf.seek(0) - audio_base64 = base64.b64encode(buf.read()).decode('utf-8') - return f"data:audio/wav;base64,{audio_base64}" - except Exception as e: - logger.error(f"Error encoding audio: {e}") - # Return a minimal silent audio file - silence = torch.zeros(generator.sample_rate // 2).unsqueeze(0) - buf = BytesIO() - torchaudio.save(buf, silence, generator.sample_rate, format="wav") - buf.seek(0) - return f"data:audio/wav;base64,{base64.b64encode(buf.read()).decode('utf-8')}" - -def process_speech(audio_tensor: torch.Tensor, client_id: str) -> str: - """Process speech with speech recognition""" - if not speech_recognizer: - # Fallback to basic detection if model failed to load - return detect_speech_energy(audio_tensor) - - try: - # Save audio to temp file for Whisper - temp_path = os.path.join(base_dir, f"temp_{time.time()}.wav") - torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate) - - # Perform speech recognition - handle the warning differently - # Just pass the path without any additional parameters - try: - # First try - use default parameters - result = speech_recognizer(temp_path) - transcription = result["text"] - except Exception as whisper_error: - logger.warning(f"First transcription attempt failed: {whisper_error}") - # Try with explicit parameters for older versions of transformers - import numpy as np - import soundfile as sf - - # Load audio as numpy array - audio_np, sr = sf.read(temp_path) - if sr != 16000: - # Whisper expects 16kHz audio - from scipy import signal - audio_np = signal.resample(audio_np, int(len(audio_np) * 16000 / sr)) - - # Try with numpy array directly - result = speech_recognizer(audio_np) - transcription = result["text"] - - # Clean up temp file - if os.path.exists(temp_path): - os.remove(temp_path) - - # Return empty string if no speech detected - if not transcription or transcription.isspace(): - return "I didn't detect any speech. Could you please try again?" - - logger.info(f"Transcription successful: '{transcription}'") - return transcription - - except Exception as e: - logger.error(f"Speech recognition error: {e}") - return "Sorry, I couldn't understand what you said. Could you try again?" - -def detect_speech_energy(audio_tensor: torch.Tensor) -> str: - """Basic speech detection based on audio energy levels""" - # Calculate audio energy - energy = torch.mean(torch.abs(audio_tensor)).item() - - logger.debug(f"Audio energy detected: {energy:.6f}") - - # Generate response based on energy level - if energy > 0.1: # Louder speech - return "I heard you speaking clearly. How can I help you today?" - elif energy > 0.05: # Moderate speech - return "I heard you say something. Could you please repeat that?" - elif energy > 0.02: # Soft speech - return "I detected some speech, but it was quite soft. Could you speak up a bit?" - else: # Very soft or no speech - return "I didn't detect any speech. 
Could you please try again?" - -def generate_response(text: str, conversation_history: List[Segment]) -> str: - """Generate a contextual response based on the transcribed text using Llama 3.2""" - # If LLM is not available, use simple responses - if llm is None: - return generate_simple_response(text) - - try: - # Create a conversational prompt based on history - # Format: recent conversation turns (up to 4) + current user input - history_str = "" - - # Add up to 4 recent conversation turns (excluding the current one) - recent_segments = [ - seg for seg in conversation_history[-8:] - if seg.text and not seg.text.isspace() - ] - - for i, segment in enumerate(recent_segments): - speaker_name = "User" if segment.speaker == 0 else "Assistant" - history_str += f"{speaker_name}: {segment.text}\n" - - # Construct the prompt for Llama 3.2 - prompt = f"""<|system|> -You are Sesame, a helpful, friendly and concise voice assistant. -Keep your responses conversational, natural, and to the point. -Respond to the user's latest message in the context of the conversation. -<|end|> - -{history_str} -User: {text} -Assistant:""" - - logger.debug(f"LLM Prompt: {prompt}") - - # Generate response with the LLM - result = llm( - prompt, - max_new_tokens=150, - do_sample=True, - temperature=0.7, - top_p=0.9, - repetition_penalty=1.1 - ) - - # Extract the generated text - response = result[0]["generated_text"] - - # Extract just the Assistant's response (after the prompt) - response = response.split("Assistant:")[-1].strip() - - # Clean up and ensure it's not too long for TTS - response = response.split("\n")[0].strip() - if len(response) > 200: - response = response[:197] + "..." - - logger.info(f"LLM response: {response}") - return response - - except Exception as e: - logger.error(f"Error generating LLM response: {e}") - # Fall back to simple responses - return generate_simple_response(text) - -def generate_simple_response(text: str) -> str: - """Generate a simple rule-based response as fallback""" - responses = { - "hello": "Hello there! How can I help you today?", - "hi": "Hi there! What can I do for you?", - "how are you": "I'm doing well, thanks for asking! How about you?", - "what is your name": "I'm Sesame, your voice assistant. How can I help you?", - "who are you": "I'm Sesame, an AI voice assistant. I'm here to chat with you!", - "bye": "Goodbye! It was nice chatting with you.", - "thank you": "You're welcome! Is there anything else I can help with?", - "weather": "I don't have real-time weather data, but I hope it's nice where you are!", - "help": "I can chat with you using natural voice. Just speak normally and I'll respond.", - "what can you do": "I can have a conversation with you, answer questions, and provide assistance with various topics.", - } - - text_lower = text.lower() - - # Check for matching keywords - for key, response in responses.items(): - if key in text_lower: - return response - - # Default responses based on text length - if not text: - return "I didn't catch that. Could you please repeat?" - elif len(text) < 10: - return "Thanks for your message. Could you elaborate a bit more?" 
+# Check for CUDA availability and handle potential CUDA/cuDNN issues +try: + cuda_available = torch.cuda.is_available() + # Try to initialize CUDA to check if libraries are properly loaded + if cuda_available: + _ = torch.zeros(1).cuda() + device = "cuda" + whisper_compute_type = "float16" + print("CUDA is available and initialized successfully") + elif torch.backends.mps.is_available(): + device = "mps" + whisper_compute_type = "float32" + print("MPS is available (Apple Silicon)") else: - return f"I heard you say something about that. Can you tell me more?" + device = "cpu" + whisper_compute_type = "int8" + print("Using CPU (CUDA/MPS not available)") +except Exception as e: + print(f"Error initializing CUDA: {e}") + print("Falling back to CPU") + device = "cpu" + whisper_compute_type = "int8" + +print(f"Using device: {device}") + +# Initialize models with proper error handling +whisper_model = None +csm_generator = None +llm_model = None +llm_tokenizer = None + +def load_models(): + global whisper_model, csm_generator, llm_model, llm_tokenizer + + # Initialize Faster-Whisper for transcription + try: + print("Loading Whisper model...") + # Import here to avoid immediate import errors if package is missing + from faster_whisper import WhisperModel + # Force CPU for Whisper if we had CUDA issues + whisper_device = device if device != "cpu" else "cpu" + whisper_model = WhisperModel("base", device=whisper_device, compute_type=whisper_compute_type, download_root="./models/whisper") + print("Whisper model loaded successfully") + except Exception as e: + print(f"Error loading Whisper model: {e}") + print("Will use backup speech recognition method if available") + + # Initialize CSM model for audio generation + try: + print("Loading CSM model...") + # Force CPU for CSM if we had CUDA issues + csm_device = device if device != "cpu" else "cpu" + csm_generator = load_csm_1b(device=csm_device) + print("CSM model loaded successfully") + except Exception as e: + print(f"Error loading CSM model: {e}") + print("Audio generation will not be available") + + # Initialize Llama 3.2 model for response generation + try: + print("Loading Llama 3.2 model...") + llm_model_id = "meta-llama/Llama-3.2-1B" # Choose appropriate size based on resources + llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id, cache_dir="./models/llama") + # Force CPU for LLM if we had CUDA issues + llm_device = device if device != "cpu" else "cpu" + llm_model = AutoModelForCausalLM.from_pretrained( + llm_model_id, + torch_dtype=torch.bfloat16 if llm_device != "cpu" else torch.float32, + device_map=llm_device, + cache_dir="./models/llama", + low_cpu_mem_usage=True + ) + print("Llama 3.2 model loaded successfully") + except Exception as e: + print(f"Error loading Llama 3.2 model: {e}") + print("Will use a fallback response generation method") + +# Store conversation context +conversation_context = {} # session_id -> context -# Flask Routes @app.route('/') def index(): - return send_from_directory(base_dir, 'index.html') + return render_template('index.html') -@app.route('/favicon.ico') -def favicon(): - if os.path.exists(os.path.join(static_dir, 'favicon.ico')): - return send_from_directory(static_dir, 'favicon.ico') - return Response(status=204) - -@app.route('/voice-chat.js') -def voice_chat_js(): - return send_from_directory(base_dir, 'voice-chat.js') - -@app.route('/static/') -def serve_static(path): - return send_from_directory(static_dir, path) - -# Socket.IO Event Handlers @socketio.on('connect') def handle_connect(): - 
client_id = request.sid - logger.info(f"Client connected: {client_id}") - - # Initialize client context - active_clients[client_id] = { - 'context_segments': [], - 'streaming_buffer': [], - 'is_streaming': False, - 'is_silence': False, - 'last_active_time': time.time(), - 'energy_window': deque(maxlen=10) + print(f"Client connected: {request.sid}") + conversation_context[request.sid] = { + 'segments': [], + 'speakers': [0, 1], # 0 = user, 1 = bot + 'audio_buffer': deque(maxlen=10), # Store recent audio chunks + 'is_speaking': False, + 'silence_start': None } - - emit('status', {'type': 'connected', 'message': 'Connected to server'}) + emit('ready', {'message': 'Connection established'}) @socketio.on('disconnect') def handle_disconnect(): - client_id = request.sid - if client_id in active_clients: - del active_clients[client_id] - logger.info(f"Client disconnected: {client_id}") + print(f"Client disconnected: {request.sid}") + if request.sid in conversation_context: + del conversation_context[request.sid] -@socketio.on('generate') -def handle_generate(data): - client_id = request.sid - if client_id not in active_clients: - emit('error', {'message': 'Client not registered'}) +@socketio.on('start_speaking') +def handle_start_speaking(): + if request.sid in conversation_context: + conversation_context[request.sid]['is_speaking'] = True + conversation_context[request.sid]['audio_buffer'].clear() + print(f"User {request.sid} started speaking") + +@socketio.on('audio_chunk') +def handle_audio_chunk(data): + if request.sid not in conversation_context: return - try: - text = data.get('text', '') - speaker_id = data.get('speaker', 0) - - logger.info(f"Generating audio for: '{text}' with speaker {speaker_id}") - - # Generate audio response - audio_tensor = generator.generate( - text=text, - speaker=speaker_id, - context=active_clients[client_id]['context_segments'], - max_audio_length_ms=10_000, - ) - - # Add to conversation context - active_clients[client_id]['context_segments'].append( - Segment(text=text, speaker=speaker_id, audio=audio_tensor) - ) - - # Convert audio to base64 and send back to client - audio_base64 = encode_audio_data(audio_tensor) - emit('audio_response', { - 'type': 'audio_response', - 'audio': audio_base64, - 'text': text - }) - - except Exception as e: - logger.error(f"Error generating audio: {e}") - emit('error', { - 'type': 'error', - 'message': f"Error generating audio: {str(e)}" - }) + context = conversation_context[request.sid] + + # Decode audio data + audio_data = base64.b64decode(data['audio']) + audio_numpy = np.frombuffer(audio_data, dtype=np.float32) + audio_tensor = torch.tensor(audio_numpy) + + # Add to buffer + context['audio_buffer'].append(audio_tensor) + + # Check for silence to detect end of speech + if context['is_speaking'] and is_silence(audio_tensor): + if context['silence_start'] is None: + context['silence_start'] = time.time() + elif time.time() - context['silence_start'] > 1.0: # 1 second of silence + # Process the complete utterance + process_user_utterance(request.sid) + else: + context['silence_start'] = None -@socketio.on('add_to_context') -def handle_add_to_context(data): - client_id = request.sid - if client_id not in active_clients: - emit('error', {'message': 'Client not registered'}) +@socketio.on('stop_speaking') +def handle_stop_speaking(): + if request.sid in conversation_context: + conversation_context[request.sid]['is_speaking'] = False + process_user_utterance(request.sid) + print(f"User {request.sid} stopped speaking") + +def 
is_silence(audio_tensor, threshold=0.02): + """Check if an audio chunk is silence based on amplitude threshold""" + return torch.mean(torch.abs(audio_tensor)) < threshold + +def process_user_utterance(session_id): + """Process completed user utterance, generate response and send audio back""" + context = conversation_context[session_id] + + if not context['audio_buffer']: return - try: - text = data.get('text', '') - speaker_id = data.get('speaker', 0) - audio_data = data.get('audio', '') - - # Convert received audio to tensor - audio_tensor = decode_audio_data(audio_data) - - # Add to conversation context - active_clients[client_id]['context_segments'].append( - Segment(text=text, speaker=speaker_id, audio=audio_tensor) - ) - - emit('context_updated', { - 'type': 'context_updated', - 'message': 'Audio added to context' - }) - - except Exception as e: - logger.error(f"Error adding to context: {e}") - emit('error', { - 'type': 'error', - 'message': f"Error processing audio: {str(e)}" - }) - -@socketio.on('clear_context') -def handle_clear_context(): - client_id = request.sid - if client_id in active_clients: - active_clients[client_id]['context_segments'] = [] - - emit('context_updated', { - 'type': 'context_updated', - 'message': 'Context cleared' - }) - -@socketio.on('stream_audio') -def handle_stream_audio(data): - client_id = request.sid - if client_id not in active_clients: - emit('error', {'message': 'Client not registered'}) - return + # Combine audio chunks + full_audio = torch.cat(list(context['audio_buffer']), dim=0) + context['audio_buffer'].clear() + context['is_speaking'] = False + context['silence_start'] = None - client = active_clients[client_id] + # Save audio to temporary WAV file for transcription + temp_audio_path = f"temp_audio_{session_id}.wav" + torchaudio.save( + temp_audio_path, + full_audio.unsqueeze(0), + 44100 # Assuming 44.1kHz from client + ) try: - speaker_id = data.get('speaker', 0) - audio_data = data.get('audio', '') - - # Skip if no audio data (might be just a connection test) - if not audio_data: - logger.debug("Empty audio data received, ignoring") - return - - # Convert received audio to tensor - audio_chunk = decode_audio_data(audio_data) - - # Start streaming mode if not already started - if not client['is_streaming']: - client['is_streaming'] = True - client['streaming_buffer'] = [] - client['energy_window'].clear() - client['is_silence'] = False - client['last_active_time'] = time.time() - logger.info(f"[{client_id[:8]}] Streaming started with speaker ID: {speaker_id}") - emit('streaming_status', { - 'type': 'streaming_status', - 'status': 'started' - }) - - # Calculate audio energy for silence detection - chunk_energy = torch.mean(torch.abs(audio_chunk)).item() - client['energy_window'].append(chunk_energy) - avg_energy = sum(client['energy_window']) / len(client['energy_window']) - - # Check if audio is silent - current_silence = avg_energy < SILENCE_THRESHOLD - - # Track silence transition - if not client['is_silence'] and current_silence: - # Transition to silence - client['is_silence'] = True - client['last_active_time'] = time.time() - elif client['is_silence'] and not current_silence: - # User started talking again - client['is_silence'] = False - - # Add chunk to buffer regardless of silence state - client['streaming_buffer'].append(audio_chunk) - - # Check if silence has persisted long enough to consider "stopped talking" - silence_elapsed = time.time() - client['last_active_time'] - - if client['is_silence'] and silence_elapsed >= 
SILENCE_DURATION_SEC and len(client['streaming_buffer']) > 0: - # User has stopped talking - process the collected audio - logger.info(f"[{client_id[:8]}] Processing audio after {silence_elapsed:.2f}s of silence") - process_complete_utterance(client_id, client, speaker_id) - - # If buffer gets too large without silence, process it anyway - elif len(client['streaming_buffer']) >= MAX_BUFFER_SIZE: - logger.info(f"[{client_id[:8]}] Processing long audio segment without silence") - process_complete_utterance(client_id, client, speaker_id, is_incomplete=True) - - # Keep half of the buffer for context (sliding window approach) - half_point = len(client['streaming_buffer']) // 2 - client['streaming_buffer'] = client['streaming_buffer'][half_point:] - - except Exception as e: - import traceback - traceback.print_exc() - logger.error(f"Error processing streaming audio: {e}") - emit('error', { - 'type': 'error', - 'message': f"Error processing streaming audio: {str(e)}" - }) - -def process_complete_utterance(client_id, client, speaker_id, is_incomplete=False): - """Process a complete utterance (after silence or buffer limit)""" - try: - # Combine audio chunks - full_audio = torch.cat(client['streaming_buffer'], dim=0) - - # Process audio to generate a response (using speech recognition) - generated_text = process_speech(full_audio, client_id) - - # Add suffix for incomplete utterances - if is_incomplete: - generated_text += " (processing continued speech...)" - - # Log the generated text - logger.info(f"[{client_id[:8]}] Generated text: '{generated_text}'") - - # Handle the result - if generated_text: - # Add user message to context - user_segment = Segment(text=generated_text, speaker=speaker_id, audio=full_audio) - client['context_segments'].append(user_segment) - - # Send the text to client - emit('transcription', { - 'type': 'transcription', - 'text': generated_text - }, room=client_id) - - # Only generate a response if this is a complete utterance - if not is_incomplete: - # Generate a contextual response - response_text = generate_response(generated_text, client['context_segments']) - logger.info(f"[{client_id[:8]}] Generating response: '{response_text}'") - - # Let the client know we're processing - emit('processing_status', { - 'type': 'processing_status', - 'status': 'generating_audio', - 'message': 'Generating audio response...' 
- }, room=client_id) - - # Generate audio for the response - try: - # Use a different speaker than the user - ai_speaker_id = 1 if speaker_id == 0 else 0 - - # Generate the full response - audio_tensor = generator.generate( - text=response_text, - speaker=ai_speaker_id, - context=client['context_segments'], - max_audio_length_ms=10_000, - ) - - # Add response to context - ai_segment = Segment( - text=response_text, - speaker=ai_speaker_id, - audio=audio_tensor - ) - client['context_segments'].append(ai_segment) - - # CHANGE HERE: Use the streaming function instead of sending all at once - # Check if the audio is short enough to send at once or if it should be streamed - if audio_tensor.size(0) < generator.sample_rate * 2: # Less than 2 seconds - # For short responses, just send in one go for better responsiveness - audio_base64 = encode_audio_data(audio_tensor) - emit('audio_response', { - 'type': 'audio_response', - 'text': response_text, - 'audio': audio_base64 - }, room=client_id) - logger.info(f"[{client_id[:8]}] Short audio response sent in one piece") - else: - # For longer responses, use streaming - logger.info(f"[{client_id[:8]}] Using streaming for audio response") - # Start a new thread for streaming to avoid blocking the main thread - import threading - stream_thread = threading.Thread( - target=stream_audio_to_client, - args=(client_id, audio_tensor, response_text, ai_speaker_id) - ) - stream_thread.start() - - except Exception as e: - logger.error(f"Error generating audio response: {e}") - emit('error', { - 'type': 'error', - 'message': "Sorry, there was an error generating the audio response." - }, room=client_id) + # Try using Whisper first if available + if whisper_model is not None: + user_text = transcribe_with_whisper(temp_audio_path) else: - # If processing failed, send a notification - emit('error', { - 'type': 'error', - 'message': "Sorry, I couldn't understand what you said. Could you try again?" - }, room=client_id) + # Fallback to Google's speech recognition + user_text = transcribe_with_google(temp_audio_path) - # Only clear buffer for complete utterances - if not is_incomplete: - # Reset state - client['streaming_buffer'] = [] - client['energy_window'].clear() - client['is_silence'] = False - client['last_active_time'] = time.time() - - except Exception as e: - logger.error(f"Error processing utterance: {e}") - emit('error', { - 'type': 'error', - 'message': f"Error processing audio: {str(e)}" - }, room=client_id) - -@socketio.on('stop_streaming') -def handle_stop_streaming(data): - client_id = request.sid - if client_id not in active_clients: - return - - client = active_clients[client_id] - client['is_streaming'] = False - - if client['streaming_buffer'] and len(client['streaming_buffer']) > 5: - # Process any remaining audio in the buffer - logger.info(f"[{client_id[:8]}] Processing final audio buffer on stop") - process_complete_utterance(client_id, client, data.get("speaker", 0)) - - client['streaming_buffer'] = [] - emit('streaming_status', { - 'type': 'streaming_status', - 'status': 'stopped' - }) - -def stream_audio_to_client(client_id, audio_tensor, text, speaker_id, chunk_size_ms=CHUNK_SIZE_MS): - """Stream audio to client in chunks to simulate real-time generation""" - try: - if client_id not in active_clients: - logger.warning(f"Client {client_id} not found for streaming") + if not user_text: + print("No speech detected.") + emit('error', {'message': 'No speech detected. 
Please try again.'}, room=session_id) return - # Calculate chunk size in samples - chunk_size = int(generator.sample_rate * chunk_size_ms / 1000) - total_chunks = math.ceil(audio_tensor.size(0) / chunk_size) + print(f"Transcribed: {user_text}") - logger.info(f"Streaming audio in {total_chunks} chunks of {chunk_size_ms}ms each") + # Add to conversation segments + user_segment = Segment( + text=user_text, + speaker=0, # User is speaker 0 + audio=full_audio + ) + context['segments'].append(user_segment) - # Send initial response with text but no audio yet - socketio.emit('audio_response_start', { - 'type': 'audio_response_start', - 'text': text, - 'total_chunks': total_chunks - }, room=client_id) + # Generate bot response + bot_response = generate_llm_response(user_text, context['segments']) + print(f"Bot response: {bot_response}") - # Stream each chunk - for i in range(total_chunks): - start_idx = i * chunk_size - end_idx = min(start_idx + chunk_size, audio_tensor.size(0)) - - # Extract chunk - chunk = audio_tensor[start_idx:end_idx] - - # Encode chunk - chunk_base64 = encode_audio_data(chunk) - - # Send chunk - socketio.emit('audio_response_chunk', { - 'type': 'audio_response_chunk', - 'chunk_index': i, - 'total_chunks': total_chunks, - 'audio': chunk_base64, - 'is_last': i == total_chunks - 1 - }, room=client_id) - - # Brief pause between chunks to simulate streaming - time.sleep(0.1) - - # Send completion message - socketio.emit('audio_response_complete', { - 'type': 'audio_response_complete', - 'text': text - }, room=client_id) + # Send transcribed text to client + emit('transcription', {'text': user_text}, room=session_id) - logger.info(f"Audio streaming complete: {total_chunks} chunks sent") + # Generate and send audio response if CSM is available + if csm_generator is not None: + # Convert to audio using CSM + bot_audio = generate_audio_response(bot_response, context['segments']) + + # Convert audio to base64 for sending over websocket + audio_bytes = io.BytesIO() + torchaudio.save(audio_bytes, bot_audio.unsqueeze(0).cpu(), csm_generator.sample_rate, format="wav") + audio_bytes.seek(0) + audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8') + + # Add bot response to conversation history + bot_segment = Segment( + text=bot_response, + speaker=1, # Bot is speaker 1 + audio=bot_audio + ) + context['segments'].append(bot_segment) + + # Send audio response to client + emit('audio_response', { + 'audio': audio_b64, + 'text': bot_response + }, room=session_id) + else: + # Send text-only response if audio generation isn't available + emit('text_response', {'text': bot_response}, room=session_id) + + # Add text-only bot response to conversation history + bot_segment = Segment( + text=bot_response, + speaker=1, # Bot is speaker 1 + audio=torch.zeros(1) # Placeholder empty audio + ) + context['segments'].append(bot_segment) except Exception as e: - logger.error(f"Error streaming audio to client: {e}") - import traceback - traceback.print_exc() + print(f"Error processing speech: {e}") + emit('error', {'message': f'Error processing speech: {str(e)}'}, room=session_id) + finally: + # Cleanup temp file + if os.path.exists(temp_audio_path): + os.remove(temp_audio_path) -# Main server start -if __name__ == "__main__": - print(f"\n{'='*60}") - print(f"🔊 Sesame AI Voice Chat Server") - print(f"{'='*60}") - print(f"📡 Server Information:") - print(f" - Local URL: http://localhost:5000") - print(f" - Network URL: http://:5000") - print(f"{'='*60}") - print(f"🌐 Device: {device.upper()}") - 
print(f"🧠 Models: Sesame CSM (TTS only)") - print(f"🔧 Serving from: {os.path.join(base_dir, 'index.html')}") - print(f"{'='*60}") - print(f"Ready to receive connections! Press Ctrl+C to stop the server.\n") +def transcribe_with_whisper(audio_path): + """Transcribe audio using Faster-Whisper""" + segments, info = whisper_model.transcribe(audio_path, beam_size=5) - socketio.run(app, host="0.0.0.0", port=5000, debug=False) \ No newline at end of file + # Collect all text from segments + user_text = "" + for segment in segments: + segment_text = segment.text.strip() + print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment_text}") + user_text += segment_text + " " + + return user_text.strip() + +def transcribe_with_google(audio_path): + """Fallback transcription using Google's speech recognition""" + import speech_recognition as sr + recognizer = sr.Recognizer() + + with sr.AudioFile(audio_path) as source: + audio = recognizer.record(source) + try: + text = recognizer.recognize_google(audio) + return text + except sr.UnknownValueError: + return "" + except sr.RequestError: + # If Google API fails, try a basic energy-based VAD approach + # This is a very basic fallback and won't give good results + return "[Speech detected but transcription failed]" + +def generate_llm_response(user_text, conversation_segments): + """Generate text response using available model""" + if llm_model is not None and llm_tokenizer is not None: + # Format conversation history for the LLM + conversation_history = "" + for segment in conversation_segments[-5:]: # Use last 5 utterances for context + speaker_name = "User" if segment.speaker == 0 else "Assistant" + conversation_history += f"{speaker_name}: {segment.text}\n" + + # Add the current user query + conversation_history += f"User: {user_text}\nAssistant:" + + try: + # Generate response + inputs = llm_tokenizer(conversation_history, return_tensors="pt").to(device) + output = llm_model.generate( + inputs.input_ids, + max_new_tokens=150, + temperature=0.7, + top_p=0.9, + do_sample=True + ) + + response = llm_tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) + return response.strip() + except Exception as e: + print(f"Error generating response with LLM: {e}") + return fallback_response(user_text) + else: + return fallback_response(user_text) + +def fallback_response(user_text): + """Generate a simple fallback response when LLM is not available""" + # Simple rule-based responses + user_text_lower = user_text.lower() + + if "hello" in user_text_lower or "hi" in user_text_lower: + return "Hello! I'm a simple fallback assistant. The main language model couldn't be loaded, so I have limited capabilities." + + elif "how are you" in user_text_lower: + return "I'm functioning within my limited capabilities. How can I assist you today?" + + elif "thank" in user_text_lower: + return "You're welcome! Let me know if there's anything else I can help with." + + elif "bye" in user_text_lower or "goodbye" in user_text_lower: + return "Goodbye! Have a great day!" + + elif any(q in user_text_lower for q in ["what", "who", "where", "when", "why", "how"]): + return "I'm running in fallback mode and can't answer complex questions. Please try again when the main language model is available." + + else: + return "I understand you said something about that. Unfortunately, I'm running in fallback mode with limited capabilities. Please try again later when the main model is available." 
+ +def generate_audio_response(text, conversation_segments): + """Generate audio response using CSM""" + try: + # Use the last few conversation segments as context + context_segments = conversation_segments[-4:] if len(conversation_segments) > 4 else conversation_segments + + # Generate audio for bot response + audio = csm_generator.generate( + text=text, + speaker=1, # Bot is speaker 1 + context=context_segments, + max_audio_length_ms=10000, # 10 seconds max + temperature=0.9, + topk=50 + ) + + return audio + except Exception as e: + print(f"Error generating audio: {e}") + # Return silence as fallback + return torch.zeros(csm_generator.sample_rate * 3) # 3 seconds of silence + +if __name__ == '__main__': + # Ensure the existing index.html file is in the correct location + if not os.path.exists('templates'): + os.makedirs('templates') + + if os.path.exists('index.html') and not os.path.exists('templates/index.html'): + os.rename('index.html', 'templates/index.html') + + # Load models asynchronously before starting the server + print("Starting model loading...") + # In a production environment, you could load models in a separate thread + load_models() + + # Start the server + print("Starting Flask SocketIO server...") + socketio.run(app, host='0.0.0.0', port=5000, debug=False) \ No newline at end of file diff --git a/Backend/setup.py b/Backend/setup.py deleted file mode 100644 index 8eddb95..0000000 --- a/Backend/setup.py +++ /dev/null @@ -1,13 +0,0 @@ -from setuptools import setup, find_packages -import os - -# Read requirements from requirements.txt -with open('requirements.txt') as f: - requirements = [line.strip() for line in f if line.strip() and not line.startswith('#')] - -setup( - name='csm', - version='0.1.0', - packages=find_packages(), - install_requires=requirements, -) diff --git a/Backend/test.py b/Backend/test.py deleted file mode 100644 index 34735b1..0000000 --- a/Backend/test.py +++ /dev/null @@ -1,50 +0,0 @@ -import os -import torch -import torchaudio -from huggingface_hub import hf_hub_download -from generator import load_csm_1b, Segment -from dataclasses import dataclass - -if torch.backends.mps.is_available(): - device = "mps" -elif torch.cuda.is_available(): - device = "cuda" -else: - device = "cpu" - -generator = load_csm_1b(device=device) - -speakers = [0, 1, 0, 0] -transcripts = [ - "Hey how are you doing.", - "Pretty good, pretty good.", - "I'm great.", - "So happy to be speaking to you.", -] -audio_paths = [ - "utterance_0.wav", - "utterance_1.wav", - "utterance_2.wav", - "utterance_3.wav", -] - -def load_audio(audio_path): - audio_tensor, sample_rate = torchaudio.load(audio_path) - audio_tensor = torchaudio.functional.resample( - audio_tensor.squeeze(0), orig_freq=sample_rate, new_freq=generator.sample_rate - ) - return audio_tensor - -segments = [ - Segment(text=transcript, speaker=speaker, audio=load_audio(audio_path)) - for transcript, speaker, audio_path in zip(transcripts, speakers, audio_paths) -] - -audio = generator.generate( - text="Me too, this is some cool stuff huh?", - speaker=1, - context=segments, - max_audio_length_ms=10_000, -) - -torchaudio.save("audio.wav", audio.unsqueeze(0).cpu(), generator.sample_rate) \ No newline at end of file diff --git a/Backend/voice-chat.js b/Backend/voice-chat.js deleted file mode 100644 index 89ec71a..0000000 --- a/Backend/voice-chat.js +++ /dev/null @@ -1,1071 +0,0 @@ -/** - * Sesame AI Voice Chat Client - * - * A web client that connects to a Sesame AI voice chat server and enables - * real-time voice 
conversation with an AI assistant. - */ - -// Configuration constants -const SERVER_URL = window.location.hostname === 'localhost' ? - 'http://localhost:5000' : window.location.origin; -const ENERGY_WINDOW_SIZE = 15; -const CLIENT_SILENCE_DURATION_MS = 750; - -// DOM elements -const elements = { - conversation: null, - streamButton: null, - clearButton: null, - thresholdSlider: null, - thresholdValue: null, - visualizerCanvas: null, - visualizerLabel: null, - volumeLevel: null, - statusDot: null, - statusText: null, - speakerSelection: null, - autoPlayResponses: null, - showVisualizer: null -}; - -// Application state -const state = { - socket: null, - audioContext: null, - analyser: null, - microphone: null, - streamProcessor: null, - isStreaming: false, - isSpeaking: false, - silenceThreshold: 0.01, - energyWindow: [], - silenceTimer: null, - volumeUpdateInterval: null, - visualizerAnimationFrame: null, - currentSpeaker: 0 -}; - -// Visualizer variables -let canvasContext = null; -let visualizerBufferLength = 0; -let visualizerDataArray = null; - -// New state variables to track incremental audio streaming -const streamingAudio = { - messageElement: null, - audioElement: null, - chunks: [], - totalChunks: 0, - receivedChunks: 0, - text: '', - mediaSource: null, - sourceBuffer: null, - audioContext: null, - complete: false -}; - -// Initialize the application -function initializeApp() { - // Initialize the UI elements - initializeUIElements(); - - // Initialize socket.io connection - setupSocketConnection(); - - // Setup event listeners - setupEventListeners(); - - // Initialize visualizer - setupVisualizer(); - - // Show welcome message - addSystemMessage('Welcome to Sesame AI Voice Chat! Click "Start Conversation" to begin.'); -} - -// Initialize UI elements -function initializeUIElements() { - // Store references to UI elements - elements.conversation = document.getElementById('conversation'); - elements.streamButton = document.getElementById('streamButton'); - elements.clearButton = document.getElementById('clearButton'); - elements.thresholdSlider = document.getElementById('thresholdSlider'); - elements.thresholdValue = document.getElementById('thresholdValue'); - elements.visualizerCanvas = document.getElementById('audioVisualizer'); - elements.visualizerLabel = document.getElementById('visualizerLabel'); - elements.volumeLevel = document.getElementById('volumeLevel'); - elements.statusDot = document.getElementById('statusDot'); - elements.statusText = document.getElementById('statusText'); - elements.speakerSelection = document.getElementById('speakerSelect'); // Changed to match HTML - elements.autoPlayResponses = document.getElementById('autoPlayResponses'); - elements.showVisualizer = document.getElementById('showVisualizer'); -} - -// Setup Socket.IO connection -function setupSocketConnection() { - state.socket = io(SERVER_URL); - - // Connection events - state.socket.on('connect', () => { - console.log('Connected to server'); - updateConnectionStatus(true); - }); - - state.socket.on('disconnect', () => { - console.log('Disconnected from server'); - updateConnectionStatus(false); - - // Stop streaming if active - if (state.isStreaming) { - stopStreaming(false); - } - }); - - state.socket.on('error', (data) => { - console.error('Socket error:', data.message); - addSystemMessage(`Error: ${data.message}`); - }); - - // Register message handlers - state.socket.on('audio_response', handleAudioResponse); - state.socket.on('transcription', handleTranscription); - 
state.socket.on('context_updated', handleContextUpdate); - state.socket.on('streaming_status', handleStreamingStatus); - - // New event handlers for incremental audio streaming - state.socket.on('audio_response_start', handleAudioResponseStart); - state.socket.on('audio_response_chunk', handleAudioResponseChunk); - state.socket.on('audio_response_complete', handleAudioResponseComplete); - state.socket.on('processing_status', handleProcessingStatus); -} - -// Setup event listeners -function setupEventListeners() { - // Stream button - elements.streamButton.addEventListener('click', toggleStreaming); - - // Clear button - elements.clearButton.addEventListener('click', clearConversation); - - // Threshold slider - elements.thresholdSlider.addEventListener('input', updateThreshold); - - // Speaker selection - elements.speakerSelection.addEventListener('change', () => { - state.currentSpeaker = parseInt(elements.speakerSelection.value, 10); - }); - - // Visualizer toggle - elements.showVisualizer.addEventListener('change', toggleVisualizerVisibility); -} - -// Setup audio visualizer -function setupVisualizer() { - if (!elements.visualizerCanvas) return; - - canvasContext = elements.visualizerCanvas.getContext('2d'); - - // Set canvas dimensions - elements.visualizerCanvas.width = elements.visualizerCanvas.offsetWidth; - elements.visualizerCanvas.height = elements.visualizerCanvas.offsetHeight; - - // Initialize the visualizer - drawVisualizer(); -} - -// Update connection status UI -function updateConnectionStatus(isConnected) { - elements.statusDot.classList.toggle('active', isConnected); - elements.statusText.textContent = isConnected ? 'Connected' : 'Disconnected'; -} - -// Toggle streaming state -function toggleStreaming() { - if (state.isStreaming) { - stopStreaming(true); - } else { - startStreaming(); - } -} - -// Start streaming audio to the server -function startStreaming() { - if (state.isStreaming) return; - - // Request microphone access - navigator.mediaDevices.getUserMedia({ audio: true, video: false }) - .then(stream => { - // Show processing state while setting up - elements.streamButton.innerHTML = ' Initializing...'; - - // Create audio context - state.audioContext = new (window.AudioContext || window.webkitAudioContext)(); - - // Create microphone source - state.microphone = state.audioContext.createMediaStreamSource(stream); - - // Create analyser for visualizer - state.analyser = state.audioContext.createAnalyser(); - state.analyser.fftSize = 256; - visualizerBufferLength = state.analyser.frequencyBinCount; - visualizerDataArray = new Uint8Array(visualizerBufferLength); - - // Connect microphone to analyser - state.microphone.connect(state.analyser); - - // Create script processor for audio processing - const bufferSize = 4096; - state.streamProcessor = state.audioContext.createScriptProcessor(bufferSize, 1, 1); - - // Set up audio processing callback - state.streamProcessor.onaudioprocess = handleAudioProcess; - - // Connect the processors - state.analyser.connect(state.streamProcessor); - state.streamProcessor.connect(state.audioContext.destination); - - // Update UI - state.isStreaming = true; - elements.streamButton.innerHTML = ' Listening...'; - elements.streamButton.classList.add('recording'); - - // Initialize energy window - state.energyWindow = []; - - // Start volume meter updates - state.volumeUpdateInterval = setInterval(updateVolumeMeter, 100); - - // Start visualizer if enabled - if (elements.showVisualizer.checked && !state.visualizerAnimationFrame) { - 
drawVisualizer(); - } - - // Show starting message - addSystemMessage('Listening... Speak clearly into your microphone.'); - - // Notify the server that we're starting - state.socket.emit('stream_audio', { - audio: '', - speaker: state.currentSpeaker - }); - }) - .catch(err => { - console.error('Error accessing microphone:', err); - addSystemMessage(`Error: ${err.message}. Please make sure your microphone is connected and you've granted permission.`); - elements.streamButton.innerHTML = ' Start Conversation'; - }); -} - -// Stop streaming audio -function stopStreaming(notifyServer = true) { - if (!state.isStreaming) return; - - // Update UI first - elements.streamButton.innerHTML = ' Start Conversation'; - elements.streamButton.classList.remove('recording'); - elements.streamButton.classList.remove('processing'); - - // Stop volume meter updates - if (state.volumeUpdateInterval) { - clearInterval(state.volumeUpdateInterval); - state.volumeUpdateInterval = null; - } - - // Stop all audio processing - if (state.streamProcessor) { - state.streamProcessor.disconnect(); - state.streamProcessor = null; - } - - if (state.analyser) { - state.analyser.disconnect(); - } - - if (state.microphone) { - state.microphone.disconnect(); - } - - // Close audio context - if (state.audioContext && state.audioContext.state !== 'closed') { - state.audioContext.close().catch(err => console.warn('Error closing audio context:', err)); - } - - // Cleanup animation frames - if (state.visualizerAnimationFrame) { - cancelAnimationFrame(state.visualizerAnimationFrame); - state.visualizerAnimationFrame = null; - } - - // Reset state - state.isStreaming = false; - state.isSpeaking = false; - - // Notify the server - if (notifyServer && state.socket && state.socket.connected) { - state.socket.emit('stop_streaming', { - speaker: state.currentSpeaker - }); - } - - // Show message - addSystemMessage('Conversation paused. 
Click "Start Conversation" to resume.'); -} - -// Handle audio processing -function handleAudioProcess(event) { - const inputData = event.inputBuffer.getChannelData(0); - - // Calculate audio energy (volume level) - const energy = calculateAudioEnergy(inputData); - - // Update energy window for averaging - updateEnergyWindow(energy); - - // Calculate average energy - const avgEnergy = calculateAverageEnergy(); - - // Determine if audio is silent - const isSilent = avgEnergy < state.silenceThreshold; - - // Debug logging only if significant changes in audio patterns - if (Math.random() < 0.05) { // Log only 5% of frames to avoid console spam - console.log(`Audio: len=${inputData.length}, energy=${energy.toFixed(4)}, avg=${avgEnergy.toFixed(4)}, silent=${isSilent}`); - } - - // Handle speech state based on silence - handleSpeechState(isSilent); - - // Only send audio chunk if we detect speech - if (!isSilent) { - // Create a resampled version at 24kHz for the server - // Most WebRTC audio is 48kHz, but we want 24kHz for the model - const resampledData = downsampleBuffer(inputData, state.audioContext.sampleRate, 24000); - - // Send the audio chunk to the server - sendAudioChunk(resampledData, state.currentSpeaker); - } -} - -// Cleanup audio resources when done -function cleanupAudioResources() { - // Stop all audio processing - if (state.streamProcessor) { - state.streamProcessor.disconnect(); - state.streamProcessor = null; - } - - if (state.analyser) { - state.analyser.disconnect(); - state.analyser = null; - } - - if (state.microphone) { - state.microphone.disconnect(); - state.microphone = null; - } - - // Close audio context - if (state.audioContext && state.audioContext.state !== 'closed') { - state.audioContext.close().catch(err => console.warn('Error closing audio context:', err)); - } - - // Cancel all timers and animation frames - if (state.volumeUpdateInterval) { - clearInterval(state.volumeUpdateInterval); - state.volumeUpdateInterval = null; - } - - if (state.visualizerAnimationFrame) { - cancelAnimationFrame(state.visualizerAnimationFrame); - state.visualizerAnimationFrame = null; - } - - if (state.silenceTimer) { - clearTimeout(state.silenceTimer); - state.silenceTimer = null; - } -} - -// Clear conversation history -function clearConversation() { - if (elements.conversation) { - elements.conversation.innerHTML = ''; - addSystemMessage('Conversation cleared.'); - - // Notify server to clear context - if (state.socket && state.socket.connected) { - state.socket.emit('clear_context'); - } - } -} - -// Calculate audio energy (volume) -function calculateAudioEnergy(buffer) { - let sum = 0; - for (let i = 0; i < buffer.length; i++) { - sum += buffer[i] * buffer[i]; - } - return Math.sqrt(sum / buffer.length); -} - -// Update energy window for averaging -function updateEnergyWindow(energy) { - state.energyWindow.push(energy); - if (state.energyWindow.length > ENERGY_WINDOW_SIZE) { - state.energyWindow.shift(); - } -} - -// Calculate average energy from window -function calculateAverageEnergy() { - if (state.energyWindow.length === 0) return 0; - - const sum = state.energyWindow.reduce((a, b) => a + b, 0); - return sum / state.energyWindow.length; -} - -// Update the threshold from the slider -function updateThreshold() { - state.silenceThreshold = parseFloat(elements.thresholdSlider.value); - elements.thresholdValue.textContent = state.silenceThreshold.toFixed(3); -} - -// Update the volume meter display -function updateVolumeMeter() { - if (!state.isStreaming || 
!state.energyWindow.length) return; - - const avgEnergy = calculateAverageEnergy(); - - // Scale energy to percentage (0-100) - // Typically, energy values will be very small (e.g., 0.001 to 0.1) - // So we multiply by a factor to make it more visible - const scaleFactor = 1000; - const percentage = Math.min(100, Math.max(0, avgEnergy * scaleFactor)); - - // Update volume meter width - elements.volumeLevel.style.width = `${percentage}%`; - - // Change color based on level - if (percentage > 70) { - elements.volumeLevel.style.backgroundColor = '#ff5252'; - } else if (percentage > 30) { - elements.volumeLevel.style.backgroundColor = '#4CAF50'; - } else { - elements.volumeLevel.style.backgroundColor = '#4c84ff'; - } -} - -// Handle speech/silence state transitions -function handleSpeechState(isSilent) { - if (state.isSpeaking && isSilent) { - // Transition from speaking to silence - if (!state.silenceTimer) { - state.silenceTimer = setTimeout(() => { - // Only consider it a real silence after a certain duration - // This prevents detecting brief pauses as the end of speech - state.isSpeaking = false; - state.silenceTimer = null; - }, CLIENT_SILENCE_DURATION_MS); - } - } else if (state.silenceTimer && !isSilent) { - // User started speaking again, cancel the silence timer - clearTimeout(state.silenceTimer); - state.silenceTimer = null; - } - - // Update speaking state for non-silent audio - if (!isSilent) { - state.isSpeaking = true; - } -} - -// Send audio chunk to server -function sendAudioChunk(audioData, speaker) { - if (!state.socket || !state.socket.connected) { - console.warn('Socket not connected'); - return; - } - - console.log(`Preparing audio chunk: length=${audioData.length}, speaker=${speaker}`); - - // Check for NaN or invalid values - let hasInvalidValues = false; - for (let i = 0; i < audioData.length; i++) { - if (isNaN(audioData[i]) || !isFinite(audioData[i])) { - hasInvalidValues = true; - console.warn(`Invalid audio value at index ${i}: ${audioData[i]}`); - break; - } - } - - if (hasInvalidValues) { - console.warn('Audio data contains invalid values. 
Creating silent audio.'); - audioData = new Float32Array(audioData.length).fill(0); - } - - try { - // Create WAV blob - const wavData = createWavBlob(audioData, 24000); - console.log(`WAV blob created: ${wavData.size} bytes`); - - const reader = new FileReader(); - - reader.onloadend = function() { - try { - // Get base64 data - const base64data = reader.result; - console.log(`Base64 data created: ${base64data.length} bytes`); - - // Send to server - state.socket.emit('stream_audio', { - audio: base64data, - speaker: speaker - }); - console.log('Audio chunk sent to server'); - } catch (err) { - console.error('Error preparing audio data:', err); - } - }; - - reader.onerror = function() { - console.error('Error reading audio data as base64'); - }; - - reader.readAsDataURL(wavData); - } catch (err) { - console.error('Error creating WAV data:', err); - } -} - -// Create WAV blob from audio data with improved error handling -function createWavBlob(audioData, sampleRate) { - // Validate input - if (!audioData || audioData.length === 0) { - console.warn('Empty audio data provided to createWavBlob'); - audioData = new Float32Array(1024).fill(0); // Create 1024 samples of silence - } - - // Function to convert Float32Array to Int16Array for WAV format - function floatTo16BitPCM(output, offset, input) { - for (let i = 0; i < input.length; i++, offset += 2) { - // Ensure values are in -1 to 1 range - const s = Math.max(-1, Math.min(1, input[i])); - // Convert to 16-bit PCM - output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true); - } - } - - // Create WAV header - function writeString(view, offset, string) { - for (let i = 0; i < string.length; i++) { - view.setUint8(offset + i, string.charCodeAt(i)); - } - } - - try { - // Create WAV file with header - careful with buffer sizes - const buffer = new ArrayBuffer(44 + audioData.length * 2); - const view = new DataView(buffer); - - // RIFF identifier - writeString(view, 0, 'RIFF'); - - // File length (will be filled later) - view.setUint32(4, 36 + audioData.length * 2, true); - - // WAVE identifier - writeString(view, 8, 'WAVE'); - - // fmt chunk identifier - writeString(view, 12, 'fmt '); - - // fmt chunk length - view.setUint32(16, 16, true); - - // Sample format (1 is PCM) - view.setUint16(20, 1, true); - - // Mono channel - view.setUint16(22, 1, true); - - // Sample rate - view.setUint32(24, sampleRate, true); - - // Byte rate (sample rate * block align) - view.setUint32(28, sampleRate * 2, true); - - // Block align (channels * bytes per sample) - view.setUint16(32, 2, true); - - // Bits per sample - view.setUint16(34, 16, true); - - // data chunk identifier - writeString(view, 36, 'data'); - - // data chunk length - view.setUint32(40, audioData.length * 2, true); - - // Write the PCM samples - floatTo16BitPCM(view, 44, audioData); - - // Create and return blob - return new Blob([view], { type: 'audio/wav' }); - } catch (err) { - console.error('Error in createWavBlob:', err); - - // Create a minimal valid WAV file with silence as fallback - const fallbackSamples = new Float32Array(1024).fill(0); - const fallbackBuffer = new ArrayBuffer(44 + fallbackSamples.length * 2); - const fallbackView = new DataView(fallbackBuffer); - - writeString(fallbackView, 0, 'RIFF'); - fallbackView.setUint32(4, 36 + fallbackSamples.length * 2, true); - writeString(fallbackView, 8, 'WAVE'); - writeString(fallbackView, 12, 'fmt '); - fallbackView.setUint32(16, 16, true); - fallbackView.setUint16(20, 1, true); - fallbackView.setUint16(22, 1, true); - 
fallbackView.setUint32(24, sampleRate, true); - fallbackView.setUint32(28, sampleRate * 2, true); - fallbackView.setUint16(32, 2, true); - fallbackView.setUint16(34, 16, true); - writeString(fallbackView, 36, 'data'); - fallbackView.setUint32(40, fallbackSamples.length * 2, true); - floatTo16BitPCM(fallbackView, 44, fallbackSamples); - - return new Blob([fallbackView], { type: 'audio/wav' }); - } -} - -// Draw audio visualizer -function drawVisualizer() { - if (!canvasContext) { - return; - } - - state.visualizerAnimationFrame = requestAnimationFrame(drawVisualizer); - - // Skip drawing if visualizer is hidden - if (!elements.showVisualizer.checked) { - if (elements.visualizerCanvas.style.opacity !== '0') { - elements.visualizerCanvas.style.opacity = '0'; - } - return; - } else if (elements.visualizerCanvas.style.opacity !== '1') { - elements.visualizerCanvas.style.opacity = '1'; - } - - // Get frequency data if available - if (state.isStreaming && state.analyser) { - try { - state.analyser.getByteFrequencyData(visualizerDataArray); - } catch (e) { - console.warn('Error getting frequency data:', e); - } - } else { - // Fade out when not streaming - for (let i = 0; i < visualizerDataArray.length; i++) { - visualizerDataArray[i] = Math.max(0, visualizerDataArray[i] - 5); - } - } - - // Clear canvas - canvasContext.fillStyle = 'rgb(0, 0, 0)'; - canvasContext.fillRect(0, 0, elements.visualizerCanvas.width, elements.visualizerCanvas.height); - - // Draw gradient bars - const width = elements.visualizerCanvas.width; - const height = elements.visualizerCanvas.height; - const barCount = Math.min(visualizerBufferLength, 64); - const barWidth = width / barCount - 1; - - for (let i = 0; i < barCount; i++) { - const index = Math.floor(i * visualizerBufferLength / barCount); - const value = visualizerDataArray[index]; - - // Use logarithmic scale for better audio visualization - // This makes low values more visible while still maintaining full range - const logFactor = 20; - const scaledValue = Math.log(1 + (value / 255) * logFactor) / Math.log(1 + logFactor); - const barHeight = scaledValue * height; - - // Position bars - const x = i * (barWidth + 1); - const y = height - barHeight; - - // Create color gradient based on frequency and amplitude - const hue = i / barCount * 360; // Full color spectrum - const saturation = 80 + (value / 255 * 20); // Higher values more saturated - const lightness = 40 + (value / 255 * 20); // Dynamic brightness based on amplitude - - // Draw main bar - canvasContext.fillStyle = `hsl(${hue}, ${saturation}%, ${lightness}%)`; - canvasContext.fillRect(x, y, barWidth, barHeight); - - // Add reflection effect - if (barHeight > 5) { - const gradient = canvasContext.createLinearGradient( - x, y, - x, y + barHeight * 0.5 - ); - gradient.addColorStop(0, `hsla(${hue}, ${saturation}%, ${lightness + 20}%, 0.4)`); - gradient.addColorStop(1, `hsla(${hue}, ${saturation}%, ${lightness}%, 0)`); - canvasContext.fillStyle = gradient; - canvasContext.fillRect(x, y, barWidth, barHeight * 0.5); - - // Add highlight on top of the bar for better 3D effect - canvasContext.fillStyle = `hsla(${hue}, ${saturation - 20}%, ${lightness + 30}%, 0.7)`; - canvasContext.fillRect(x, y, barWidth, 2); - } - } - - // Show/hide the label - elements.visualizerLabel.style.opacity = (state.isStreaming) ? '0' : '0.7'; -} - -// Toggle visualizer visibility -function toggleVisualizerVisibility() { - const isVisible = elements.showVisualizer.checked; - elements.visualizerCanvas.style.opacity = isVisible ? 
'1' : '0'; - - if (isVisible && state.isStreaming && !state.visualizerAnimationFrame) { - drawVisualizer(); - } -} - -// Handle audio response from server -function handleAudioResponse(data) { - console.log('Received audio response'); - - // Create message container - const messageElement = document.createElement('div'); - messageElement.className = 'message ai'; - - // Add text content if available - if (data.text) { - const textElement = document.createElement('p'); - textElement.textContent = data.text; - messageElement.appendChild(textElement); - } - - // Create and configure audio element - const audioElement = document.createElement('audio'); - audioElement.controls = true; - audioElement.className = 'audio-player'; - - // Set audio source - const audioSource = document.createElement('source'); - audioSource.src = data.audio; - audioSource.type = 'audio/wav'; - - // Add fallback text - audioElement.textContent = 'Your browser does not support the audio element.'; - - // Assemble audio element - audioElement.appendChild(audioSource); - messageElement.appendChild(audioElement); - - // Add timestamp - const timeElement = document.createElement('span'); - timeElement.className = 'message-time'; - timeElement.textContent = new Date().toLocaleTimeString(); - messageElement.appendChild(timeElement); - - // Add to conversation - elements.conversation.appendChild(messageElement); - - // Auto-scroll to bottom - elements.conversation.scrollTop = elements.conversation.scrollHeight; - - // Auto-play if enabled - if (elements.autoPlayResponses.checked) { - audioElement.play() - .catch(err => { - console.warn('Auto-play failed:', err); - addSystemMessage('Auto-play failed. Please click play to hear the response.'); - }); - } - - // Re-enable stream button after processing is complete - if (state.isStreaming) { - elements.streamButton.innerHTML = ' Listening...'; - elements.streamButton.classList.add('recording'); - elements.streamButton.classList.remove('processing'); - } -} - -// Handle transcription response from server -function handleTranscription(data) { - console.log('Received transcription:', data.text); - - // Create message element - const messageElement = document.createElement('div'); - messageElement.className = 'message user'; - - // Add text content - const textElement = document.createElement('p'); - textElement.textContent = data.text; - messageElement.appendChild(textElement); - - // Add timestamp - const timeElement = document.createElement('span'); - timeElement.className = 'message-time'; - timeElement.textContent = new Date().toLocaleTimeString(); - messageElement.appendChild(timeElement); - - // Add to conversation - elements.conversation.appendChild(messageElement); - - // Auto-scroll to bottom - elements.conversation.scrollTop = elements.conversation.scrollHeight; -} - -// Handle context update from server -function handleContextUpdate(data) { - console.log('Context updated:', data.message); -} - -// Handle streaming status updates from server -function handleStreamingStatus(data) { - console.log('Streaming status:', data.status); - - if (data.status === 'stopped') { - // Reset UI if needed - if (state.isStreaming) { - stopStreaming(false); // Don't send to server since this came from server - } - } -} - -// Add a system message to the conversation -function addSystemMessage(message) { - const messageElement = document.createElement('div'); - messageElement.className = 'message system'; - messageElement.textContent = message; - 
elements.conversation.appendChild(messageElement); - - // Auto-scroll to bottom - elements.conversation.scrollTop = elements.conversation.scrollHeight; -} - -// Downsample audio buffer to target sample rate -function downsampleBuffer(buffer, originalSampleRate, targetSampleRate) { - if (originalSampleRate === targetSampleRate) { - return buffer; - } - - const ratio = originalSampleRate / targetSampleRate; - const newLength = Math.round(buffer.length / ratio); - const result = new Float32Array(newLength); - - for (let i = 0; i < newLength; i++) { - const pos = Math.round(i * ratio); - result[i] = buffer[pos]; - } - - return result; -} - -// Handle processing status updates -function handleProcessingStatus(data) { - console.log('Processing status update:', data); - - // Show processing status in UI - if (data.status === 'generating_audio') { - elements.streamButton.innerHTML = ' Processing...'; - elements.streamButton.classList.add('processing'); - elements.streamButton.classList.remove('recording'); - - // Show message to user - addSystemMessage(data.message || 'Processing your request...'); - } -} - -// Handle the start of an audio streaming response -function handleAudioResponseStart(data) { - console.log('Audio response starting:', data); - - // Reset streaming audio state - streamingAudio.chunks = []; - streamingAudio.totalChunks = data.total_chunks; - streamingAudio.receivedChunks = 0; - streamingAudio.text = data.text; - streamingAudio.complete = false; - - // Create message container now, so we can update it as chunks arrive - const messageElement = document.createElement('div'); - messageElement.className = 'message ai processing'; - - // Add text content if available - if (data.text) { - const textElement = document.createElement('p'); - textElement.textContent = data.text; - messageElement.appendChild(textElement); - } - - // Create audio element (will be populated as chunks arrive) - const audioElement = document.createElement('audio'); - audioElement.controls = true; - audioElement.className = 'audio-player'; - audioElement.textContent = 'Audio is being generated...'; - messageElement.appendChild(audioElement); - - // Add timestamp - const timeElement = document.createElement('span'); - timeElement.className = 'message-time'; - timeElement.textContent = new Date().toLocaleTimeString(); - messageElement.appendChild(timeElement); - - // Add loading indicator - const loadingElement = document.createElement('div'); - loadingElement.className = 'loading-indicator'; - loadingElement.innerHTML = '
Generating audio response...'; - messageElement.appendChild(loadingElement); - - // Add to conversation - elements.conversation.appendChild(messageElement); - - // Auto-scroll to bottom - elements.conversation.scrollTop = elements.conversation.scrollHeight; - - // Store elements for later updates - streamingAudio.messageElement = messageElement; - streamingAudio.audioElement = audioElement; -} - -// Handle an incoming audio chunk -function handleAudioResponseChunk(data) { - console.log(`Received audio chunk ${data.chunk_index + 1}/${data.total_chunks}`); - - // Store the chunk - streamingAudio.chunks[data.chunk_index] = data.audio; - streamingAudio.receivedChunks++; - - // Update progress in the UI - if (streamingAudio.messageElement) { - const loadingElement = streamingAudio.messageElement.querySelector('.loading-indicator span'); - if (loadingElement) { - loadingElement.textContent = `Generating audio response... ${Math.round((streamingAudio.receivedChunks / data.total_chunks) * 100)}%`; - } - } - - // If this is the first chunk, start playing it immediately for faster response - if (data.chunk_index === 0 && streamingAudio.audioElement && elements.autoPlayResponses && elements.autoPlayResponses.checked) { - try { - streamingAudio.audioElement.src = data.audio; - streamingAudio.audioElement.play().catch(err => console.warn('Auto-play failed:', err)); - } catch (e) { - console.error('Error playing first chunk:', e); - } - } - - // If this is the last chunk or we've received all chunks, finalize the audio - if (data.is_last || streamingAudio.receivedChunks >= data.total_chunks) { - finalizeStreamingAudio(); - } -} - -// Handle completion of audio streaming -function handleAudioResponseComplete(data) { - console.log('Audio response complete:', data); - streamingAudio.complete = true; - - // Make sure we finalize the audio even if some chunks were missed - finalizeStreamingAudio(); - - // Update UI to normal state - if (state.isStreaming) { - elements.streamButton.innerHTML = ' Listening...'; - elements.streamButton.classList.add('recording'); - elements.streamButton.classList.remove('processing'); - } -} - -// Finalize streaming audio by combining chunks and updating the UI -function finalizeStreamingAudio() { - if (!streamingAudio.messageElement || streamingAudio.chunks.length === 0) { - return; - } - - try { - // For more sophisticated audio streaming, you would need to properly concatenate - // the WAV files, but for now we'll use the last chunk as the complete audio - // since it should contain the entire response due to how the server is implementing it - const lastChunkIndex = streamingAudio.chunks.length - 1; - const audioData = streamingAudio.chunks[lastChunkIndex] || streamingAudio.chunks[0]; - - // Update the audio element with the complete audio - if (streamingAudio.audioElement) { - streamingAudio.audioElement.src = audioData; - - // Auto-play if enabled and not already playing - if (elements.autoPlayResponses && elements.autoPlayResponses.checked && - streamingAudio.audioElement.paused) { - streamingAudio.audioElement.play() - .catch(err => { - console.warn('Auto-play failed:', err); - addSystemMessage('Auto-play failed. 
Please click play to hear the response.'); - }); - } - } - - // Remove loading indicator and processing class - if (streamingAudio.messageElement) { - const loadingElement = streamingAudio.messageElement.querySelector('.loading-indicator'); - if (loadingElement) { - streamingAudio.messageElement.removeChild(loadingElement); - } - streamingAudio.messageElement.classList.remove('processing'); - } - - console.log('Audio response finalized and ready for playback'); - } catch (e) { - console.error('Error finalizing streaming audio:', e); - } - - // Reset streaming audio state - streamingAudio.chunks = []; - streamingAudio.totalChunks = 0; - streamingAudio.receivedChunks = 0; - streamingAudio.messageElement = null; - streamingAudio.audioElement = null; -} - -// Add CSS styles for new UI elements -document.addEventListener('DOMContentLoaded', function() { - // Add styles for processing state - const style = document.createElement('style'); - style.textContent = ` - .message.processing { - opacity: 0.8; - } - - .loading-indicator { - display: flex; - align-items: center; - margin-top: 8px; - font-size: 0.9em; - color: #666; - } - - .loading-spinner { - width: 16px; - height: 16px; - border: 2px solid #ddd; - border-top: 2px solid var(--primary-color); - border-radius: 50%; - margin-right: 8px; - animation: spin 1s linear infinite; - } - - @keyframes spin { - 0% { transform: rotate(0deg); } - 100% { transform: rotate(360deg); } - } - `; - document.head.appendChild(style); -}); - -// Initialize the application when DOM is fully loaded -document.addEventListener('DOMContentLoaded', initializeApp); - diff --git a/React/src/app/auth/session/route.ts b/React/src/app/auth/session/route.ts new file mode 100644 index 0000000..9299d4a --- /dev/null +++ b/React/src/app/auth/session/route.ts @@ -0,0 +1,12 @@ +import { NextResponse } from "next/server"; +import { auth0 } from "../../../lib/auth0"; + +export async function GET() { + try { + const session = await auth0.getSession(); + return NextResponse.json({ session }); + } catch (error) { + console.error("Error getting session:", error); + return NextResponse.json({ session: null }, { status: 500 }); + } +} diff --git a/React/src/app/call/page.tsx b/React/src/app/call/page.tsx index 17c0c65..5ee8795 100644 --- a/React/src/app/call/page.tsx +++ b/React/src/app/call/page.tsx @@ -78,7 +78,7 @@ function CallPage() { "Content-Type": "application/json", }, body: JSON.stringify({ - message: `yo i need help`, + message: `John Smith needs help.`, }), }); diff --git a/React/src/app/page.tsx b/React/src/app/page.tsx index 29297d0..21e0862 100644 --- a/React/src/app/page.tsx +++ b/React/src/app/page.tsx @@ -1,40 +1,94 @@ "use client"; -import { useState } from "react"; -import { auth0 } from "../lib/auth0"; -import { NextApiRequest, NextApiResponse } from "next"; +import { useState, useEffect } from "react"; +import { useRouter } from "next/navigation"; - - - -export default async function Home() { - - +export default function Home() { const [contacts, setContacts] = useState([]); const [codeword, setCodeword] = useState(""); + const [session, setSession] = useState(null); + const [loading, setLoading] = useState(true); + const router = useRouter(); - const session = await auth0.getSession(); + useEffect(() => { + // Fetch session data from an API route + fetch("/auth/session") + .then((response) => response.json()) + .then((data) => { + setSession(data.session); + setLoading(false); + }) + .catch((error) => { + console.error("Failed to fetch session:", error); + 
setLoading(false);
+      });
+  }, []);
-  console.log("Session:", session?.user);
+  function saveToDB() {
+    alert("Saving contacts...");
+    const contactInputs = document.querySelectorAll(
+      ".text-input"
+    ) as NodeListOf<HTMLInputElement>;
+    const contactValues = Array.from(contactInputs).map((input) => input.value);
+    fetch("/api/databaseStorage", {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        email: session?.user?.email || "",
+        codeword: codeword,
+        contacts: contactValues,
+      }),
+    })
+      .then((response) => {
+        if (response.ok) {
+          alert("Contacts saved successfully!");
+        } else {
+          alert("Error saving contacts.");
+        }
+      })
+      .catch((error) => {
+        console.error("Error:", error);
+        alert("Error saving contacts.");
+      });
+  }
+
+  if (loading) {
+    return
Loading...
; + } // If no session, show sign-up and login buttons - if (!session) { - + if (!session) { return (
- + - +
-

Fauxcall

-

Set emergency contacts

-

If you stop speaking or say the codeword, these contacts will be notified

+

+ Fauxcall +

+

+ Set emergency contacts +

+

+ If you stop speaking or say the codeword, these contacts will be + notified +

{/* form for setting codeword */} -
e.preventDefault()}> + e.preventDefault()} + > + className="bg-blue-500 text-white font-semibold font-lg rounded-md p-2" + type="submit" + > + Set codeword +
{/* form for adding contacts */} -
e.preventDefault()}> + e.preventDefault()} + > - +
); @@ -80,25 +145,42 @@ export default async function Home() {

Welcome, {session.user.name}!

- -

Fauxcall

-

Set emergency contacts

-

If you stop speaking or say the codeword, these contacts will be notified

- {/* form for setting codeword */} -
e.preventDefault()}> - setCodeword(e.target.value)} - placeholder="Codeword" - className="border border-gray-300 rounded-md p-2" - /> - -
- {/* form for adding contacts */} -
e.preventDefault()}> + type="submit" + > + Set codeword + +
+ {/* form for adding contacts */} +
e.preventDefault()} + > - - - -
- + + + + +

diff --git a/React/src/pages/api/databaseStorage.ts b/React/src/pages/api/databaseStorage.ts
new file mode 100644
index 0000000..aa01d37
--- /dev/null
+++ b/React/src/pages/api/databaseStorage.ts
@@ -0,0 +1,56 @@
+import { NextApiRequest, NextApiResponse } from "next";
+import mongoose from "mongoose";
+
+const uri = process.env.MONGODB_URI || "mongodb://localhost:27017/mydatabase";
+const clientOptions = { serverApi: { version: "1" as const, strict: true, deprecationErrors: true } };
+
+// Create a reusable connection function
+async function connectToDatabase() {
+  if (mongoose.connection.readyState === 0) {
+    // Only connect if not already connected
+    await mongoose.connect(uri, clientOptions);
+    console.log("Connected to MongoDB!");
+    mongoose.model("User", new mongoose.Schema({
+      email: { type: String, required: true, unique: true },
+      codeword: { type: String, required: true },
+      contacts: [{ type: String }],
+    }));
+  }
+}
+
+export default async function handler(req: NextApiRequest, res: NextApiResponse) {
+  try {
+    // Ensure the database is connected
+    await connectToDatabase();
+
+
+    if (req.method === 'POST') {
+      const { email, codeword, contacts } = req.body;
+
+      // Upsert the user's codeword and contacts:
+      // query the database to see if a document with this email already exists
+      const existingUser = await mongoose.model('User').findOne({ email });
+      if (existingUser) {
+        // If user exists, update their codeword and contacts
+        await mongoose.model('User').updateOne({ email }, { codeword, contacts });
+      } else {
+        // If user does not exist, create a new user
+        const User = mongoose.model('User');
+        const newUser = new User({ email, codeword, contacts });
+        await newUser.save();
+      }
+
+
+      console.log("Codeword:", codeword);
+      console.log("Contacts:", contacts);
+
+      res.status(200).json({ success: true, message: "Data saved successfully!" });
+    } else {
+      res.setHeader('Allow', ['POST']);
+      res.status(405).end(`Method ${req.method} Not Allowed`);
+    }
+  } catch (error) {
+    console.error("Error:", error);
+    res.status(500).json({ success: false, error: "Internal Server Error" });
+  }
+}
\ No newline at end of file
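A quick, hedged way to exercise the two new routes end to end, sketched with Python's requests library. It assumes the Next.js dev server is running locally on port 3000; the base URL, email, codeword, and contact numbers are placeholders to adjust for your environment.

import requests

BASE_URL = "http://localhost:3000"  # assumed Next.js dev server address

# GET /auth/session returns {"session": ...}; the session is null when no user is logged in.
print(requests.get(f"{BASE_URL}/auth/session").json())

# POST /api/databaseStorage upserts the codeword and contact list keyed by email.
payload = {
    "email": "user@example.com",  # placeholder values
    "codeword": "pineapple",
    "contacts": ["+15551234567", "+15557654321"],
}
resp = requests.post(f"{BASE_URL}/api/databaseStorage", json=payload)
print(resp.status_code, resp.json())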