Commit by idler-wheel, 2025-03-30 07:38:28 -04:00
14 changed files with 1203 additions and 2700 deletions

Backend/.gitignore vendored

@@ -1,46 +0,0 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# Virtual Environment
.env
.venv
env/
venv/
ENV/
# IDE
.idea/
.vscode/
*.swp
*.swo
# Project specific
.python-version
*.wav
output_*/
basic_audio.wav
full_conversation.wav
context_audio.wav
# Model files
*.pt
*.ckpt


@@ -1,154 +0,0 @@
# CSM
**2025/03/13** - We are releasing the 1B CSM variant. The checkpoint is [hosted on Hugging Face](https://huggingface.co/sesame/csm_1b).
---
CSM (Conversational Speech Model) is a speech generation model from [Sesame](https://www.sesame.com) that generates RVQ audio codes from text and audio inputs. The model architecture employs a [Llama](https://www.llama.com/) backbone and a smaller audio decoder that produces [Mimi](https://huggingface.co/kyutai/mimi) audio codes.
A fine-tuned variant of CSM powers the [interactive voice demo](https://www.sesame.com/voicedemo) shown in our [blog post](https://www.sesame.com/research/crossing_the_uncanny_valley_of_voice).
A hosted [Hugging Face space](https://huggingface.co/spaces/sesame/csm-1b) is also available for testing audio generation.
## Requirements
* A CUDA-compatible GPU
* The code has been tested on CUDA 12.4 and 12.6, but it may also work on other versions
* Similarly, Python 3.10 is recommended, but newer versions may be fine
* For some audio operations, `ffmpeg` may be required
* Access to the following Hugging Face models:
* [Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)
* [CSM-1B](https://huggingface.co/sesame/csm-1b)
### Setup
```bash
git clone git@github.com:SesameAILabs/csm.git
cd csm
python3.10 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
# Disable lazy compilation in Mimi
export NO_TORCH_COMPILE=1
# You will need access to CSM-1B and Llama-3.2-1B
huggingface-cli login
```
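If you prefer to authenticate and pre-fetch the checkpoint programmatically instead of via `huggingface-cli`, a minimal sketch using `huggingface_hub` is shown below; the `ckpt.pt` filename is an assumption about the `sesame/csm-1b` repo layout.

```python
# Sketch: log in and pre-download the CSM checkpoint with huggingface_hub.
# The "ckpt.pt" filename is an assumption about the sesame/csm-1b repo layout.
from huggingface_hub import login, hf_hub_download

login()  # prompts for a token with access to sesame/csm-1b and meta-llama/Llama-3.2-1B
checkpoint_path = hf_hub_download(repo_id="sesame/csm-1b", filename="ckpt.pt")
print(f"Checkpoint cached at {checkpoint_path}")
```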
### Windows Setup
The `triton` package cannot be installed on Windows. Use `pip install triton-windows` instead.
## Quickstart
This script generates a conversation between two characters, using a prompt for each character.
```bash
python run_csm.py
```
## Usage
If you want to write your own applications with CSM, the following examples show basic usage.
#### Generate a sentence
This will use a random speaker identity, as no prompt or context is provided.
```python
from generator import load_csm_1b
import torchaudio
import torch
if torch.backends.mps.is_available():
device = "mps"
elif torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
generator = load_csm_1b(device=device)
audio = generator.generate(
text="Hello from Sesame.",
speaker=0,
context=[],
max_audio_length_ms=10_000,
)
torchaudio.save("audio.wav", audio.unsqueeze(0).cpu(), generator.sample_rate)
```
#### Generate with context
CSM sounds best when provided with context. You can prompt or provide context to the model using a `Segment` for each speaker's utterance.
NOTE: The following example is instructional; the referenced audio files do not exist. It shows how to provide context to CSM.
```python
from generator import Segment
speakers = [0, 1, 0, 0]
transcripts = [
"Hey how are you doing.",
"Pretty good, pretty good.",
"I'm great.",
"So happy to be speaking to you.",
]
audio_paths = [
"utterance_0.wav",
"utterance_1.wav",
"utterance_2.wav",
"utterance_3.wav",
]
def load_audio(audio_path):
audio_tensor, sample_rate = torchaudio.load(audio_path)
audio_tensor = torchaudio.functional.resample(
audio_tensor.squeeze(0), orig_freq=sample_rate, new_freq=generator.sample_rate
)
return audio_tensor
segments = [
Segment(text=transcript, speaker=speaker, audio=load_audio(audio_path))
for transcript, speaker, audio_path in zip(transcripts, speakers, audio_paths)
]
audio = generator.generate(
text="Me too, this is some cool stuff huh?",
speaker=1,
context=segments,
max_audio_length_ms=10_000,
)
torchaudio.save("audio.wav", audio.unsqueeze(0).cpu(), generator.sample_rate)
```
## FAQ
**Does this model come with any voices?**
The model open-sourced here is a base generation model. It is capable of producing a variety of voices, but it has not been fine-tuned on any specific voice.
**Can I converse with the model?**
CSM is trained to be an audio generation model and not a general-purpose multimodal LLM. It cannot generate text. We suggest using a separate LLM for text generation.
**Does it support other languages?**
The model has some capacity for non-English languages due to data contamination in the training data, but it likely won't do well.
## Misuse and abuse ⚠️
This project provides a high-quality speech generation model for research and educational purposes. While we encourage responsible and ethical use, we **explicitly prohibit** the following:
- **Impersonation or Fraud**: Do not use this model to generate speech that mimics real individuals without their explicit consent.
- **Misinformation or Deception**: Do not use this model to create deceptive or misleading content, such as fake news or fraudulent calls.
- **Illegal or Harmful Activities**: Do not use this model for any illegal, harmful, or malicious purposes.
By using this model, you agree to comply with all applicable laws and ethical guidelines. We are **not responsible** for any misuse, and we strongly condemn unethical applications of this technology.
---
## Authors
Johan Schalkwyk, Ankit Kumar, Dan Lyth, Sefik Emre Eskimez, Zack Hodari, Cinjon Resnick, Ramon Sanabria, Raven Jiang, and the Sesame team.

Backend/app.py Normal file

@@ -0,0 +1,229 @@
import os
import io
import base64
import time
import torch
import torchaudio
import numpy as np
from flask import Flask, render_template, request
from flask_socketio import SocketIO, emit
from transformers import AutoModelForCausalLM, AutoTokenizer
import speech_recognition as sr
from generator import load_csm_1b, Segment
from collections import deque
app = Flask(__name__)
app.config['SECRET_KEY'] = 'your-secret-key'
socketio = SocketIO(app, cors_allowed_origins="*")
# Select the best available device
if torch.cuda.is_available():
device = "cuda"
elif torch.backends.mps.is_available():
device = "mps"
else:
device = "cpu"
print(f"Using device: {device}")
# Initialize CSM model for audio generation
print("Loading CSM model...")
csm_generator = load_csm_1b(device=device)
# Initialize Llama 3.2 model for response generation
print("Loading Llama 3.2 model...")
llm_model_id = "meta-llama/Llama-3.2-1B" # Choose appropriate size based on resources
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
llm_model = AutoModelForCausalLM.from_pretrained(
llm_model_id,
torch_dtype=torch.bfloat16,
device_map=device
)
# Initialize speech recognition
recognizer = sr.Recognizer()
# Store conversation context
conversation_context = {} # session_id -> context
@app.route('/')
def index():
return render_template('index.html')
@socketio.on('connect')
def handle_connect():
print(f"Client connected: {request.sid}")
conversation_context[request.sid] = {
'segments': [],
'speakers': [0, 1], # 0 = user, 1 = bot
'audio_buffer': deque(maxlen=10), # Store recent audio chunks
'is_speaking': False,
'silence_start': None
}
emit('ready', {'message': 'Connection established'})
@socketio.on('disconnect')
def handle_disconnect():
print(f"Client disconnected: {request.sid}")
if request.sid in conversation_context:
del conversation_context[request.sid]
@socketio.on('start_speaking')
def handle_start_speaking():
if request.sid in conversation_context:
conversation_context[request.sid]['is_speaking'] = True
conversation_context[request.sid]['audio_buffer'].clear()
print(f"User {request.sid} started speaking")
@socketio.on('audio_chunk')
def handle_audio_chunk(data):
if request.sid not in conversation_context:
return
context = conversation_context[request.sid]
# Decode audio data
audio_data = base64.b64decode(data['audio'])
audio_numpy = np.frombuffer(audio_data, dtype=np.float32)
audio_tensor = torch.tensor(audio_numpy)
# Add to buffer
context['audio_buffer'].append(audio_tensor)
# Check for silence to detect end of speech
if context['is_speaking'] and is_silence(audio_tensor):
if context['silence_start'] is None:
context['silence_start'] = time.time()
elif time.time() - context['silence_start'] > 1.0: # 1 second of silence
# Process the complete utterance
process_user_utterance(request.sid)
else:
context['silence_start'] = None
@socketio.on('stop_speaking')
def handle_stop_speaking():
if request.sid in conversation_context:
conversation_context[request.sid]['is_speaking'] = False
process_user_utterance(request.sid)
print(f"User {request.sid} stopped speaking")
def is_silence(audio_tensor, threshold=0.02):
"""Check if an audio chunk is silence based on amplitude threshold"""
return torch.mean(torch.abs(audio_tensor)) < threshold
def process_user_utterance(session_id):
"""Process completed user utterance, generate response and send audio back"""
context = conversation_context[session_id]
if not context['audio_buffer']:
return
# Combine audio chunks
full_audio = torch.cat(list(context['audio_buffer']), dim=0)
context['audio_buffer'].clear()
context['is_speaking'] = False
context['silence_start'] = None
# Convert audio to 16kHz for speech recognition
audio_16k = torchaudio.functional.resample(
full_audio,
orig_freq=44100, # Assuming 44.1kHz from client
new_freq=16000
)
# Transcribe speech
try:
# Convert to wav format for speech_recognition
audio_data = io.BytesIO()
torchaudio.save(audio_data, audio_16k.unsqueeze(0), 16000, format="wav")
audio_data.seek(0)
with sr.AudioFile(audio_data) as source:
audio = recognizer.record(source)
user_text = recognizer.recognize_google(audio)
print(f"Transcribed: {user_text}")
# Add to conversation segments
user_segment = Segment(
text=user_text,
speaker=0, # User is speaker 0
audio=full_audio
)
context['segments'].append(user_segment)
# Generate bot response
bot_response = generate_llm_response(user_text, context['segments'])
print(f"Bot response: {bot_response}")
# Convert to audio using CSM
bot_audio = generate_audio_response(bot_response, context['segments'])
# Convert audio to base64 for sending over websocket
audio_bytes = io.BytesIO()
torchaudio.save(audio_bytes, bot_audio.unsqueeze(0).cpu(), csm_generator.sample_rate, format="wav")
audio_bytes.seek(0)
audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8')
# Add bot response to conversation history
bot_segment = Segment(
text=bot_response,
speaker=1, # Bot is speaker 1
audio=bot_audio
)
context['segments'].append(bot_segment)
# Send transcribed text to client
emit('transcription', {'text': user_text}, room=session_id)
# Send audio response to client
emit('audio_response', {
'audio': audio_b64,
'text': bot_response
}, room=session_id)
except Exception as e:
print(f"Error processing speech: {e}")
emit('error', {'message': f'Error processing speech: {str(e)}'}, room=session_id)
def generate_llm_response(user_text, conversation_segments):
"""Generate text response using Llama 3.2"""
# Format conversation history for the LLM
conversation_history = ""
for segment in conversation_segments[-5:]: # Use last 5 utterances for context
speaker_name = "User" if segment.speaker == 0 else "Assistant"
conversation_history += f"{speaker_name}: {segment.text}\n"
# Add the current user query
conversation_history += f"User: {user_text}\nAssistant:"
# Generate response
inputs = llm_tokenizer(conversation_history, return_tensors="pt").to(device)
output = llm_model.generate(
inputs.input_ids,
max_new_tokens=150,
temperature=0.7,
top_p=0.9,
do_sample=True
)
response = llm_tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
return response.strip()
def generate_audio_response(text, conversation_segments):
"""Generate audio response using CSM"""
# Use the last few conversation segments as context
context_segments = conversation_segments[-4:] if len(conversation_segments) > 4 else conversation_segments
# Generate audio for bot response
audio = csm_generator.generate(
text=text,
speaker=1, # Bot is speaker 1
context=context_segments,
max_audio_length_ms=10000, # 10 seconds max
temperature=0.9,
topk=50
)
return audio
if __name__ == '__main__':
socketio.run(app, host='0.0.0.0', port=5000, debug=True)
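
To exercise the Socket.IO event flow above without a browser, here is a minimal test-client sketch; it assumes the `python-socketio` client package and NumPy, and streams a synthetic tone using the same event names the handlers expect. Since the tone is not real speech, expect the error path rather than a real transcription; the point is the event round trip.

```python
# Sketch: drive the Flask-SocketIO server above from a plain Python client.
# Assumes: pip install "python-socketio[client]" numpy, server running on port 5000.
import base64
import time

import numpy as np
import socketio

sio = socketio.Client()

@sio.on("transcription")
def on_transcription(data):
    print("Transcribed:", data["text"])

@sio.on("audio_response")
def on_audio_response(data):
    print("Bot replied:", data["text"])
    sio.disconnect()

@sio.on("error")
def on_error(data):
    print("Server error:", data["message"])
    sio.disconnect()

sio.connect("http://localhost:5000")
sio.emit("start_speaking")

# One second of a 440 Hz tone as float32 samples, sent in ~300 ms chunks,
# mirroring the browser client's chunked uploads.
t = np.arange(44100) / 44100.0
samples = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
for chunk in np.array_split(samples, 4):
    sio.emit("audio_chunk", {"audio": base64.b64encode(chunk.tobytes()).decode("ascii")})
    time.sleep(0.3)

sio.emit("stop_speaking")
sio.wait()  # block until one of the handlers above disconnects
```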


@@ -3,490 +3,454 @@
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0"> <meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Sesame AI Voice Chat</title> <title>Voice Assistant - CSM & Whisper</title>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
<!-- Socket.IO client library -->
<script src="https://cdn.socket.io/4.6.0/socket.io.min.js"></script> <script src="https://cdn.socket.io/4.6.0/socket.io.min.js"></script>
<style> <style>
:root {
--primary-color: #4c84ff;
--secondary-color: #3367d6;
--text-color: #333;
--background-color: #f9f9f9;
--card-background: #ffffff;
--accent-color: #ff5252;
--success-color: #4CAF50;
--border-color: #e0e0e0;
--shadow-color: rgba(0, 0, 0, 0.1);
}
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body { body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
background-color: var(--background-color); max-width: 800px;
color: var(--text-color);
line-height: 1.6;
max-width: 1000px;
margin: 0 auto; margin: 0 auto;
padding: 20px; padding: 20px;
transition: all 0.3s ease; background-color: #f5f7fa;
color: #333;
} }
header { h1 {
color: #2c3e50;
text-align: center; text-align: center;
margin-bottom: 30px; margin-bottom: 30px;
} }
h1 { #conversation {
color: var(--primary-color); height: 400px;
font-size: 2.5rem; border: 1px solid #ddd;
margin-bottom: 10px; border-radius: 10px;
}
.subtitle {
color: #666;
font-weight: 300;
}
.app-container {
display: grid;
grid-template-columns: 1fr;
gap: 20px;
}
@media (min-width: 768px) {
.app-container {
grid-template-columns: 1fr 1fr;
}
}
.chat-container, .control-panel {
background-color: var(--card-background);
border-radius: 12px;
box-shadow: 0 4px 12px var(--shadow-color);
padding: 20px; padding: 20px;
margin-bottom: 20px;
overflow-y: auto;
background-color: white;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
} }
.control-panel { .message-container {
display: flex; display: flex;
flex-direction: column; flex-direction: column;
gap: 20px;
}
.chat-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 15px; margin-bottom: 15px;
padding-bottom: 10px;
border-bottom: 1px solid var(--border-color);
} }
.conversation { .user-message-container {
height: 400px; align-items: flex-end;
overflow-y: auto; }
padding: 10px;
border-radius: 8px; .bot-message-container {
background-color: #f7f9fc; align-items: flex-start;
margin-bottom: 20px;
scroll-behavior: smooth;
} }
.message { .message {
margin-bottom: 15px; max-width: 80%;
padding: 12px 15px; padding: 12px;
border-radius: 12px; border-radius: 18px;
max-width: 85%;
position: relative; position: relative;
animation: fade-in 0.3s ease-out forwards; word-break: break-word;
} }
@keyframes fade-in { .user-message {
from { opacity: 0; transform: translateY(10px); } background-color: #dcf8c6;
to { opacity: 1; transform: translateY(0); } color: #000;
}
.user {
background-color: #e3f2fd;
color: #0d47a1;
margin-left: auto;
border-bottom-right-radius: 4px; border-bottom-right-radius: 4px;
} }
.ai { .bot-message {
background-color: #f1f1f1; background-color: #f1f0f0;
color: #37474f; color: #000;
margin-right: auto;
border-bottom-left-radius: 4px; border-bottom-left-radius: 4px;
} }
.system { .message-label {
background-color: #f8f9fa; font-size: 0.8em;
font-style: italic; margin-bottom: 4px;
color: #666; color: #657786;
text-align: center;
max-width: 90%;
margin: 10px auto;
font-size: 0.9em;
padding: 8px 12px;
border-radius: 8px;
} }
.message-time { #controls {
font-size: 0.7em;
color: #999;
position: absolute;
bottom: 5px;
right: 10px;
}
.audio-player {
width: 100%;
margin-top: 8px;
border-radius: 8px;
}
.visualizer-section {
margin-bottom: 20px;
}
.visualizer-container {
height: 150px;
background-color: #000;
border-radius: 12px;
overflow: hidden;
position: relative;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
}
.visualizer-label {
position: absolute;
top: 50%;
left: 50%;
transform: translate(-50%, -50%);
color: rgba(255, 255, 255, 0.7);
font-size: 1rem;
text-align: center;
pointer-events: none;
transition: opacity 0.3s ease;
z-index: 1;
}
#audioVisualizer {
width: 100%;
height: 100%;
display: block;
}
.controls {
display: flex;
gap: 15px;
flex-wrap: wrap;
}
.control-group {
flex: 1;
min-width: 200px;
}
.control-label {
font-weight: 600;
margin-bottom: 10px;
color: #555;
}
.button-row {
display: flex; display: flex;
gap: 10px; gap: 10px;
margin-top: 15px;
}
button {
padding: 12px 20px;
border-radius: 8px;
border: none;
background-color: var(--primary-color);
color: white;
font-weight: 600;
cursor: pointer;
transition: all 0.2s ease;
display: flex;
align-items: center;
justify-content: center; justify-content: center;
gap: 8px;
flex: 1;
}
button:hover {
background-color: var(--secondary-color);
transform: translateY(-2px);
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
button:active {
transform: translateY(0);
}
button.recording {
background-color: var(--accent-color);
animation: pulse 1.5s infinite;
}
button.processing {
background-color: #ff9800;
}
@keyframes pulse {
0% { opacity: 1; }
50% { opacity: 0.8; }
100% { opacity: 1; }
}
select, .slider-container {
width: 100%;
padding: 10px;
border-radius: 8px;
border: 1px solid var(--border-color);
background-color: white;
margin-bottom: 15px; margin-bottom: 15px;
} }
.slider-container { button {
display: flex; padding: 12px 24px;
flex-direction: column; font-size: 16px;
gap: 5px;
}
.slider-label {
display: flex;
justify-content: space-between;
}
input[type="range"] {
width: 100%;
cursor: pointer; cursor: pointer;
border-radius: 50px;
border: none;
outline: none;
transition: all 0.3s ease;
} }
.volume-indicator { #recordButton {
height: 30px; background-color: #4CAF50;
background: linear-gradient(to right, #4CAF50, #FFEB3B, #F44336); color: white;
border-radius: 4px; width: 200px;
margin-top: 5px; box-shadow: 0 4px 8px rgba(76, 175, 80, 0.3);
position: relative;
overflow: hidden;
} }
.volume-level { #recordButton:hover {
height: 100%; background-color: #45a049;
width: 0%; transform: translateY(-2px);
background-color: rgba(0, 0, 0, 0.5);
position: absolute;
right: 0;
top: 0;
transition: width 0.1s ease;
} }
.status-indicator { #recordButton.recording {
display: flex; background-color: #f44336;
align-items: center; animation: pulse 1.5s infinite;
gap: 8px; box-shadow: 0 4px 8px rgba(244, 67, 54, 0.3);
padding: 10px;
border-radius: 8px;
background-color: #f5f5f5;
margin-top: 20px;
} }
.status-dot { @keyframes pulse {
width: 12px; 0% {
height: 12px; transform: scale(1);
border-radius: 50%; }
background-color: #ccc; 50% {
transition: background-color 0.3s ease; transform: scale(1.05);
}
100% {
transform: scale(1);
}
} }
.status-dot.active { #status {
background-color: var(--success-color);
}
.status-text {
font-size: 0.9em;
color: #666;
}
/* Custom Scrollbar */
.conversation::-webkit-scrollbar {
width: 8px;
}
.conversation::-webkit-scrollbar-track {
background: #f1f1f1;
border-radius: 10px;
}
.conversation::-webkit-scrollbar-thumb {
background: #ccc;
border-radius: 10px;
}
.conversation::-webkit-scrollbar-thumb:hover {
background: #aaa;
}
/* Settings Panel */
.settings-panel {
margin-top: 20px;
}
.settings-toggles {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(150px, 1fr));
gap: 10px;
margin-top: 10px;
}
.toggle-switch {
display: flex;
align-items: center;
}
.toggle-switch input {
opacity: 0;
width: 0;
height: 0;
}
.toggle-switch label {
position: relative;
display: inline-block;
width: 50px;
height: 24px;
background-color: #ccc;
border-radius: 34px;
transition: .4s;
margin-right: 10px;
cursor: pointer;
}
.toggle-switch label:before {
position: absolute;
content: "";
height: 16px;
width: 16px;
left: 4px;
bottom: 4px;
background-color: white;
transition: .4s;
border-radius: 50%;
}
.toggle-switch input:checked + label {
background-color: var(--primary-color);
}
.toggle-switch input:checked + label:before {
transform: translateX(26px);
}
footer {
text-align: center; text-align: center;
margin-top: 40px; margin-top: 15px;
padding-top: 20px; font-style: italic;
border-top: 1px solid var(--border-color); color: #657786;
}
.audio-wave {
display: flex;
justify-content: center;
align-items: center;
height: 40px;
gap: 3px;
}
.audio-wave span {
display: block;
width: 3px;
height: 100%;
background-color: #4CAF50;
animation: wave 1.5s infinite ease-in-out;
border-radius: 6px;
}
.audio-wave span:nth-child(2) {
animation-delay: 0.2s;
}
.audio-wave span:nth-child(3) {
animation-delay: 0.4s;
}
.audio-wave span:nth-child(4) {
animation-delay: 0.6s;
}
.audio-wave span:nth-child(5) {
animation-delay: 0.8s;
}
@keyframes wave {
0%, 100% {
height: 8px;
}
50% {
height: 30px;
}
}
.hidden {
display: none;
}
.transcription-info {
font-size: 0.8em;
color: #888; color: #888;
font-size: 0.9em; margin-top: 4px;
text-align: right;
} }
</style> </style>
</head> </head>
<body> <body>
<header> <h1>Voice Assistant with CSM & Whisper</h1>
<h1>Sesame AI Voice Chat</h1> <div id="conversation"></div>
<p class="subtitle">Speak naturally and have a conversation with AI</p>
</header>
<div class="app-container"> <div id="controls">
<div class="chat-container"> <button id="recordButton">Hold to Speak</button>
<div class="chat-header">
<h2>Conversation</h2>
<button id="clearButton" class="small-button">
<i class="fas fa-trash"></i> Clear Chat
</button>
</div>
<div class="conversation" id="conversation"></div>
</div> </div>
<div class="control-panel"> <div id="audioWave" class="audio-wave hidden">
<div class="visualizer-section"> <span></span>
<h3>Audio Visualizer</h3> <span></span>
<div class="visualizer-container"> <span></span>
<canvas id="audioVisualizer"></canvas> <span></span>
<div id="visualizerLabel" class="visualizer-label">Speak to see audio visualization</div> <span></span>
</div>
</div> </div>
<div class="controls"> <div id="status">Connecting to server...</div>
<div class="control-group">
<div class="control-label">Voice Settings</div>
<select id="speakerSelect">
<option value="0">Speaker 0 (You)</option>
<option value="1">Speaker 1 (AI)</option>
</select>
<div class="slider-container"> <script>
<div class="slider-label"> const socket = io();
<span>Silence Threshold</span> const recordButton = document.getElementById('recordButton');
<span id="thresholdValue">0.01</span> const conversation = document.getElementById('conversation');
</div> const status = document.getElementById('status');
<input type="range" id="thresholdSlider" min="0.001" max="0.1" step="0.001" value="0.01"> const audioWave = document.getElementById('audioWave');
</div>
<div class="volume-indicator"> let mediaRecorder;
<div id="volumeLevel" class="volume-level"></div> let audioChunks = [];
</div> let isRecording = false;
</div> let audioSendInterval;
let sessionActive = false;
<div class="control-group"> // Initialize audio context
<div class="control-label">Conversation Controls</div> const audioContext = new (window.AudioContext || window.webkitAudioContext)();
<div class="button-row">
<button id="streamButton" class="main-button">
<i class="fas fa-microphone"></i> Start Conversation
</button>
</div>
</div>
</div>
<div class="settings-panel"> // Connect to server
<div class="control-label">Settings</div> socket.on('connect', () => {
<div class="settings-toggles"> status.textContent = 'Connected to server';
<div class="toggle-switch"> sessionActive = true;
<input type="checkbox" id="autoPlayResponses" checked> });
<label for="autoPlayResponses"></label>
<span>Auto-play responses</span>
</div>
<div class="toggle-switch">
<input type="checkbox" id="showVisualizer" checked>
<label for="showVisualizer"></label>
<span>Show visualizer</span>
</div>
</div>
</div>
<div class="status-indicator"> socket.on('disconnect', () => {
<div class="status-dot" id="statusDot"></div> status.textContent = 'Disconnected from server';
<div class="status-text" id="statusText">Not connected</div> sessionActive = false;
</div> });
</div>
</div>
<footer> socket.on('ready', (data) => {
<p>Powered by Sesame AI | WhisperX for speech recognition</p> status.textContent = data.message;
</footer> setupAudioRecording();
});
<!-- Load our JavaScript file --> socket.on('transcription', (data) => {
<script src="./voice-chat.js"></script> addMessage('user', data.text);
status.textContent = 'Assistant is thinking...';
});
socket.on('audio_response', (data) => {
// Play audio
status.textContent = 'Playing response...';
const audio = new Audio('data:audio/wav;base64,' + data.audio);
audio.onended = () => {
status.textContent = 'Ready to record';
};
audio.onerror = () => {
status.textContent = 'Error playing audio';
console.error('Error playing audio response');
};
audio.play().catch(err => {
status.textContent = 'Error playing audio: ' + err.message;
console.error('Error playing audio:', err);
});
// Display text
addMessage('bot', data.text);
});
socket.on('error', (data) => {
status.textContent = 'Error: ' + data.message;
console.error('Server error:', data.message);
});
function setupAudioRecording() {
// Check if browser supports required APIs
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
status.textContent = 'Your browser does not support audio recording';
return;
}
// Get user media
navigator.mediaDevices.getUserMedia({ audio: true })
.then(stream => {
// Setup recording with better audio quality
const options = {
mimeType: 'audio/webm',
audioBitsPerSecond: 128000
};
try {
mediaRecorder = new MediaRecorder(stream, options);
} catch (e) {
// Fallback if the specified options aren't supported
mediaRecorder = new MediaRecorder(stream);
}
mediaRecorder.ondataavailable = event => {
if (event.data.size > 0) {
audioChunks.push(event.data);
}
};
mediaRecorder.onstop = () => {
processRecording();
};
// Create audio analyzer for visualization
const source = audioContext.createMediaStreamSource(stream);
const analyzer = audioContext.createAnalyser();
analyzer.fftSize = 2048;
source.connect(analyzer);
// Setup button handlers with better touch handling
recordButton.addEventListener('mousedown', startRecording);
recordButton.addEventListener('touchstart', (e) => {
e.preventDefault(); // Prevent default touch behavior
startRecording();
});
recordButton.addEventListener('mouseup', stopRecording);
recordButton.addEventListener('touchend', (e) => {
e.preventDefault();
stopRecording();
});
recordButton.addEventListener('mouseleave', stopRecording);
status.textContent = 'Ready to record';
})
.catch(err => {
status.textContent = 'Error accessing microphone: ' + err.message;
console.error('Error accessing microphone:', err);
});
}
function startRecording() {
if (!isRecording && sessionActive) {
audioChunks = [];
mediaRecorder.start(100); // Collect data in 100ms chunks
recordButton.classList.add('recording');
recordButton.textContent = 'Release to Stop';
status.textContent = 'Recording...';
audioWave.classList.remove('hidden');
isRecording = true;
socket.emit('start_speaking');
// Start sending audio chunks periodically
audioSendInterval = setInterval(() => {
if (mediaRecorder.state === 'recording') {
mediaRecorder.requestData(); // Force ondataavailable to fire
}
}, 300); // Send every 300ms
}
}
function stopRecording() {
if (isRecording) {
clearInterval(audioSendInterval);
mediaRecorder.stop();
recordButton.classList.remove('recording');
recordButton.textContent = 'Hold to Speak';
status.textContent = 'Processing speech...';
audioWave.classList.add('hidden');
isRecording = false;
}
}
function processRecording() {
if (audioChunks.length === 0) {
status.textContent = 'No audio recorded';
return;
}
const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
// Convert to ArrayBuffer for processing
const fileReader = new FileReader();
fileReader.onloadend = () => {
try {
const arrayBuffer = fileReader.result;
// Convert to Float32Array - this works better with WebAudio API
const audioData = convertToFloat32(arrayBuffer);
// Convert to base64 for sending
const base64String = arrayBufferToBase64(audioData.buffer);
socket.emit('audio_chunk', { audio: base64String });
// Signal end of speech
socket.emit('stop_speaking');
} catch (e) {
console.error('Error processing audio:', e);
status.textContent = 'Error processing audio';
}
};
fileReader.onerror = () => {
status.textContent = 'Error reading audio data';
};
fileReader.readAsArrayBuffer(audioBlob);
}
function convertToFloat32(arrayBuffer) {
// Get raw audio data as Int16 (common format for audio)
const int16Array = new Int16Array(arrayBuffer);
// Convert to Float32 (normalize between -1 and 1)
const float32Array = new Float32Array(int16Array.length);
for (let i = 0; i < int16Array.length; i++) {
float32Array[i] = int16Array[i] / 32768.0;
}
return float32Array;
}
function addMessage(sender, text) {
const containerDiv = document.createElement('div');
containerDiv.className = sender === 'user' ? 'message-container user-message-container' : 'message-container bot-message-container';
const labelDiv = document.createElement('div');
labelDiv.className = 'message-label';
labelDiv.textContent = sender === 'user' ? 'You' : 'Assistant';
containerDiv.appendChild(labelDiv);
const messageDiv = document.createElement('div');
messageDiv.className = sender === 'user' ? 'message user-message' : 'message bot-message';
messageDiv.textContent = text;
containerDiv.appendChild(messageDiv);
if (sender === 'user') {
const infoDiv = document.createElement('div');
infoDiv.className = 'transcription-info';
infoDiv.textContent = 'Transcribed with Whisper';
containerDiv.appendChild(infoDiv);
}
conversation.appendChild(containerDiv);
conversation.scrollTop = conversation.scrollHeight;
}
function arrayBufferToBase64(buffer) {
let binary = '';
const bytes = new Uint8Array(buffer);
const len = bytes.byteLength;
for (let i = 0; i < len; i++) {
binary += String.fromCharCode(bytes[i]);
}
return window.btoa(binary);
}
// Handle page visibility change to avoid issues with background tabs
document.addEventListener('visibilitychange', () => {
if (document.hidden && isRecording) {
stopRecording();
}
});
// Clean disconnection when page is closed
window.addEventListener('beforeunload', () => {
if (socket && socket.connected) {
socket.disconnect();
}
});
</script>
</body> </body>
</html> </html>

Backend/req.txt Normal file

@@ -0,0 +1 @@
pip install faster-whisper


@@ -1,9 +0,0 @@
torch==2.4.0
torchaudio==2.4.0
tokenizers==0.21.0
transformers==4.49.0
huggingface_hub==0.28.1
moshi==0.2.2
torchtune==0.4.0
torchao==0.9.0
silentcipher @ git+https://github.com/SesameAILabs/silentcipher@master

File diff suppressed because it is too large.


@@ -1,13 +0,0 @@
from setuptools import setup, find_packages
import os
# Read requirements from requirements.txt
with open('requirements.txt') as f:
requirements = [line.strip() for line in f if line.strip() and not line.startswith('#')]
setup(
name='csm',
version='0.1.0',
packages=find_packages(),
install_requires=requirements,
)


@@ -1,50 +0,0 @@
import os
import torch
import torchaudio
from huggingface_hub import hf_hub_download
from generator import load_csm_1b, Segment
from dataclasses import dataclass
if torch.backends.mps.is_available():
device = "mps"
elif torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
generator = load_csm_1b(device=device)
speakers = [0, 1, 0, 0]
transcripts = [
"Hey how are you doing.",
"Pretty good, pretty good.",
"I'm great.",
"So happy to be speaking to you.",
]
audio_paths = [
"utterance_0.wav",
"utterance_1.wav",
"utterance_2.wav",
"utterance_3.wav",
]
def load_audio(audio_path):
audio_tensor, sample_rate = torchaudio.load(audio_path)
audio_tensor = torchaudio.functional.resample(
audio_tensor.squeeze(0), orig_freq=sample_rate, new_freq=generator.sample_rate
)
return audio_tensor
segments = [
Segment(text=transcript, speaker=speaker, audio=load_audio(audio_path))
for transcript, speaker, audio_path in zip(transcripts, speakers, audio_paths)
]
audio = generator.generate(
text="Me too, this is some cool stuff huh?",
speaker=1,
context=segments,
max_audio_length_ms=10_000,
)
torchaudio.save("audio.wav", audio.unsqueeze(0).cpu(), generator.sample_rate)

File diff suppressed because it is too large.


@@ -0,0 +1,12 @@
import { NextResponse } from "next/server";
import { auth0 } from "../../../lib/auth0";
export async function GET() {
try {
const session = await auth0.getSession();
return NextResponse.json({ session });
} catch (error) {
console.error("Error getting session:", error);
return NextResponse.json({ session: null }, { status: 500 });
}
}


@@ -78,7 +78,7 @@ function CallPage() {
"Content-Type": "application/json", "Content-Type": "application/json",
}, },
body: JSON.stringify({ body: JSON.stringify({
message: `yo i need help`, message: `John Smith needs help.`,
}), }),
}); });


@@ -1,40 +1,94 @@
"use client"; "use client";
import { useState } from "react"; import { useState, useEffect } from "react";
import { auth0 } from "../lib/auth0"; import { useRouter } from "next/navigation";
import { NextApiRequest, NextApiResponse } from "next";
export default async function Home() {
export default function Home() {
const [contacts, setContacts] = useState<string[]>([]); const [contacts, setContacts] = useState<string[]>([]);
const [codeword, setCodeword] = useState(""); const [codeword, setCodeword] = useState("");
const [session, setSession] = useState<any>(null);
const [loading, setLoading] = useState(true);
const router = useRouter();
const session = await auth0.getSession(); useEffect(() => {
// Fetch session data from an API route
fetch("/auth/session")
.then((response) => response.json())
.then((data) => {
setSession(data.session);
setLoading(false);
})
.catch((error) => {
console.error("Failed to fetch session:", error);
setLoading(false);
});
}, []);
console.log("Session:", session?.user); function saveToDB() {
alert("Saving contacts...");
const contactInputs = document.querySelectorAll(
".text-input"
) as NodeListOf<HTMLInputElement>;
const contactValues = Array.from(contactInputs).map((input) => input.value);
fetch("/api/databaseStorage", {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({
email: session?.user?.email || "",
codeword: codeword,
contacts: contactValues,
}),
})
.then((response) => {
if (response.ok) {
alert("Contacts saved successfully!");
} else {
alert("Error saving contacts.");
}
})
.catch((error) => {
console.error("Error:", error);
alert("Error saving contacts.");
});
}
if (loading) {
return <div>Loading...</div>;
}
// If no session, show sign-up and login buttons // If no session, show sign-up and login buttons
if (!session) { if (!session) {
return ( return (
<div className="space-y-7 bg-indigo-800 items-center justify-items-center min-h-screen p-8 pb-20 gap-16 sm:p-20 font-[family-name:var(--font-geist-sans)]"> <div className="space-y-7 bg-indigo-800 items-center justify-items-center min-h-screen p-8 pb-20 gap-16 sm:p-20 font-[family-name:var(--font-geist-sans)]">
<main className="space-x-2 flex flex-row gap-[32px] row-start-2 items-center sm:items-start"> <main className="space-x-2 flex flex-row gap-[32px] row-start-2 items-center sm:items-start">
<a href="/auth/login?screen_hint=signup"> <a href="/auth/login?screen_hint=signup">
<button className="box-content w-32 border-2 h-16 text-2xl bg-violet-900 text-green-300">Sign up</button> <button className="box-content w-32 border-2 h-16 text-2xl bg-violet-900 text-green-300">
Sign up
</button>
</a> </a>
<a href="/auth/login"> <a href="/auth/login">
<button className = "box-content w-32 border-2 h-16 text-2xl bg-violet-900 text-green-400">Log in</button> <button className="box-content w-32 border-2 h-16 text-2xl bg-violet-900 text-green-400">
Log in
</button>
</a> </a>
</main> </main>
<h1 className="space-y-3 text-6xl text-lime-500 subpixel-antialiased font-stretch-semi-expanded font-serif">Fauxcall</h1> <h1 className="space-y-3 text-6xl text-lime-500 subpixel-antialiased font-stretch-semi-expanded font-serif">
<h2 className="space-y-3 text-6x1 text-red-700 antialiased font-mono">Set emergency contacts</h2> Fauxcall
<p>If you stop speaking or say the codeword, these contacts will be notified</p> </h1>
<h2 className="space-y-3 text-6x1 text-red-700 antialiased font-mono">
Set emergency contacts
</h2>
<p>
If you stop speaking or say the codeword, these contacts will be
notified
</p>
{/* form for setting codeword */} {/* form for setting codeword */}
<form className="flex flex-col gap-[32px] row-start-2 items-center sm:items-start" onSubmit={(e) => e.preventDefault()}> <form
className="flex flex-col gap-[32px] row-start-2 items-center sm:items-start"
onSubmit={(e) => e.preventDefault()}
>
<input <input
type="text" type="text"
value={codeword} value={codeword}
@@ -44,10 +98,16 @@ export default async function Home() {
/> />
<button <button
className="bg-blue-500 text-white font-semibold font-lg rounded-md p-2" className="bg-blue-500 text-white font-semibold font-lg rounded-md p-2"
type="submit">Set codeword</button> type="submit"
>
Set codeword
</button>
</form> </form>
{/* form for adding contacts */} {/* form for adding contacts */}
<form className="space-y-5 flex flex-col gap-[32px] row-start-2 items-center sm:items-start" onSubmit={(e) => e.preventDefault()}> <form
className="space-y-5 flex flex-col gap-[32px] row-start-2 items-center sm:items-start"
onSubmit={(e) => e.preventDefault()}
>
<input <input
type="text" type="text"
value={contacts} value={contacts}
@@ -70,7 +130,12 @@ export default async function Home() {
className="border border-gray-300 rounded-md p-2" className="border border-gray-300 rounded-md p-2"
/> />
<button type="button">Add</button> <button type="button">Add</button>
<button className="bg-slate-500 text-yellow-300 text-stretch-50% font-lg rounded-md p-2" type="submit">Set contacts</button> <button
className="bg-slate-500 text-yellow-300 text-stretch-50% font-lg rounded-md p-2"
type="submit"
>
Set contacts
</button>
</form> </form>
</div> </div>
); );
@@ -81,11 +146,21 @@ export default async function Home() {
<main className="flex flex-col gap-[32px] row-start-2 items-center sm:items-start"> <main className="flex flex-col gap-[32px] row-start-2 items-center sm:items-start">
<h1>Welcome, {session.user.name}!</h1> <h1>Welcome, {session.user.name}!</h1>
<h1 className="space-y-3 text-6xl text-lime-500 subpixel-antialiased font-stretch-semi-expanded font-serif">Fauxcall</h1> <h1 className="space-y-3 text-6xl text-lime-500 subpixel-antialiased font-stretch-semi-expanded font-serif">
<h2 className="space-y-3 text-6x1 text-red-700 antialiased font-mono">Set emergency contacts</h2> Fauxcall
<p>If you stop speaking or say the codeword, these contacts will be notified</p> </h1>
<h2 className="space-y-3 text-6x1 text-red-700 antialiased font-mono">
Set emergency contacts
</h2>
<p>
If you stop speaking or say the codeword, these contacts will be
notified
</p>
{/* form for setting codeword */} {/* form for setting codeword */}
<form className="flex flex-col gap-[32px] row-start-2 items-center sm:items-start" onSubmit={(e) => e.preventDefault()}> <form
className="flex flex-col gap-[32px] row-start-2 items-center sm:items-start"
onSubmit={(e) => e.preventDefault()}
>
<input <input
type="text" type="text"
value={codeword} value={codeword}
@@ -95,10 +170,17 @@ export default async function Home() {
/> />
<button <button
className="bg-blue-500 text-white font-semibold font-lg rounded-md p-2" className="bg-blue-500 text-white font-semibold font-lg rounded-md p-2"
type="submit">Set codeword</button> type="submit"
>
Set codeword
</button>
</form> </form>
{/* form for adding contacts */} {/* form for adding contacts */}
<form id="Contacts" className="space-y-5 flex flex-col gap-[32px] row-start-2 items-center sm:items-start" onSubmit={(e) => e.preventDefault()}> <form
id="Contacts"
className="space-y-5 flex flex-col gap-[32px] row-start-2 items-center sm:items-start"
onSubmit={(e) => e.preventDefault()}
>
<input <input
type="text" type="text"
value={contacts} value={contacts}
@@ -127,19 +209,37 @@ export default async function Home() {
placeholder="Write down an emergency contact" placeholder="Write down an emergency contact"
className="text-input border border-gray-300 rounded-md p-2" className="text-input border border-gray-300 rounded-md p-2"
/> />
<button onClick={() => { <button
onClick={() => {
alert("Adding contact..."); alert("Adding contact...");
let elem = document.getElementsByClassName("text-input")[0] as HTMLElement; let elem = document.getElementsByClassName(
"text-input"
)[0] as HTMLElement;
console.log("Element:", elem); console.log("Element:", elem);
let d = elem.cloneNode(true) as HTMLElement; let d = elem.cloneNode(true) as HTMLElement;
document.getElementById("Contacts")?.appendChild(d); document.getElementById("Contacts")?.appendChild(d);
}} }}
className="bg-emerald-500 text-fuchsia-300" className="bg-emerald-500 text-fuchsia-300"
type="button">Add</button> type="button"
>
Add
</button>
<button className="bg-slate-500 text-yellow-300 text-stretch-50% font-lg rounded-md p-2" type="submit">Set contacts</button> <button
type="button"
onClick={saveToDB}
className="bg-slate-500 text-yellow-300 text-stretch-50% font-lg rounded-md p-2"
>
Save
</button>
</form> </form>
<div>
<a href="/call">
<button className="bg-zinc-700 text-lime-300 font-semibold font-lg rounded-md p-2">
Call
</button>
</a>
</div>
<p> <p>
<a href="/auth/logout"> <a href="/auth/logout">
<button>Log out</button> <button>Log out</button>


@@ -0,0 +1,56 @@
import { NextApiRequest, NextApiResponse } from "next";
import mongoose from "mongoose";
const uri = process.env.MONGODB_URI || "mongodb://localhost:27017/mydatabase";
const clientOptions = { serverApi: { version: "1" as const, strict: true, deprecationErrors: true } };
// Create a reusable connection function
async function connectToDatabase() {
if (mongoose.connection.readyState === 0) {
// Only connect if not already connected
await mongoose.connect(uri, clientOptions);
console.log("Connected to MongoDB!");
mongoose.model("User", new mongoose.Schema({
email: { type: String, required: true, unique: true },
codeword: { type: String, required: true },
contacts: [{ type: String }],
}));
}
}
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try {
// Ensure the database is connected
await connectToDatabase();
if (req.method === 'POST') {
const { email, codeword, contacts } = req.body;
// Perform database operations here
// query database to see if document with email exists
const existingUser = await mongoose.model('User').findOne({ email });
if (existingUser) {
// If user exists, update their codeword and contacts
await mongoose.model('User').updateOne({ email }, { codeword, contacts });
} else {
// If user does not exist, create a new user
const User = mongoose.model('User');
const newUser = new User({ email, codeword, contacts });
await newUser.save();
}
console.log("Codeword:", codeword);
console.log("Contacts:", contacts);
res.status(200).json({ success: true, message: "Data saved successfully!" });
} else {
res.setHeader('Allow', ['POST']);
res.status(405).end(`Method ${req.method} Not Allowed`);
}
} catch (error) {
console.error("Error:", error);
res.status(500).json({ success: false, error: "Internal Server Error" });
}
}
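
For a quick manual test of this route during development, a minimal sketch is shown below (assuming the Next.js dev server is on port 3000, MongoDB is reachable, and the Python `requests` package is installed; the field names mirror the handler above and the values are hypothetical).

```python
# Sketch: POST the fields the databaseStorage handler expects and print the result.
import requests

resp = requests.post(
    "http://localhost:3000/api/databaseStorage",
    json={
        "email": "test@example.com",        # hypothetical test values
        "codeword": "pineapple",
        "contacts": ["555-0100", "555-0101"],
    },
    timeout=10,
)
print(resp.status_code, resp.json())
```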