idler-wheel
2025-03-30 09:27:10 -04:00
8 changed files with 1356 additions and 632 deletions


@@ -3,454 +3,266 @@
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0"> <meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Voice Assistant - CSM & Whisper</title> <title>AI Voice Chat</title>
<script src="https://cdn.socket.io/4.6.0/socket.io.min.js"></script>
<style> <style>
body { body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
margin: 0;
padding: 0;
background-color: #f5f5f5;
color: #333;
}
.container {
max-width: 800px; max-width: 800px;
margin: 0 auto; margin: 0 auto;
padding: 20px; padding: 20px;
background-color: #f5f7fa;
color: #333;
} }
header {
h1 {
color: #2c3e50;
text-align: center; text-align: center;
margin-bottom: 30px; margin-bottom: 30px;
} }
h1 {
#conversation { color: #2c3e50;
height: 400px;
border: 1px solid #ddd;
border-radius: 10px;
padding: 20px;
margin-bottom: 20px;
overflow-y: auto;
background-color: white;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
} }
.status-bar {
.message-container { background-color: #2c3e50;
display: flex;
flex-direction: column;
margin-bottom: 15px;
}
.user-message-container {
align-items: flex-end;
}
.bot-message-container {
align-items: flex-start;
}
.message {
max-width: 80%;
padding: 12px;
border-radius: 18px;
position: relative;
word-break: break-word;
}
.user-message {
background-color: #dcf8c6;
color: #000;
border-bottom-right-radius: 4px;
}
.bot-message {
background-color: #f1f0f0;
color: #000;
border-bottom-left-radius: 4px;
}
.message-label {
font-size: 0.8em;
margin-bottom: 4px;
color: #657786;
}
#controls {
display: flex;
gap: 10px;
justify-content: center;
margin-bottom: 15px;
}
button {
padding: 12px 24px;
font-size: 16px;
cursor: pointer;
border-radius: 50px;
border: none;
outline: none;
transition: all 0.3s ease;
}
#recordButton {
background-color: #4CAF50;
color: white; color: white;
width: 200px; padding: 10px;
box-shadow: 0 4px 8px rgba(76, 175, 80, 0.3); border-radius: 5px;
margin-bottom: 20px;
display: flex;
justify-content: space-between;
align-items: center;
} }
.status-indicator {
#recordButton:hover { display: flex;
background-color: #45a049; align-items: center;
transform: translateY(-2px);
} }
.status-dot {
height: 10px;
width: 10px;
border-radius: 50%;
margin-right: 8px;
}
.status-dot.connected { background-color: #2ecc71; }
.status-dot.connecting { background-color: #f39c12; }
.status-dot.disconnected { background-color: #e74c3c; }
#recordButton.recording { .conversation {
background-color: #f44336; background-color: white;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
height: 400px;
padding: 20px;
overflow-y: auto;
margin-bottom: 20px;
}
.message {
margin-bottom: 15px;
padding: 10px 15px;
border-radius: 18px;
max-width: 80%;
word-wrap: break-word;
}
.user-message {
background-color: #e3f2fd;
margin-left: auto;
border-bottom-right-radius: 5px;
}
.ai-message {
background-color: #f0f0f0;
margin-right: auto;
border-bottom-left-radius: 5px;
}
.controls {
display: flex;
justify-content: center;
gap: 15px;
margin-bottom: 20px;
}
button {
background-color: #2c3e50;
color: white;
border: none;
padding: 12px 24px;
border-radius: 25px;
cursor: pointer;
font-size: 16px;
transition: all 0.2s;
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
}
button:hover {
background-color: #1a252f;
}
button:disabled {
background-color: #95a5a6;
cursor: not-allowed;
}
.button-icon {
width: 20px;
height: 20px;
}
.mic-animation {
width: 60px;
height: 60px;
border-radius: 50%;
background-color: rgba(231, 76, 60, 0.2);
display: flex;
align-items: center;
justify-content: center;
animation: pulse 1.5s infinite; animation: pulse 1.5s infinite;
box-shadow: 0 4px 8px rgba(244, 67, 54, 0.3); margin: 0 auto 15px;
} }
@keyframes pulse { @keyframes pulse {
0% { 0% {
transform: scale(1); transform: scale(0.95);
box-shadow: 0 0 0 0 rgba(231, 76, 60, 0.5);
} }
50% { 70% {
transform: scale(1.05); transform: scale(1);
box-shadow: 0 0 0 15px rgba(231, 76, 60, 0);
} }
100% { 100% {
transform: scale(1); transform: scale(0.95);
box-shadow: 0 0 0 0 rgba(231, 76, 60, 0);
} }
} }
.settings {
#status { margin-top: 20px;
padding: 15px;
background-color: white;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.settings h3 {
margin-top: 0;
color: #2c3e50;
border-bottom: 1px solid #eee;
padding-bottom: 10px;
}
.settings-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 15px;
}
.setting-item {
padding: 10px;
background-color: #f9f9f9;
border-radius: 5px;
}
.audio-visualizer {
height: 50px;
width: 100%;
background-color: #f0f0f0;
margin-top: 10px;
border-radius: 5px;
overflow: hidden;
}
.info-message {
text-align: center; text-align: center;
margin-top: 15px; color: #7f8c8d;
margin: 10px 0;
font-style: italic; font-style: italic;
color: #657786;
} }
.loading {
.audio-wave { text-align: center;
display: flex; margin: 20px 0;
justify-content: center;
align-items: center;
height: 40px;
gap: 3px;
} }
.spinner {
.audio-wave span { border: 4px solid rgba(0, 0, 0, 0.1);
display: block; border-radius: 50%;
width: 3px; border-top: 4px solid #2c3e50;
height: 100%; width: 30px;
background-color: #4CAF50;
animation: wave 1.5s infinite ease-in-out;
border-radius: 6px;
}
.audio-wave span:nth-child(2) {
animation-delay: 0.2s;
}
.audio-wave span:nth-child(3) {
animation-delay: 0.4s;
}
.audio-wave span:nth-child(4) {
animation-delay: 0.6s;
}
.audio-wave span:nth-child(5) {
animation-delay: 0.8s;
}
@keyframes wave {
0%, 100% {
height: 8px;
}
50% {
height: 30px; height: 30px;
animation: spin 1s linear infinite;
margin: 0 auto 10px;
} }
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
} }
footer {
.hidden { text-align: center;
display: none; margin-top: 30px;
} padding: 20px;
color: #7f8c8d;
.transcription-info { font-size: 14px;
font-size: 0.8em;
color: #888;
margin-top: 4px;
text-align: right;
} }
</style> </style>
</head> </head>
<body> <body>
<h1>Voice Assistant with CSM & Whisper</h1> <div class="container">
<div id="conversation"></div> <header>
<h1>AI Voice Assistant</h1>
</header>
<div id="controls"> <div class="status-bar">
<button id="recordButton">Hold to Speak</button> <div class="status-indicator">
<div class="status-dot disconnected" id="connection-dot"></div>
<span id="connection-status">Disconnected</span>
</div>
<div id="runtime-info">
<span id="models-status"></span>
</div>
</div> </div>
<div id="audioWave" class="audio-wave hidden"> <div class="conversation" id="conversation">
<span></span> <div class="info-message">Your conversation will appear here.</div>
<span></span>
<span></span>
<span></span>
<span></span>
</div> </div>
<div id="status">Connecting to server...</div> <div id="mic-animation" class="mic-animation" style="display: none;">
<svg width="24" height="24" viewBox="0 0 24 24" fill="white">
<path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm5.91-3c-.49 0-.9.36-.98.85C16.52 14.2 14.47 16 12 16s-4.52-1.8-4.93-4.15c-.08-.49-.49-.85-.98-.85-.61 0-1.09.54-1 1.14.49 3 2.89 5.35 5.91 5.78V20c0 .55.45 1 1 1s1-.45 1-1v-2.08c3.02-.43 5.42-2.78 5.91-5.78.1-.6-.39-1.14-1-1.14z"></path>
</svg>
</div>
<script> <div class="controls">
const socket = io(); <button id="start-button" disabled>
const recordButton = document.getElementById('recordButton'); <svg class="button-icon" viewBox="0 0 24 24" fill="white">
const conversation = document.getElementById('conversation'); <path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm5.91-3c-.49 0-.9.36-.98.85C16.52 14.2 14.47 16 12 16s-4.52-1.8-4.93-4.15c-.08-.49-.49-.85-.98-.85-.61 0-1.09.54-1 1.14.49 3 2.89 5.35 5.91 5.78V20c0 .55.45 1 1 1s1-.45 1-1v-2.08c3.02-.43 5.42-2.78 5.91-5.78.1-.6-.39-1.14-1-1.14z"></path>
const status = document.getElementById('status'); </svg>
const audioWave = document.getElementById('audioWave'); Start Listening
</button>
<button id="interrupt-button" disabled>
<svg class="button-icon" viewBox="0 0 24 24" fill="white">
<path d="M12 2C6.48 2 2 6.48 2 12s4.48 10 10 10 10-4.48 10-10S17.52 2 12 2zm1 15h-2v-2h2v2zm0-4h-2V7h2v6z"></path>
</svg>
Interrupt
</button>
</div>
let mediaRecorder; <div id="loading" class="loading" style="display: none;">
let audioChunks = []; <div class="spinner"></div>
let isRecording = false; <p id="loading-text">Processing your speech...</p>
let audioSendInterval; </div>
let sessionActive = false;
// Initialize audio context <div class="settings">
const audioContext = new (window.AudioContext || window.webkitAudioContext)(); <h3>Status</h3>
<div class="settings-grid">
<div class="setting-item">
<div><strong>Whisper Model:</strong> <span id="whisper-status">Loading...</span></div>
</div>
<div class="setting-item">
<div><strong>CSM Audio Model:</strong> <span id="csm-status">Loading...</span></div>
</div>
<div class="setting-item">
<div><strong>LLM Model:</strong> <span id="llm-status">Loading...</span></div>
</div>
<div class="setting-item">
<div><strong>WebRTC:</strong> <span id="webrtc-status">Not Connected</span></div>
</div>
</div>
</div>
</div>
// Connect to server <footer>
socket.on('connect', () => { <p>AI Voice Assistant | Using Fast Whisper, Llama 3.2, and CSM Audio Models</p>
status.textContent = 'Connected to server'; </footer>
sessionActive = true;
});
socket.on('disconnect', () => { <script src="https://cdnjs.cloudflare.com/ajax/libs/socket.io/4.6.1/socket.io.min.js"></script>
status.textContent = 'Disconnected from server'; <script src="./voice-chat.js"></script>
sessionActive = false;
});
socket.on('ready', (data) => {
status.textContent = data.message;
setupAudioRecording();
});
socket.on('transcription', (data) => {
addMessage('user', data.text);
status.textContent = 'Assistant is thinking...';
});
socket.on('audio_response', (data) => {
// Play audio
status.textContent = 'Playing response...';
const audio = new Audio('data:audio/wav;base64,' + data.audio);
audio.onended = () => {
status.textContent = 'Ready to record';
};
audio.onerror = () => {
status.textContent = 'Error playing audio';
console.error('Error playing audio response');
};
audio.play().catch(err => {
status.textContent = 'Error playing audio: ' + err.message;
console.error('Error playing audio:', err);
});
// Display text
addMessage('bot', data.text);
});
socket.on('error', (data) => {
status.textContent = 'Error: ' + data.message;
console.error('Server error:', data.message);
});
function setupAudioRecording() {
// Check if browser supports required APIs
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
status.textContent = 'Your browser does not support audio recording';
return;
}
// Get user media
navigator.mediaDevices.getUserMedia({ audio: true })
.then(stream => {
// Setup recording with better audio quality
const options = {
mimeType: 'audio/webm',
audioBitsPerSecond: 128000
};
try {
mediaRecorder = new MediaRecorder(stream, options);
} catch (e) {
// Fallback if the specified options aren't supported
mediaRecorder = new MediaRecorder(stream);
}
mediaRecorder.ondataavailable = event => {
if (event.data.size > 0) {
audioChunks.push(event.data);
}
};
mediaRecorder.onstop = () => {
processRecording();
};
// Create audio analyzer for visualization
const source = audioContext.createMediaStreamSource(stream);
const analyzer = audioContext.createAnalyser();
analyzer.fftSize = 2048;
source.connect(analyzer);
// Setup button handlers with better touch handling
recordButton.addEventListener('mousedown', startRecording);
recordButton.addEventListener('touchstart', (e) => {
e.preventDefault(); // Prevent default touch behavior
startRecording();
});
recordButton.addEventListener('mouseup', stopRecording);
recordButton.addEventListener('touchend', (e) => {
e.preventDefault();
stopRecording();
});
recordButton.addEventListener('mouseleave', stopRecording);
status.textContent = 'Ready to record';
})
.catch(err => {
status.textContent = 'Error accessing microphone: ' + err.message;
console.error('Error accessing microphone:', err);
});
}
function startRecording() {
if (!isRecording && sessionActive) {
audioChunks = [];
mediaRecorder.start(100); // Collect data in 100ms chunks
recordButton.classList.add('recording');
recordButton.textContent = 'Release to Stop';
status.textContent = 'Recording...';
audioWave.classList.remove('hidden');
isRecording = true;
socket.emit('start_speaking');
// Start sending audio chunks periodically
audioSendInterval = setInterval(() => {
if (mediaRecorder.state === 'recording') {
mediaRecorder.requestData(); // Force ondataavailable to fire
}
}, 300); // Send every 300ms
}
}
function stopRecording() {
if (isRecording) {
clearInterval(audioSendInterval);
mediaRecorder.stop();
recordButton.classList.remove('recording');
recordButton.textContent = 'Hold to Speak';
status.textContent = 'Processing speech...';
audioWave.classList.add('hidden');
isRecording = false;
}
}
function processRecording() {
if (audioChunks.length === 0) {
status.textContent = 'No audio recorded';
return;
}
const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
// Convert to ArrayBuffer for processing
const fileReader = new FileReader();
fileReader.onloadend = () => {
try {
const arrayBuffer = fileReader.result;
// Convert to Float32Array - this works better with WebAudio API
const audioData = convertToFloat32(arrayBuffer);
// Convert to base64 for sending
const base64String = arrayBufferToBase64(audioData.buffer);
socket.emit('audio_chunk', { audio: base64String });
// Signal end of speech
socket.emit('stop_speaking');
} catch (e) {
console.error('Error processing audio:', e);
status.textContent = 'Error processing audio';
}
};
fileReader.onerror = () => {
status.textContent = 'Error reading audio data';
};
fileReader.readAsArrayBuffer(audioBlob);
}
function convertToFloat32(arrayBuffer) {
// Get raw audio data as Int16 (common format for audio)
const int16Array = new Int16Array(arrayBuffer);
// Convert to Float32 (normalize between -1 and 1)
const float32Array = new Float32Array(int16Array.length);
for (let i = 0; i < int16Array.length; i++) {
float32Array[i] = int16Array[i] / 32768.0;
}
return float32Array;
}
function addMessage(sender, text) {
const containerDiv = document.createElement('div');
containerDiv.className = sender === 'user' ? 'message-container user-message-container' : 'message-container bot-message-container';
const labelDiv = document.createElement('div');
labelDiv.className = 'message-label';
labelDiv.textContent = sender === 'user' ? 'You' : 'Assistant';
containerDiv.appendChild(labelDiv);
const messageDiv = document.createElement('div');
messageDiv.className = sender === 'user' ? 'message user-message' : 'message bot-message';
messageDiv.textContent = text;
containerDiv.appendChild(messageDiv);
if (sender === 'user') {
const infoDiv = document.createElement('div');
infoDiv.className = 'transcription-info';
infoDiv.textContent = 'Transcribed with Whisper';
containerDiv.appendChild(infoDiv);
}
conversation.appendChild(containerDiv);
conversation.scrollTop = conversation.scrollHeight;
}
function arrayBufferToBase64(buffer) {
let binary = '';
const bytes = new Uint8Array(buffer);
const len = bytes.byteLength;
for (let i = 0; i < len; i++) {
binary += String.fromCharCode(bytes[i]);
}
return window.btoa(binary);
}
// Handle page visibility change to avoid issues with background tabs
document.addEventListener('visibilitychange', () => {
if (document.hidden && isRecording) {
stopRecording();
}
});
// Clean disconnection when page is closed
window.addEventListener('beforeunload', () => {
if (socket && socket.connected) {
socket.disconnect();
}
});
</script>
</body> </body>
</html> </html>
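Both the inline script above and the rewritten client ship microphone audio to the server as base64-encoded 16-bit PCM (the old handler even reinterprets the MediaRecorder webm blob as raw Int16 samples, which the rewrite avoids by capturing raw frames instead). Below is a minimal sketch of the matching server-side decode/encode, assuming 16-bit little-endian mono samples; the helper names are illustrative and not part of the commit.

# Sketch only: decode/encode the base64 int16 PCM payload the client sends.
import base64

import numpy as np


def decode_pcm16(b64_audio: str) -> np.ndarray:
    """Decode base64 16-bit little-endian mono PCM into float32 samples in [-1, 1)."""
    raw = base64.b64decode(b64_audio)
    samples = np.frombuffer(raw, dtype=np.int16)
    return samples.astype(np.float32) / 32768.0


def encode_pcm16(samples: np.ndarray) -> str:
    """Inverse direction: clamp float samples and re-encode as base64 int16 PCM."""
    clipped = np.clip(samples, -1.0, 1.0)
    return base64.b64encode((clipped * 32767.0).astype(np.int16).tobytes()).decode("ascii")

Dividing by 32768 on decode and multiplying by 32767 on encode keeps every value inside the int16 range without overflow.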


@@ -8,15 +8,14 @@ import numpy as np
from flask import Flask, render_template, request from flask import Flask, render_template, request
from flask_socketio import SocketIO, emit from flask_socketio import SocketIO, emit
from transformers import AutoModelForCausalLM, AutoTokenizer from transformers import AutoModelForCausalLM, AutoTokenizer
from collections import deque import threading
import queue
import requests import requests
import huggingface_hub import huggingface_hub
from generator import load_csm_1b, Segment from generator import load_csm_1b, Segment
from collections import deque
# Force CPU mode regardless of what's available import json
# This bypasses the CUDA/cuDNN library requirements import webrtcvad # For voice activity detection
os.environ["CUDA_VISIBLE_DEVICES"] = "" # Hide all CUDA devices
torch.backends.cudnn.enabled = False # Disable cuDNN
# Configure environment with longer timeouts # Configure environment with longer timeouts
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600" # 10 minutes timeout for downloads os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600" # 10 minutes timeout for downloads
@@ -27,28 +26,92 @@ os.makedirs("models", exist_ok=True)
app = Flask(__name__) app = Flask(__name__)
app.config['SECRET_KEY'] = 'your-secret-key' app.config['SECRET_KEY'] = 'your-secret-key'
socketio = SocketIO(app, cors_allowed_origins="*") socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')
# Force CPU regardless of what hardware is available # Explicitly check for CUDA and print detailed info
device = "cuda" if torch.cuda.is_available() else "cpu" print("\n=== CUDA Information ===")
whisper_compute_type = "int8" if torch.cuda.is_available():
print(f"Forcing CPU mode for all models") print(f"CUDA is available")
print(f"CUDA version: {torch.version.cuda}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
print("CUDA is not available")
# Check for cuDNN
try:
import ctypes
ctypes.CDLL("libcudnn_ops_infer.so.8")
print("cuDNN is available")
except:
print("cuDNN is not available (libcudnn_ops_infer.so.8 not found)")
# Determine compute device
try:
if torch.cuda.is_available():
device = "cuda"
whisper_compute_type = "float16"
print("🟢 CUDA is available and initialized successfully")
elif torch.backends.mps.is_available():
device = "mps"
whisper_compute_type = "float32"
print("🟢 MPS is available (Apple Silicon)")
else:
device = "cpu"
whisper_compute_type = "int8"
print("🟡 Using CPU (CUDA/MPS not available)")
except Exception as e:
print(f"🔴 Error initializing CUDA: {e}")
print("🔴 Falling back to CPU")
device = "cpu"
whisper_compute_type = "int8"
print(f"Using device: {device}")
# Initialize models with proper error handling # Initialize models with proper error handling
whisper_model = None whisper_model = None
csm_generator = None csm_generator = None
llm_model = None llm_model = None
llm_tokenizer = None llm_tokenizer = None
vad = None
# Constants
SAMPLE_RATE = 16000 # For VAD
VAD_FRAME_SIZE = 480 # 30ms at 16kHz for VAD
VAD_MODE = 3 # Aggressive mode for better results
AUDIO_CHUNK_SIZE = 2400 # 100ms chunks when streaming AI voice
# Audio sample rates
CLIENT_SAMPLE_RATE = 44100 # Browser WebAudio default
WHISPER_SAMPLE_RATE = 16000 # Whisper expects 16kHz
# Session data structures
user_sessions = {} # session_id -> complete session data
# WebRTC ICE servers (STUN/TURN servers for NAT traversal)
ICE_SERVERS = [
{"urls": "stun:stun.l.google.com:19302"},
{"urls": "stun:stun1.l.google.com:19302"}
]
def load_models(): def load_models():
global whisper_model, csm_generator, llm_model, llm_tokenizer """Load all necessary models"""
global whisper_model, csm_generator, llm_model, llm_tokenizer, vad
# Initialize Voice Activity Detector
try:
vad = webrtcvad.Vad(VAD_MODE)
print("Voice Activity Detector initialized")
except Exception as e:
print(f"Error initializing VAD: {e}")
vad = None
# Initialize Faster-Whisper for transcription # Initialize Faster-Whisper for transcription
try: try:
print("Loading Whisper model on CPU...") print("Loading Whisper model...")
# Import here to avoid immediate import errors if package is missing
from faster_whisper import WhisperModel from faster_whisper import WhisperModel
whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8", download_root="./models/whisper") whisper_model = WhisperModel("base", device=device, compute_type=whisper_compute_type, download_root="./models/whisper")
print("Whisper model loaded successfully") print("Whisper model loaded successfully")
except Exception as e: except Exception as e:
print(f"Error loading Whisper model: {e}") print(f"Error loading Whisper model: {e}")
@@ -56,8 +119,8 @@ def load_models():
# Initialize CSM model for audio generation # Initialize CSM model for audio generation
try: try:
print("Loading CSM model on CPU...") print("Loading CSM model...")
csm_generator = load_csm_1b(device="cpu") csm_generator = load_csm_1b(device=device)
print("CSM model loaded successfully") print("CSM model loaded successfully")
except Exception as e: except Exception as e:
print(f"Error loading CSM model: {e}") print(f"Error loading CSM model: {e}")
@@ -65,13 +128,14 @@ def load_models():
# Initialize Llama 3.2 model for response generation # Initialize Llama 3.2 model for response generation
try: try:
print("Loading Llama 3.2 model on CPU...") print("Loading Llama 3.2 model...")
llm_model_id = "meta-llama/Llama-3.2-1B" # Choose appropriate size based on resources llm_model_id = "meta-llama/Llama-3.2-1B"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id, cache_dir="./models/llama") llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id, cache_dir="./models/llama")
dtype = torch.bfloat16 if device != "cpu" else torch.float32
llm_model = AutoModelForCausalLM.from_pretrained( llm_model = AutoModelForCausalLM.from_pretrained(
llm_model_id, llm_model_id,
torch_dtype=torch.float32, # Use float32 on CPU torch_dtype=dtype,
device_map="cpu", device_map=device,
cache_dir="./models/llama", cache_dir="./models/llama",
low_cpu_mem_usage=True low_cpu_mem_usage=True
) )
@@ -80,168 +144,344 @@ def load_models():
print(f"Error loading Llama 3.2 model: {e}") print(f"Error loading Llama 3.2 model: {e}")
print("Will use a fallback response generation method") print("Will use a fallback response generation method")
# Store conversation context
conversation_context = {} # session_id -> context
@app.route('/') @app.route('/')
def index(): def index():
"""Serve the main interface"""
return render_template('index.html') return render_template('index.html')
@app.route('/voice-chat.js')
def voice_chat_js():
"""Serve the JavaScript for voice chat"""
return app.send_static_file('voice-chat.js')
@socketio.on('connect') @socketio.on('connect')
def handle_connect(): def handle_connect():
print(f"Client connected: {request.sid}") """Handle new client connection"""
conversation_context[request.sid] = { session_id = request.sid
print(f"Client connected: {session_id}")
# Initialize session data
user_sessions[session_id] = {
# Conversation context
'segments': [], 'segments': [],
'speakers': [0, 1], # 0 = user, 1 = bot 'conversation_history': [],
'audio_buffer': deque(maxlen=10), # Store recent audio chunks 'is_turn_active': False,
'is_speaking': False,
'silence_start': None # Audio buffers and state
'vad_buffer': deque(maxlen=30), # ~1s of audio at 30fps
'audio_buffer': bytearray(),
'is_user_speaking': False,
'last_vad_active': time.time(),
'silence_duration': 0,
'speech_frames': 0,
# AI state
'is_ai_speaking': False,
'should_interrupt_ai': False,
'ai_stream_queue': queue.Queue(),
# WebRTC status
'webrtc_connected': False,
'webrtc_peer_id': None,
# Processing flags
'is_processing': False,
'pending_user_audio': None
} }
emit('ready', {'message': 'Connection established'})
# Send config to client
emit('session_ready', {
'whisper_available': whisper_model is not None,
'csm_available': csm_generator is not None,
'llm_available': llm_model is not None,
'client_sample_rate': CLIENT_SAMPLE_RATE,
'server_sample_rate': getattr(csm_generator, 'sample_rate', 24000) if csm_generator else 24000,
'ice_servers': ICE_SERVERS
})
@socketio.on('disconnect') @socketio.on('disconnect')
def handle_disconnect(): def handle_disconnect():
print(f"Client disconnected: {request.sid}") """Handle client disconnection"""
if request.sid in conversation_context: session_id = request.sid
del conversation_context[request.sid] print(f"Client disconnected: {session_id}")
@socketio.on('start_speaking') # Clean up resources
def handle_start_speaking(): if session_id in user_sessions:
if request.sid in conversation_context: # Signal any running threads to stop
conversation_context[request.sid]['is_speaking'] = True user_sessions[session_id]['should_interrupt_ai'] = True
conversation_context[request.sid]['audio_buffer'].clear()
print(f"User {request.sid} started speaking")
@socketio.on('audio_chunk') # Clean up resources
def handle_audio_chunk(data): del user_sessions[session_id]
if request.sid not in conversation_context:
@socketio.on('webrtc_signal')
def handle_webrtc_signal(data):
"""Handle WebRTC signaling for P2P connection establishment"""
session_id = request.sid
if session_id not in user_sessions:
return return
context = conversation_context[request.sid] # Simply relay the signal to the client
# In a multi-user app, we would route this to the correct peer
emit('webrtc_signal', data)
@socketio.on('webrtc_connected')
def handle_webrtc_connected(data):
"""Client notifies that WebRTC connection is established"""
session_id = request.sid
if session_id not in user_sessions:
return
user_sessions[session_id]['webrtc_connected'] = True
print(f"WebRTC connected for session {session_id}")
emit('ready_for_speech', {'message': 'Ready to start conversation'})
@socketio.on('audio_stream')
def handle_audio_stream(data):
"""Process incoming audio stream packets from client"""
session_id = request.sid
if session_id not in user_sessions:
return
session = user_sessions[session_id]
try:
# Decode audio data # Decode audio data
audio_data = base64.b64decode(data['audio']) audio_bytes = base64.b64decode(data.get('audio', ''))
audio_numpy = np.frombuffer(audio_data, dtype=np.float32) if not audio_bytes or len(audio_bytes) < 2: # Need at least one sample
audio_tensor = torch.tensor(audio_numpy)
# Add to buffer
context['audio_buffer'].append(audio_tensor)
# Check for silence to detect end of speech
if context['is_speaking'] and is_silence(audio_tensor):
if context['silence_start'] is None:
context['silence_start'] = time.time()
elif time.time() - context['silence_start'] > 1.0: # 1 second of silence
# Process the complete utterance
process_user_utterance(request.sid)
else:
context['silence_start'] = None
@socketio.on('stop_speaking')
def handle_stop_speaking():
if request.sid in conversation_context:
conversation_context[request.sid]['is_speaking'] = False
process_user_utterance(request.sid)
print(f"User {request.sid} stopped speaking")
def is_silence(audio_tensor, threshold=0.02):
"""Check if an audio chunk is silence based on amplitude threshold"""
return torch.mean(torch.abs(audio_tensor)) < threshold
def process_user_utterance(session_id):
"""Process completed user utterance, generate response and send audio back"""
context = conversation_context[session_id]
if not context['audio_buffer']:
return return
# Combine audio chunks # Add to current audio buffer
full_audio = torch.cat(list(context['audio_buffer']), dim=0) session['audio_buffer'] += audio_bytes
context['audio_buffer'].clear()
context['is_speaking'] = False
context['silence_start'] = None
# Save audio to temporary WAV file for transcription # Check for speech using VAD
has_speech = detect_speech(audio_bytes, session_id)
# Handle speech state machine
if has_speech:
# Reset silence tracking when speech is detected
session['last_vad_active'] = time.time()
session['silence_duration'] = 0
session['speech_frames'] += 1
# If not already marked as speaking and we have enough speech frames
if not session['is_user_speaking'] and session['speech_frames'] >= 5:
on_speech_started(session_id)
else:
# No speech detected in this frame
if session['is_user_speaking']:
# Calculate silence duration
now = time.time()
session['silence_duration'] = now - session['last_vad_active']
# If silent for more than 0.5 seconds, end speech segment
if session['silence_duration'] > 0.8 and session['speech_frames'] > 8:
on_speech_ended(session_id)
else:
# Not speaking and no speech, just a silent frame
session['speech_frames'] = max(0, session['speech_frames'] - 1)
except Exception as e:
print(f"Error processing audio stream: {e}")
def detect_speech(audio_bytes, session_id):
"""Use VAD to check if audio contains speech"""
if session_id not in user_sessions:
return False
session = user_sessions[session_id]
# Store in VAD buffer for history
session['vad_buffer'].append(audio_bytes)
if vad is None:
# Fallback to simple energy detection
audio_data = np.frombuffer(audio_bytes, dtype=np.int16)
energy = np.mean(np.abs(audio_data)) / 32768.0
return energy > 0.015 # Simple threshold
try:
# Ensure we have the right amount of data for VAD
audio_data = np.frombuffer(audio_bytes, dtype=np.int16)
# If we have too much data, use just the right amount
if len(audio_data) >= VAD_FRAME_SIZE:
frame = audio_data[:VAD_FRAME_SIZE].tobytes()
return vad.is_speech(frame, SAMPLE_RATE)
# If too little data, accumulate in the VAD buffer and check periodically
if len(session['vad_buffer']) >= 3:
# Combine recent chunks to get enough data
combined = bytearray()
for chunk in list(session['vad_buffer'])[-3:]:
combined.extend(chunk)
# Extract the right amount of data
if len(combined) >= VAD_FRAME_SIZE:
frame = combined[:VAD_FRAME_SIZE]
return vad.is_speech(bytes(frame), SAMPLE_RATE)
return False
except Exception as e:
print(f"VAD error: {e}")
return False
def on_speech_started(session_id):
"""Handle start of user speech"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
# Reset audio buffer
session['audio_buffer'] = bytearray()
session['is_user_speaking'] = True
session['is_turn_active'] = True
# If AI is speaking, we need to interrupt it
if session['is_ai_speaking']:
session['should_interrupt_ai'] = True
emit('ai_interrupted_by_user', room=session_id)
# Notify client that we detected speech
emit('user_speech_start', room=session_id)
def on_speech_ended(session_id):
"""Handle end of user speech segment"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
# Mark as not speaking anymore
session['is_user_speaking'] = False
session['speech_frames'] = 0
# If no audio or already processing, skip
if len(session['audio_buffer']) < 4000 or session['is_processing']: # At least 250ms of audio
session['audio_buffer'] = bytearray()
return
# Mark as processing to prevent multiple processes
session['is_processing'] = True
# Create a copy of the audio buffer
audio_copy = session['audio_buffer']
session['audio_buffer'] = bytearray()
# Convert audio to the format needed for processing
try:
# Convert to float32 between -1 and 1
audio_np = np.frombuffer(audio_copy, dtype=np.int16).astype(np.float32) / 32768.0
audio_tensor = torch.from_numpy(audio_np)
# Resample to Whisper's expected sample rate if necessary
if CLIENT_SAMPLE_RATE != WHISPER_SAMPLE_RATE:
audio_tensor = torchaudio.functional.resample(
audio_tensor,
orig_freq=CLIENT_SAMPLE_RATE,
new_freq=WHISPER_SAMPLE_RATE
)
# Save as WAV for transcription
temp_audio_path = f"temp_audio_{session_id}.wav" temp_audio_path = f"temp_audio_{session_id}.wav"
torchaudio.save( torchaudio.save(
temp_audio_path, temp_audio_path,
full_audio.unsqueeze(0), audio_tensor.unsqueeze(0),
44100 # Assuming 44.1kHz from client WHISPER_SAMPLE_RATE
) )
try: # Start transcription and response process in a thread
# Try using Whisper first if available threading.Thread(
if whisper_model is not None: target=process_user_utterance,
user_text = transcribe_with_whisper(temp_audio_path) args=(session_id, temp_audio_path, audio_tensor),
else: daemon=True
# Fallback to Google's speech recognition ).start()
user_text = transcribe_with_google(temp_audio_path)
if not user_text: # Notify client that processing has started
print("No speech detected.") emit('processing_speech', room=session_id)
emit('error', {'message': 'No speech detected. Please try again.'}, room=session_id)
except Exception as e:
print(f"Error preparing audio: {e}")
session['is_processing'] = False
emit('error', {'message': f'Error processing audio: {str(e)}'}, room=session_id)
def process_user_utterance(session_id, audio_path, audio_tensor):
"""Process user utterance, transcribe and generate response"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
try:
# Transcribe audio
if whisper_model is not None:
user_text = transcribe_with_whisper(audio_path)
else:
# Fallback to another transcription service
user_text = transcribe_fallback(audio_path)
# Clean up temp file
if os.path.exists(audio_path):
os.remove(audio_path)
# Check if we got meaningful text
if not user_text or len(user_text.strip()) < 2:
emit('no_speech_detected', room=session_id)
session['is_processing'] = False
return return
print(f"Transcribed: {user_text}") print(f"Transcribed: {user_text}")
# Add to conversation segments # Create user segment
user_segment = Segment( user_segment = Segment(
text=user_text, text=user_text,
speaker=0, # User is speaker 0 speaker=0, # User is speaker 0
audio=full_audio audio=audio_tensor
) )
context['segments'].append(user_segment) session['segments'].append(user_segment)
# Generate bot response # Update conversation history
bot_response = generate_llm_response(user_text, context['segments']) session['conversation_history'].append({
print(f"Bot response: {bot_response}") 'role': 'user',
'text': user_text
})
# Send transcribed text to client # Send transcription to client
emit('transcription', {'text': user_text}, room=session_id) emit('transcription', {'text': user_text}, room=session_id)
# Generate and send audio response if CSM is available # Generate AI response
ai_response = generate_ai_response(user_text, session_id)
# Send text response to client
emit('ai_response_text', {'text': ai_response}, room=session_id)
# Update conversation history
session['conversation_history'].append({
'role': 'assistant',
'text': ai_response
})
# Generate voice response if CSM is available
if csm_generator is not None: if csm_generator is not None:
# Convert to audio using CSM session['is_ai_speaking'] = True
bot_audio = generate_audio_response(bot_response, context['segments']) session['should_interrupt_ai'] = False
# Convert audio to base64 for sending over websocket # Begin streaming audio response
audio_bytes = io.BytesIO() threading.Thread(
torchaudio.save(audio_bytes, bot_audio.unsqueeze(0).cpu(), csm_generator.sample_rate, format="wav") target=stream_ai_response,
audio_bytes.seek(0) args=(ai_response, session_id),
audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8') daemon=True
).start()
# Add bot response to conversation history
bot_segment = Segment(
text=bot_response,
speaker=1, # Bot is speaker 1
audio=bot_audio
)
context['segments'].append(bot_segment)
# Send audio response to client
emit('audio_response', {
'audio': audio_b64,
'text': bot_response
}, room=session_id)
else:
# Send text-only response if audio generation isn't available
emit('text_response', {'text': bot_response}, room=session_id)
# Add text-only bot response to conversation history
bot_segment = Segment(
text=bot_response,
speaker=1, # Bot is speaker 1
audio=torch.zeros(1) # Placeholder empty audio
)
context['segments'].append(bot_segment)
except Exception as e: except Exception as e:
print(f"Error processing speech: {e}") print(f"Error processing utterance: {e}")
emit('error', {'message': f'Error processing speech: {str(e)}'}, room=session_id) emit('error', {'message': f'Error: {str(e)}'}, room=session_id)
finally: finally:
# Cleanup temp file # Clear processing flag
if os.path.exists(temp_audio_path): if session_id in user_sessions:
os.remove(temp_audio_path) session['is_processing'] = False
def transcribe_with_whisper(audio_path): def transcribe_with_whisper(audio_path):
"""Transcribe audio using Faster-Whisper""" """Transcribe audio using Faster-Whisper"""
@@ -250,14 +490,13 @@ def transcribe_with_whisper(audio_path):
# Collect all text from segments # Collect all text from segments
user_text = "" user_text = ""
for segment in segments: for segment in segments:
segment_text = segment.text.strip() user_text += segment.text.strip() + " "
print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment_text}")
user_text += segment_text + " "
return user_text.strip() return user_text.strip()
def transcribe_with_google(audio_path): def transcribe_fallback(audio_path):
"""Fallback transcription using Google's speech recognition""" """Fallback transcription using Google's speech recognition"""
try:
import speech_recognition as sr import speech_recognition as sr
recognizer = sr.Recognizer() recognizer = sr.Recognizer()
@@ -269,28 +508,40 @@ def transcribe_with_google(audio_path):
except sr.UnknownValueError: except sr.UnknownValueError:
return "" return ""
except sr.RequestError: except sr.RequestError:
# If Google API fails, try a basic energy-based VAD approach return "[Speech recognition service unavailable]"
# This is a very basic fallback and won't give good results except ImportError:
return "[Speech detected but transcription failed]" return "[Speech recognition not available]"
def generate_ai_response(user_text, session_id):
"""Generate text response using available LLM"""
if session_id not in user_sessions:
return "I'm sorry, your session has expired."
session = user_sessions[session_id]
def generate_llm_response(user_text, conversation_segments):
"""Generate text response using available model"""
if llm_model is not None and llm_tokenizer is not None: if llm_model is not None and llm_tokenizer is not None:
# Format conversation history for the LLM # Format conversation history for the LLM
conversation_history = "" prompt = "You are a helpful, friendly voice assistant. Keep your responses brief and conversational.\n\n"
for segment in conversation_segments[-5:]: # Use last 5 utterances for context
speaker_name = "User" if segment.speaker == 0 else "Assistant"
conversation_history += f"{speaker_name}: {segment.text}\n"
# Add the current user query # Add recent conversation history (last 6 turns maximum)
conversation_history += f"User: {user_text}\nAssistant:" for entry in session['conversation_history'][-6:]:
if entry['role'] == 'user':
prompt += f"User: {entry['text']}\n"
else:
prompt += f"Assistant: {entry['text']}\n"
# Add current query if not already in history
if not session['conversation_history'] or session['conversation_history'][-1]['role'] != 'user':
prompt += f"User: {user_text}\n"
prompt += "Assistant: "
try: try:
# Generate response # Generate response
inputs = llm_tokenizer(conversation_history, return_tensors="pt").to(device) inputs = llm_tokenizer(prompt, return_tensors="pt").to(device)
output = llm_model.generate( output = llm_model.generate(
inputs.input_ids, inputs.input_ids,
max_new_tokens=150, max_new_tokens=100, # Keep responses shorter for voice
temperature=0.7, temperature=0.7,
top_p=0.9, top_p=0.9,
do_sample=True do_sample=True
@@ -298,40 +549,48 @@ def generate_llm_response(user_text, conversation_segments):
response = llm_tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) response = llm_tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
return response.strip() return response.strip()
except Exception as e: except Exception as e:
print(f"Error generating response with LLM: {e}") print(f"Error generating LLM response: {e}")
return fallback_response(user_text) return fallback_response(user_text)
else: else:
return fallback_response(user_text) return fallback_response(user_text)
def fallback_response(user_text): def fallback_response(user_text):
"""Generate a simple fallback response when LLM is not available""" """Generate simple fallback responses when LLM is unavailable"""
# Simple rule-based responses
user_text_lower = user_text.lower() user_text_lower = user_text.lower()
if "hello" in user_text_lower or "hi" in user_text_lower: if "hello" in user_text_lower or "hi" in user_text_lower:
return "Hello! I'm a simple fallback assistant. The main language model couldn't be loaded, so I have limited capabilities." return "Hello! How can I help you today?"
elif "how are you" in user_text_lower: elif "how are you" in user_text_lower:
return "I'm functioning within my limited capabilities. How can I assist you today?" return "I'm doing well, thanks for asking! How about you?"
elif "thank" in user_text_lower: elif "thank" in user_text_lower:
return "You're welcome! Let me know if there's anything else I can help with." return "You're welcome! Happy to help."
elif "bye" in user_text_lower or "goodbye" in user_text_lower: elif "bye" in user_text_lower or "goodbye" in user_text_lower:
return "Goodbye! Have a great day!" return "Goodbye! Have a great day!"
elif any(q in user_text_lower for q in ["what", "who", "where", "when", "why", "how"]): elif any(q in user_text_lower for q in ["what", "who", "where", "when", "why", "how"]):
return "I'm running in fallback mode and can't answer complex questions. Please try again when the main language model is available." return "That's an interesting question. I wish I could provide a better answer in my current fallback mode."
else: else:
return "I understand you said something about that. Unfortunately, I'm running in fallback mode with limited capabilities. Please try again later when the main model is available." return "I see. Tell me more about that."
def stream_ai_response(text, session_id):
"""Generate and stream audio response in real-time chunks"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
def generate_audio_response(text, conversation_segments):
"""Generate audio response using CSM"""
try: try:
# Use the last few conversation segments as context # Signal start of AI speech
context_segments = conversation_segments[-4:] if len(conversation_segments) > 4 else conversation_segments emit('ai_speech_start', room=session_id)
# Use the last few conversation segments as context (up to 4)
context_segments = session['segments'][-4:] if len(session['segments']) > 4 else session['segments']
# Generate audio for bot response # Generate audio for bot response
audio = csm_generator.generate( audio = csm_generator.generate(
@@ -343,11 +602,77 @@ def generate_audio_response(text, conversation_segments):
topk=50 topk=50
) )
return audio # Create and store bot segment
bot_segment = Segment(
text=text,
speaker=1,
audio=audio
)
if session_id in user_sessions:
session['segments'].append(bot_segment)
# Stream audio in small chunks for more responsive playback
chunk_size = AUDIO_CHUNK_SIZE # Size defined in constants
for i in range(0, len(audio), chunk_size):
# Check if we should stop (user interrupted)
if session_id not in user_sessions or session['should_interrupt_ai']:
print("AI speech interrupted")
break
# Get next chunk
chunk = audio[i:i+chunk_size]
# Convert audio chunk to base64 for streaming
audio_bytes = io.BytesIO()
torchaudio.save(audio_bytes, chunk.unsqueeze(0).cpu(), csm_generator.sample_rate, format="wav")
audio_bytes.seek(0)
audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8')
# Send chunk to client
socketio.emit('ai_speech_chunk', {
'audio': audio_b64,
'is_last': i + chunk_size >= len(audio)
}, room=session_id)
# Small sleep for more natural pacing
time.sleep(0.06) # Slight delay for smoother playback
# Signal end of AI speech
if session_id in user_sessions:
session['is_ai_speaking'] = False
session['is_turn_active'] = False # End conversation turn
socketio.emit('ai_speech_end', room=session_id)
except Exception as e: except Exception as e:
print(f"Error generating audio: {e}") print(f"Error streaming AI response: {e}")
# Return silence as fallback if session_id in user_sessions:
return torch.zeros(csm_generator.sample_rate * 3) # 3 seconds of silence session['is_ai_speaking'] = False
session['is_turn_active'] = False
socketio.emit('error', {'message': f'Error generating audio: {str(e)}'}, room=session_id)
socketio.emit('ai_speech_end', room=session_id)
@socketio.on('interrupt_ai')
def handle_interrupt():
"""Handle explicit AI interruption request from client"""
session_id = request.sid
if session_id in user_sessions:
user_sessions[session_id]['should_interrupt_ai'] = True
emit('ai_interrupted', room=session_id)
@socketio.on('get_config')
def handle_get_config():
"""Send configuration to client"""
session_id = request.sid
if session_id in user_sessions:
emit('config', {
'client_sample_rate': CLIENT_SAMPLE_RATE,
'server_sample_rate': getattr(csm_generator, 'sample_rate', 24000) if csm_generator else 24000,
'whisper_available': whisper_model is not None,
'csm_available': csm_generator is not None,
'ice_servers': ICE_SERVERS
})
if __name__ == '__main__': if __name__ == '__main__':
# Ensure the existing index.html file is in the correct location # Ensure the existing index.html file is in the correct location
@@ -357,9 +682,8 @@ if __name__ == '__main__':
if os.path.exists('index.html') and not os.path.exists('templates/index.html'): if os.path.exists('index.html') and not os.path.exists('templates/index.html'):
os.rename('index.html', 'templates/index.html') os.rename('index.html', 'templates/index.html')
# Load models asynchronously before starting the server # Load models before starting the server
print("Starting CPU-only model loading...") print("Starting model loading...")
# In a production environment, you could load models in a separate thread
load_models() load_models()
# Start the server # Start the server
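The audio_stream handler above drives a small state machine on top of webrtcvad, which only accepts 10, 20, or 30 ms frames of 16-bit mono PCM at 8, 16, 32, or 48 kHz; VAD_FRAME_SIZE = 480 samples is exactly 30 ms at 16 kHz. A sketch of that framing step follows, with an illustrative helper name (note the browser stream in this commit is captured at 44.1 kHz, so frames would need resampling to 16 kHz for the durations to line up).

# Sketch: frame 16 kHz, 16-bit mono PCM into the 30 ms frames webrtcvad expects.
# Constants mirror the commit's VAD settings; this is not the committed code.
import webrtcvad

SAMPLE_RATE = 16000
FRAME_SAMPLES = 480              # 30 ms at 16 kHz (webrtcvad allows 10/20/30 ms)
FRAME_BYTES = FRAME_SAMPLES * 2  # int16 -> 2 bytes per sample


def speech_frames(pcm: bytes, aggressiveness: int = 3):
    """Yield (frame, is_speech) for each complete 30 ms frame in pcm."""
    vad = webrtcvad.Vad(aggressiveness)
    for offset in range(0, len(pcm) - FRAME_BYTES + 1, FRAME_BYTES):
        frame = pcm[offset:offset + FRAME_BYTES]
        yield frame, vad.is_speech(frame, SAMPLE_RATE)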

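Once a silence gap ends the turn, the buffered int16 audio is normalized, resampled from the browser rate to Whisper's 16 kHz, and transcribed. A sketch of that path using faster-whisper directly on an in-memory array, under the commit's sample-rate assumptions; the model size and device here are placeholders, and the array input merely avoids the temp-WAV round trip the commit uses.

# Sketch: normalize, resample, and transcribe a buffered utterance with faster-whisper.
import numpy as np
import torch
import torchaudio
from faster_whisper import WhisperModel

CLIENT_SAMPLE_RATE = 44100   # browser capture rate in this commit
WHISPER_SAMPLE_RATE = 16000  # rate Whisper expects

whisper_model = WhisperModel("base", device="cpu", compute_type="int8")


def transcribe_utterance(pcm_bytes: bytes) -> str:
    samples = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32) / 32768.0
    waveform = torchaudio.functional.resample(
        torch.from_numpy(samples),
        orig_freq=CLIENT_SAMPLE_RATE,
        new_freq=WHISPER_SAMPLE_RATE,
    )
    segments, _info = whisper_model.transcribe(waveform.numpy())
    return " ".join(segment.text.strip() for segment in segments)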
Backend/voice-chat.js (new file, 560 lines)

@@ -0,0 +1,560 @@
document.addEventListener('DOMContentLoaded', () => {
// DOM Elements
const startButton = document.getElementById('start-button');
const interruptButton = document.getElementById('interrupt-button');
const conversationDiv = document.getElementById('conversation');
const connectionDot = document.getElementById('connection-dot');
const connectionStatus = document.getElementById('connection-status');
const whisperStatus = document.getElementById('whisper-status');
const csmStatus = document.getElementById('csm-status');
const llmStatus = document.getElementById('llm-status');
const webrtcStatus = document.getElementById('webrtc-status');
const micAnimation = document.getElementById('mic-animation');
const loadingDiv = document.getElementById('loading');
const loadingText = document.getElementById('loading-text');
// State variables
let socket;
let isConnected = false;
let isListening = false;
let isAiSpeaking = false;
let audioContext;
let mediaStream;
let audioRecorder;
let audioProcessor;
const audioChunks = [];
// WebRTC variables
let peerConnection;
let dataChannel;
let hasActiveConnection = false;
// Audio playback
let audioQueue = [];
let isPlaying = false;
// Configuration variables
let serverSampleRate = 24000;
let clientSampleRate = 44100;
let iceServers = [];
// Initialize the application
initApp();
// Main initialization function
function initApp() {
updateConnectionStatus('connecting');
setupSocketConnection();
setupEventListeners();
}
// Set up Socket.IO connection with server
function setupSocketConnection() {
socket = io();
socket.on('connect', () => {
console.log('Connected to server');
updateConnectionStatus('connected');
isConnected = true;
});
socket.on('disconnect', () => {
console.log('Disconnected from server');
updateConnectionStatus('disconnected');
isConnected = false;
cleanupAudio();
cleanupWebRTC();
});
socket.on('session_ready', (data) => {
console.log('Session ready:', data);
updateModelStatus(data);
clientSampleRate = data.client_sample_rate;
serverSampleRate = data.server_sample_rate;
iceServers = data.ice_servers;
// Initialize WebRTC if models are available
if (data.whisper_available && data.llm_available) {
initializeWebRTC();
}
});
socket.on('ready_for_speech', (data) => {
console.log('Ready for speech:', data);
startButton.disabled = false;
addInfoMessage('Ready for conversation. Click "Start Listening" to begin.');
});
socket.on('webrtc_signal', (data) => {
handleWebRTCSignal(data);
});
socket.on('transcription', (data) => {
console.log('Transcription:', data);
addUserMessage(data.text);
loadingDiv.style.display = 'none';
});
socket.on('ai_response_text', (data) => {
console.log('AI response text:', data);
addAIMessage(data.text);
loadingDiv.style.display = 'none';
});
socket.on('ai_speech_start', () => {
console.log('AI started speaking');
isAiSpeaking = true;
interruptButton.disabled = false;
});
socket.on('ai_speech_chunk', (data) => {
console.log('Received AI speech chunk');
playAudioChunk(data.audio, data.is_last);
});
socket.on('ai_speech_end', () => {
console.log('AI stopped speaking');
isAiSpeaking = false;
interruptButton.disabled = true;
});
socket.on('user_speech_start', () => {
console.log('User speech detected');
showSpeakingIndicator(true);
});
socket.on('processing_speech', () => {
console.log('Processing speech');
showSpeakingIndicator(false);
showLoadingIndicator('Processing your speech...');
});
socket.on('no_speech_detected', () => {
console.log('No speech detected');
hideLoadingIndicator();
addInfoMessage('No speech detected. Please try again.');
});
socket.on('ai_interrupted', () => {
console.log('AI interrupted');
clearAudioQueue();
isAiSpeaking = false;
interruptButton.disabled = true;
});
socket.on('ai_interrupted_by_user', () => {
console.log('AI interrupted by user');
clearAudioQueue();
isAiSpeaking = false;
interruptButton.disabled = true;
addInfoMessage('AI interrupted by your speech');
});
socket.on('error', (data) => {
console.error('Server error:', data);
hideLoadingIndicator();
addInfoMessage(`Error: ${data.message}`);
});
}
// Set up UI event listeners
function setupEventListeners() {
startButton.addEventListener('click', toggleListening);
interruptButton.addEventListener('click', interruptAI);
}
// Update UI connection status
function updateConnectionStatus(status) {
connectionDot.className = 'status-dot ' + status;
switch (status) {
case 'connected':
connectionStatus.textContent = 'Connected';
break;
case 'connecting':
connectionStatus.textContent = 'Connecting...';
break;
case 'disconnected':
connectionStatus.textContent = 'Disconnected';
startButton.disabled = true;
interruptButton.disabled = true;
break;
}
}
// Update model status indicators
function updateModelStatus(data) {
whisperStatus.textContent = data.whisper_available ? 'Available' : 'Not Available';
whisperStatus.style.color = data.whisper_available ? 'green' : 'red';
csmStatus.textContent = data.csm_available ? 'Available' : 'Not Available';
csmStatus.style.color = data.csm_available ? 'green' : 'red';
llmStatus.textContent = data.llm_available ? 'Available' : 'Not Available';
llmStatus.style.color = data.llm_available ? 'green' : 'red';
}
// Initialize WebRTC connection
function initializeWebRTC() {
if (!isConnected) return;
const configuration = {
iceServers: iceServers
};
peerConnection = new RTCPeerConnection(configuration);
// Create data channel for WebRTC communication
dataChannel = peerConnection.createDataChannel('audioData', {
ordered: true
});
dataChannel.onopen = () => {
console.log('WebRTC data channel open');
hasActiveConnection = true;
webrtcStatus.textContent = 'Connected';
webrtcStatus.style.color = 'green';
socket.emit('webrtc_connected', { status: 'connected' });
};
dataChannel.onclose = () => {
console.log('WebRTC data channel closed');
hasActiveConnection = false;
webrtcStatus.textContent = 'Disconnected';
webrtcStatus.style.color = 'red';
};
// Handle ICE candidates
peerConnection.onicecandidate = (event) => {
if (event.candidate) {
socket.emit('webrtc_signal', {
type: 'ice_candidate',
candidate: event.candidate
});
}
};
// Log ICE connection state changes
peerConnection.oniceconnectionstatechange = () => {
console.log('ICE connection state:', peerConnection.iceConnectionState);
};
// Create offer
peerConnection.createOffer()
.then(offer => peerConnection.setLocalDescription(offer))
.then(() => {
socket.emit('webrtc_signal', {
type: 'offer',
sdp: peerConnection.localDescription
});
})
.catch(error => {
console.error('Error creating WebRTC offer:', error);
webrtcStatus.textContent = 'Failed to Connect';
webrtcStatus.style.color = 'red';
});
}
// Handle WebRTC signals from the server
function handleWebRTCSignal(data) {
if (!peerConnection) return;
if (data.type === 'answer') {
peerConnection.setRemoteDescription(new RTCSessionDescription(data.sdp))
.catch(error => console.error('Error setting remote description:', error));
}
else if (data.type === 'ice_candidate') {
peerConnection.addIceCandidate(new RTCIceCandidate(data.candidate))
.catch(error => console.error('Error adding ICE candidate:', error));
}
}
// Clean up WebRTC connection
function cleanupWebRTC() {
if (dataChannel) {
dataChannel.close();
}
if (peerConnection) {
peerConnection.close();
}
dataChannel = null;
peerConnection = null;
hasActiveConnection = false;
webrtcStatus.textContent = 'Not Connected';
webrtcStatus.style.color = 'red';
}
// Toggle audio listening
function toggleListening() {
if (isListening) {
stopListening();
} else {
startListening();
}
}
// Start listening for audio
async function startListening() {
if (!isConnected) return;
try {
await initAudio();
isListening = true;
startButton.textContent = 'Stop Listening';
startButton.innerHTML = `
<svg class="button-icon" viewBox="0 0 24 24" fill="white">
<path d="M6 6h12v12H6z"></path>
</svg>
Stop Listening
`;
} catch (error) {
console.error('Error starting audio:', error);
addInfoMessage('Error accessing microphone. Please check permissions.');
}
}
// Stop listening for audio
function stopListening() {
cleanupAudio();
isListening = false;
startButton.innerHTML = `
<svg class="button-icon" viewBox="0 0 24 24" fill="white">
<path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm5.91-3c-.49 0-.9.36-.98.85C16.52 14.2 14.47 16 12 16s-4.52-1.8-4.93-4.15c-.08-.49-.49-.85-.98-.85-.61 0-1.09.54-1 1.14.49 3 2.89 5.35 5.91 5.78V20c0 .55.45 1 1 1s1-.45 1-1v-2.08c3.02-.43 5.42-2.78 5.91-5.78.1-.6-.39-1.14-1-1.14z"></path>
</svg>
Start Listening
`;
showSpeakingIndicator(false);
}
// Initialize audio capture
async function initAudio() {
// Request microphone access
mediaStream = await navigator.mediaDevices.getUserMedia({
audio: {
sampleRate: clientSampleRate,
channelCount: 1,
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true
}
});
// Initialize AudioContext
audioContext = new (window.AudioContext || window.webkitAudioContext)({
sampleRate: clientSampleRate
});
// Create audio source from stream
const source = audioContext.createMediaStreamSource(mediaStream);
// Create ScriptProcessor for audio processing
const bufferSize = 4096;
audioProcessor = audioContext.createScriptProcessor(bufferSize, 1, 1);
// Process audio data
audioProcessor.onaudioprocess = (event) => {
if (!isListening || isAiSpeaking) return;
const input = event.inputBuffer.getChannelData(0);
const audioData = convertFloat32ToInt16(input);
sendAudioChunk(audioData);
};
// Connect the nodes
source.connect(audioProcessor);
audioProcessor.connect(audioContext.destination);
}
// Clean up audio resources
function cleanupAudio() {
if (audioProcessor) {
audioProcessor.disconnect();
audioProcessor = null;
}
if (mediaStream) {
mediaStream.getTracks().forEach(track => track.stop());
mediaStream = null;
}
if (audioContext && audioContext.state !== 'closed') {
audioContext.close().catch(error => console.error('Error closing AudioContext:', error));
}
audioChunks.length = 0;
}
// Convert Float32Array to Int16Array for sending to server
function convertFloat32ToInt16(float32Array) {
const int16Array = new Int16Array(float32Array.length);
for (let i = 0; i < float32Array.length; i++) {
// Convert float [-1.0, 1.0] to int16 [-32768, 32767]
int16Array[i] = Math.max(-32768, Math.min(32767, Math.floor(float32Array[i] * 32768)));
}
return int16Array;
}
// Send audio chunk to server
function sendAudioChunk(audioData) {
if (!isConnected || !isListening) return;
// Convert to base64 for transmission
const base64Audio = arrayBufferToBase64(audioData.buffer);
// Send via Socket.IO (could use WebRTC's DataChannel for lower latency in production)
socket.emit('audio_stream', { audio: base64Audio });
}
// Play audio chunk received from server
function playAudioChunk(base64Audio, isLast) {
const audioData = base64ToArrayBuffer(base64Audio);
// Add to queue
audioQueue.push({
data: audioData,
isLast: isLast
});
// Start playing if not already playing
if (!isPlaying) {
playNextAudioChunk();
}
}
// Play the next audio chunk in the queue
function playNextAudioChunk() {
if (audioQueue.length === 0) {
isPlaying = false;
return;
}
isPlaying = true;
const chunk = audioQueue.shift();
try {
// Create audio context if needed
if (!audioContext || audioContext.state === 'closed') {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
}
// Resume audio context if suspended
if (audioContext.state === 'suspended') {
audioContext.resume();
}
// Decode the WAV data
audioContext.decodeAudioData(chunk.data, (buffer) => {
const source = audioContext.createBufferSource();
source.buffer = buffer;
source.connect(audioContext.destination);
// When playback ends, play the next chunk
source.onended = () => {
playNextAudioChunk();
};
source.start(0);
// If it's the last chunk, update UI
if (chunk.isLast) {
setTimeout(() => {
isAiSpeaking = false;
interruptButton.disabled = true;
}, buffer.duration * 1000);
}
}, (error) => {
console.error('Error decoding audio data:', error);
playNextAudioChunk(); // Skip this chunk and try the next
});
} catch (error) {
console.error('Error playing audio chunk:', error);
playNextAudioChunk(); // Try the next chunk
}
}
// Clear the audio queue (used when interrupting)
function clearAudioQueue() {
audioQueue.length = 0;
isPlaying = false;
// Stop any currently playing audio
if (audioContext) {
audioContext.suspend();
}
}
// Send interrupt signal to server
function interruptAI() {
if (!isConnected || !isAiSpeaking) return;
socket.emit('interrupt_ai');
clearAudioQueue();
}
// Convert ArrayBuffer to Base64 string
function arrayBufferToBase64(buffer) {
const binary = new Uint8Array(buffer);
let base64 = '';
const len = binary.byteLength;
for (let i = 0; i < len; i++) {
base64 += String.fromCharCode(binary[i]);
}
return window.btoa(base64);
}
// Convert Base64 string to ArrayBuffer
function base64ToArrayBuffer(base64) {
const binaryString = window.atob(base64);
const len = binaryString.length;
const bytes = new Uint8Array(len);
for (let i = 0; i < len; i++) {
bytes[i] = binaryString.charCodeAt(i);
}
return bytes.buffer;
}
// Add user message to conversation
function addUserMessage(text) {
const messageDiv = document.createElement('div');
messageDiv.className = 'message user-message';
messageDiv.textContent = text;
conversationDiv.appendChild(messageDiv);
conversationDiv.scrollTop = conversationDiv.scrollHeight;
}
// Add AI message to conversation
function addAIMessage(text) {
const messageDiv = document.createElement('div');
messageDiv.className = 'message ai-message';
messageDiv.textContent = text;
conversationDiv.appendChild(messageDiv);
conversationDiv.scrollTop = conversationDiv.scrollHeight;
}
// Add info message to conversation
function addInfoMessage(text) {
const messageDiv = document.createElement('div');
messageDiv.className = 'info-message';
messageDiv.textContent = text;
conversationDiv.appendChild(messageDiv);
conversationDiv.scrollTop = conversationDiv.scrollHeight;
}
// Show/hide speaking indicator
function showSpeakingIndicator(show) {
micAnimation.style.display = show ? 'flex' : 'none';
}
// Show loading indicator
function showLoadingIndicator(text) {
loadingText.textContent = text || 'Processing...';
loadingDiv.style.display = 'block';
}
// Hide loading indicator
function hideLoadingIndicator() {
loadingDiv.style.display = 'none';
}
});
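On the response side, stream_ai_response slices the CSM waveform into roughly 100 ms chunks, wraps each one in its own WAV container, and emits it base64-encoded so the playback queue above (playAudioChunk / playNextAudioChunk) can start before the whole utterance is synthesized. A sketch of that chunking step; the function name is illustrative and the 24 kHz rate is the commit's default.

# Sketch: slice a synthesized waveform into ~100 ms chunks and package each as a
# base64-encoded WAV payload for streaming playback.
import base64
import io

import torch
import torchaudio

CHUNK_SAMPLES = 2400  # 100 ms at the 24 kHz CSM output rate assumed in the commit


def wav_chunks(audio: torch.Tensor, sample_rate: int = 24000):
    """Yield (base64_wav, is_last) pairs for a 1-D waveform tensor."""
    total = audio.numel()
    for start in range(0, total, CHUNK_SAMPLES):
        chunk = audio[start:start + CHUNK_SAMPLES]
        buf = io.BytesIO()
        torchaudio.save(buf, chunk.unsqueeze(0).cpu(), sample_rate, format="wav")
        buf.seek(0)
        yield base64.b64encode(buf.read()).decode("ascii"), start + CHUNK_SAMPLES >= total

Because every chunk is a self-contained WAV file, the browser can hand it straight to decodeAudioData; the trade-off is a small container header and a separate decode per chunk.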

Binary file not shown (added; 17 KiB).

Binary file not shown (added; 87 KiB).


@@ -13,8 +13,8 @@ const geistMono = Geist_Mono({
 });

 export const metadata: Metadata = {
-  title: "Create Next App",
-  description: "Generated by create next app",
+  title: "Fauxcall",
+  description: "Fauxcall is a fake call app that helps you get out of awkward situations.",
 };

 export default function RootLayout({

React/src/app/manifest.ts (new file, 25 lines)

@@ -0,0 +1,25 @@
import type { MetadataRoute } from 'next'
export default function manifest(): MetadataRoute.Manifest {
return {
name: 'Fauxcall',
short_name: 'Fauxcall',
description: 'A fake call app that helps you get out of awkward and dangerous situations.',
start_url: '/',
display: 'standalone',
background_color: '#ffffff',
theme_color: '#000000',
icons: [
{
src: '/icon-192x192.png',
sizes: '192x192',
type: 'image/png',
},
{
src: '/icon-512x512.png',
sizes: '512x512',
type: 'image/png',
},
],
}
}


@@ -4,7 +4,7 @@ import { useRouter } from "next/navigation";
 import './styles.css';

 export default function Home() {
-  const [contacts, setContacts] = useState<string[]>([]);
+  const [contacts, setContacts] = useState<string[]>([""]);
   const [codeword, setCodeword] = useState("");
   const [session, setSession] = useState<any>(null);
   const [loading, setLoading] = useState(true);
@@ -26,6 +26,16 @@ export default function Home() {
   });
 }, []);

+  const handleInputChange = (index: number, value: string) => {
+    const updatedContacts = [...contacts];
+    updatedContacts[index] = value; // Update the specific input value
+    setContacts(updatedContacts);
+  };
+
+  const addContactInput = () => {
+    setContacts([...contacts, ""]); // Add a new empty input
+  };
+
 function saveToDB() {
   alert("Saving contacts...");
   const contactInputs = document.querySelectorAll(
@@ -144,27 +154,20 @@ export default function Home() {
className="space-y-5 flex flex-col gap-[32px] row-start-2 items-center sm:items-start" className="space-y-5 flex flex-col gap-[32px] row-start-2 items-center sm:items-start"
onSubmit={(e) => e.preventDefault()} onSubmit={(e) => e.preventDefault()}
> >
{contacts.map((contact, index) => (
<input <input
key={index}
type="text" type="text"
value={contacts} value={contact}
onChange={(e) => setContacts(e.target.value.split(","))} onChange={(e) => handleInputChange(index, e.target.value)}
placeholder="Write down an emergency contact" placeholder={`Contact ${index + 1}`}
className="border border-gray-300 rounded-md p-2" className="border border-gray-300 rounded-md p-2"
/> />
))}
<button <button
onClick={() => { onClick={addContactInput}
alert("Adding contact..."); className="bg-emerald-500 text-white
let elem = document.getElementsByClassName( font-semibold font-lg rounded-md p-2"
"text-input"
)[0] as HTMLElement;
console.log("Element:", elem);
let d = elem.cloneNode(true) as HTMLElement;
document.getElementById("Contacts")?.appendChild(d);
}}
className="bg-emerald-500 text-fuchsia-300"
type="button" type="button"
> >
Add Add