idler-wheel
2025-03-30 09:27:10 -04:00
8 changed files with 1356 additions and 632 deletions


@@ -3,454 +3,266 @@
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Voice Assistant - CSM & Whisper</title>
<script src="https://cdn.socket.io/4.6.0/socket.io.min.js"></script>
<title>AI Voice Chat</title>
<style>
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
margin: 0;
padding: 0;
background-color: #f5f5f5;
color: #333;
}
.container {
max-width: 800px;
margin: 0 auto;
padding: 20px;
background-color: #f5f7fa;
color: #333;
}
h1 {
color: #2c3e50;
header {
text-align: center;
margin-bottom: 30px;
}
#conversation {
height: 400px;
border: 1px solid #ddd;
border-radius: 10px;
padding: 20px;
margin-bottom: 20px;
overflow-y: auto;
background-color: white;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
h1 {
color: #2c3e50;
}
.message-container {
display: flex;
flex-direction: column;
margin-bottom: 15px;
}
.user-message-container {
align-items: flex-end;
}
.bot-message-container {
align-items: flex-start;
}
.message {
max-width: 80%;
padding: 12px;
border-radius: 18px;
position: relative;
word-break: break-word;
}
.user-message {
background-color: #dcf8c6;
color: #000;
border-bottom-right-radius: 4px;
}
.bot-message {
background-color: #f1f0f0;
color: #000;
border-bottom-left-radius: 4px;
}
.message-label {
font-size: 0.8em;
margin-bottom: 4px;
color: #657786;
}
#controls {
display: flex;
gap: 10px;
justify-content: center;
margin-bottom: 15px;
}
button {
padding: 12px 24px;
font-size: 16px;
cursor: pointer;
border-radius: 50px;
border: none;
outline: none;
transition: all 0.3s ease;
}
#recordButton {
background-color: #4CAF50;
.status-bar {
background-color: #2c3e50;
color: white;
width: 200px;
box-shadow: 0 4px 8px rgba(76, 175, 80, 0.3);
padding: 10px;
border-radius: 5px;
margin-bottom: 20px;
display: flex;
justify-content: space-between;
align-items: center;
}
#recordButton:hover {
background-color: #45a049;
transform: translateY(-2px);
.status-indicator {
display: flex;
align-items: center;
}
.status-dot {
height: 10px;
width: 10px;
border-radius: 50%;
margin-right: 8px;
}
.status-dot.connected { background-color: #2ecc71; }
.status-dot.connecting { background-color: #f39c12; }
.status-dot.disconnected { background-color: #e74c3c; }
#recordButton.recording {
background-color: #f44336;
.conversation {
background-color: white;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
height: 400px;
padding: 20px;
overflow-y: auto;
margin-bottom: 20px;
}
.message {
margin-bottom: 15px;
padding: 10px 15px;
border-radius: 18px;
max-width: 80%;
word-wrap: break-word;
}
.user-message {
background-color: #e3f2fd;
margin-left: auto;
border-bottom-right-radius: 5px;
}
.ai-message {
background-color: #f0f0f0;
margin-right: auto;
border-bottom-left-radius: 5px;
}
.controls {
display: flex;
justify-content: center;
gap: 15px;
margin-bottom: 20px;
}
button {
background-color: #2c3e50;
color: white;
border: none;
padding: 12px 24px;
border-radius: 25px;
cursor: pointer;
font-size: 16px;
transition: all 0.2s;
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
}
button:hover {
background-color: #1a252f;
}
button:disabled {
background-color: #95a5a6;
cursor: not-allowed;
}
.button-icon {
width: 20px;
height: 20px;
}
.mic-animation {
width: 60px;
height: 60px;
border-radius: 50%;
background-color: rgba(231, 76, 60, 0.2);
display: flex;
align-items: center;
justify-content: center;
animation: pulse 1.5s infinite;
box-shadow: 0 4px 8px rgba(244, 67, 54, 0.3);
margin: 0 auto 15px;
}
@keyframes pulse {
0% {
transform: scale(1);
transform: scale(0.95);
box-shadow: 0 0 0 0 rgba(231, 76, 60, 0.5);
}
50% {
transform: scale(1.05);
70% {
transform: scale(1);
box-shadow: 0 0 0 15px rgba(231, 76, 60, 0);
}
100% {
transform: scale(1);
transform: scale(0.95);
box-shadow: 0 0 0 0 rgba(231, 76, 60, 0);
}
}
#status {
.settings {
margin-top: 20px;
padding: 15px;
background-color: white;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.settings h3 {
margin-top: 0;
color: #2c3e50;
border-bottom: 1px solid #eee;
padding-bottom: 10px;
}
.settings-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 15px;
}
.setting-item {
padding: 10px;
background-color: #f9f9f9;
border-radius: 5px;
}
.audio-visualizer {
height: 50px;
width: 100%;
background-color: #f0f0f0;
margin-top: 10px;
border-radius: 5px;
overflow: hidden;
}
.info-message {
text-align: center;
margin-top: 15px;
color: #7f8c8d;
margin: 10px 0;
font-style: italic;
color: #657786;
}
.audio-wave {
display: flex;
justify-content: center;
align-items: center;
height: 40px;
gap: 3px;
.loading {
text-align: center;
margin: 20px 0;
}
.audio-wave span {
display: block;
width: 3px;
height: 100%;
background-color: #4CAF50;
animation: wave 1.5s infinite ease-in-out;
border-radius: 6px;
}
.audio-wave span:nth-child(2) {
animation-delay: 0.2s;
}
.audio-wave span:nth-child(3) {
animation-delay: 0.4s;
}
.audio-wave span:nth-child(4) {
animation-delay: 0.6s;
}
.audio-wave span:nth-child(5) {
animation-delay: 0.8s;
}
@keyframes wave {
0%, 100% {
height: 8px;
}
50% {
.spinner {
border: 4px solid rgba(0, 0, 0, 0.1);
border-radius: 50%;
border-top: 4px solid #2c3e50;
width: 30px;
height: 30px;
animation: spin 1s linear infinite;
margin: 0 auto 10px;
}
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
.hidden {
display: none;
}
.transcription-info {
font-size: 0.8em;
color: #888;
margin-top: 4px;
text-align: right;
footer {
text-align: center;
margin-top: 30px;
padding: 20px;
color: #7f8c8d;
font-size: 14px;
}
</style>
</head>
<body>
<h1>Voice Assistant with CSM & Whisper</h1>
<div id="conversation"></div>
<div class="container">
<header>
<h1>AI Voice Assistant</h1>
</header>
<div id="controls">
<button id="recordButton">Hold to Speak</button>
<div class="status-bar">
<div class="status-indicator">
<div class="status-dot disconnected" id="connection-dot"></div>
<span id="connection-status">Disconnected</span>
</div>
<div id="runtime-info">
<span id="models-status"></span>
</div>
</div>
<div id="audioWave" class="audio-wave hidden">
<span></span>
<span></span>
<span></span>
<span></span>
<span></span>
<div class="conversation" id="conversation">
<div class="info-message">Your conversation will appear here.</div>
</div>
<div id="status">Connecting to server...</div>
<div id="mic-animation" class="mic-animation" style="display: none;">
<svg width="24" height="24" viewBox="0 0 24 24" fill="white">
<path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm5.91-3c-.49 0-.9.36-.98.85C16.52 14.2 14.47 16 12 16s-4.52-1.8-4.93-4.15c-.08-.49-.49-.85-.98-.85-.61 0-1.09.54-1 1.14.49 3 2.89 5.35 5.91 5.78V20c0 .55.45 1 1 1s1-.45 1-1v-2.08c3.02-.43 5.42-2.78 5.91-5.78.1-.6-.39-1.14-1-1.14z"></path>
</svg>
</div>
<script>
const socket = io();
const recordButton = document.getElementById('recordButton');
const conversation = document.getElementById('conversation');
const status = document.getElementById('status');
const audioWave = document.getElementById('audioWave');
<div class="controls">
<button id="start-button" disabled>
<svg class="button-icon" viewBox="0 0 24 24" fill="white">
<path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm5.91-3c-.49 0-.9.36-.98.85C16.52 14.2 14.47 16 12 16s-4.52-1.8-4.93-4.15c-.08-.49-.49-.85-.98-.85-.61 0-1.09.54-1 1.14.49 3 2.89 5.35 5.91 5.78V20c0 .55.45 1 1 1s1-.45 1-1v-2.08c3.02-.43 5.42-2.78 5.91-5.78.1-.6-.39-1.14-1-1.14z"></path>
</svg>
Start Listening
</button>
<button id="interrupt-button" disabled>
<svg class="button-icon" viewBox="0 0 24 24" fill="white">
<path d="M12 2C6.48 2 2 6.48 2 12s4.48 10 10 10 10-4.48 10-10S17.52 2 12 2zm1 15h-2v-2h2v2zm0-4h-2V7h2v6z"></path>
</svg>
Interrupt
</button>
</div>
let mediaRecorder;
let audioChunks = [];
let isRecording = false;
let audioSendInterval;
let sessionActive = false;
<div id="loading" class="loading" style="display: none;">
<div class="spinner"></div>
<p id="loading-text">Processing your speech...</p>
</div>
// Initialize audio context
const audioContext = new (window.AudioContext || window.webkitAudioContext)();
<div class="settings">
<h3>Status</h3>
<div class="settings-grid">
<div class="setting-item">
<div><strong>Whisper Model:</strong> <span id="whisper-status">Loading...</span></div>
</div>
<div class="setting-item">
<div><strong>CSM Audio Model:</strong> <span id="csm-status">Loading...</span></div>
</div>
<div class="setting-item">
<div><strong>LLM Model:</strong> <span id="llm-status">Loading...</span></div>
</div>
<div class="setting-item">
<div><strong>WebRTC:</strong> <span id="webrtc-status">Not Connected</span></div>
</div>
</div>
</div>
</div>
// Connect to server
socket.on('connect', () => {
status.textContent = 'Connected to server';
sessionActive = true;
});
<footer>
<p>AI Voice Assistant | Using Fast Whisper, Llama 3.2, and CSM Audio Models</p>
</footer>
socket.on('disconnect', () => {
status.textContent = 'Disconnected from server';
sessionActive = false;
});
socket.on('ready', (data) => {
status.textContent = data.message;
setupAudioRecording();
});
socket.on('transcription', (data) => {
addMessage('user', data.text);
status.textContent = 'Assistant is thinking...';
});
socket.on('audio_response', (data) => {
// Play audio
status.textContent = 'Playing response...';
const audio = new Audio('data:audio/wav;base64,' + data.audio);
audio.onended = () => {
status.textContent = 'Ready to record';
};
audio.onerror = () => {
status.textContent = 'Error playing audio';
console.error('Error playing audio response');
};
audio.play().catch(err => {
status.textContent = 'Error playing audio: ' + err.message;
console.error('Error playing audio:', err);
});
// Display text
addMessage('bot', data.text);
});
socket.on('error', (data) => {
status.textContent = 'Error: ' + data.message;
console.error('Server error:', data.message);
});
function setupAudioRecording() {
// Check if browser supports required APIs
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
status.textContent = 'Your browser does not support audio recording';
return;
}
// Get user media
navigator.mediaDevices.getUserMedia({ audio: true })
.then(stream => {
// Setup recording with better audio quality
const options = {
mimeType: 'audio/webm',
audioBitsPerSecond: 128000
};
try {
mediaRecorder = new MediaRecorder(stream, options);
} catch (e) {
// Fallback if the specified options aren't supported
mediaRecorder = new MediaRecorder(stream);
}
mediaRecorder.ondataavailable = event => {
if (event.data.size > 0) {
audioChunks.push(event.data);
}
};
mediaRecorder.onstop = () => {
processRecording();
};
// Create audio analyzer for visualization
const source = audioContext.createMediaStreamSource(stream);
const analyzer = audioContext.createAnalyser();
analyzer.fftSize = 2048;
source.connect(analyzer);
// Setup button handlers with better touch handling
recordButton.addEventListener('mousedown', startRecording);
recordButton.addEventListener('touchstart', (e) => {
e.preventDefault(); // Prevent default touch behavior
startRecording();
});
recordButton.addEventListener('mouseup', stopRecording);
recordButton.addEventListener('touchend', (e) => {
e.preventDefault();
stopRecording();
});
recordButton.addEventListener('mouseleave', stopRecording);
status.textContent = 'Ready to record';
})
.catch(err => {
status.textContent = 'Error accessing microphone: ' + err.message;
console.error('Error accessing microphone:', err);
});
}
function startRecording() {
if (!isRecording && sessionActive) {
audioChunks = [];
mediaRecorder.start(100); // Collect data in 100ms chunks
recordButton.classList.add('recording');
recordButton.textContent = 'Release to Stop';
status.textContent = 'Recording...';
audioWave.classList.remove('hidden');
isRecording = true;
socket.emit('start_speaking');
// Start sending audio chunks periodically
audioSendInterval = setInterval(() => {
if (mediaRecorder.state === 'recording') {
mediaRecorder.requestData(); // Force ondataavailable to fire
}
}, 300); // Send every 300ms
}
}
function stopRecording() {
if (isRecording) {
clearInterval(audioSendInterval);
mediaRecorder.stop();
recordButton.classList.remove('recording');
recordButton.textContent = 'Hold to Speak';
status.textContent = 'Processing speech...';
audioWave.classList.add('hidden');
isRecording = false;
}
}
function processRecording() {
if (audioChunks.length === 0) {
status.textContent = 'No audio recorded';
return;
}
const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
// Convert to ArrayBuffer for processing
const fileReader = new FileReader();
fileReader.onloadend = () => {
try {
const arrayBuffer = fileReader.result;
// Convert to Float32Array - this works better with WebAudio API
const audioData = convertToFloat32(arrayBuffer);
// Convert to base64 for sending
const base64String = arrayBufferToBase64(audioData.buffer);
socket.emit('audio_chunk', { audio: base64String });
// Signal end of speech
socket.emit('stop_speaking');
} catch (e) {
console.error('Error processing audio:', e);
status.textContent = 'Error processing audio';
}
};
fileReader.onerror = () => {
status.textContent = 'Error reading audio data';
};
fileReader.readAsArrayBuffer(audioBlob);
}
function convertToFloat32(arrayBuffer) {
// Get raw audio data as Int16 (common format for audio)
const int16Array = new Int16Array(arrayBuffer);
// Convert to Float32 (normalize between -1 and 1)
const float32Array = new Float32Array(int16Array.length);
for (let i = 0; i < int16Array.length; i++) {
float32Array[i] = int16Array[i] / 32768.0;
}
return float32Array;
}
function addMessage(sender, text) {
const containerDiv = document.createElement('div');
containerDiv.className = sender === 'user' ? 'message-container user-message-container' : 'message-container bot-message-container';
const labelDiv = document.createElement('div');
labelDiv.className = 'message-label';
labelDiv.textContent = sender === 'user' ? 'You' : 'Assistant';
containerDiv.appendChild(labelDiv);
const messageDiv = document.createElement('div');
messageDiv.className = sender === 'user' ? 'message user-message' : 'message bot-message';
messageDiv.textContent = text;
containerDiv.appendChild(messageDiv);
if (sender === 'user') {
const infoDiv = document.createElement('div');
infoDiv.className = 'transcription-info';
infoDiv.textContent = 'Transcribed with Whisper';
containerDiv.appendChild(infoDiv);
}
conversation.appendChild(containerDiv);
conversation.scrollTop = conversation.scrollHeight;
}
function arrayBufferToBase64(buffer) {
let binary = '';
const bytes = new Uint8Array(buffer);
const len = bytes.byteLength;
for (let i = 0; i < len; i++) {
binary += String.fromCharCode(bytes[i]);
}
return window.btoa(binary);
}
// Handle page visibility change to avoid issues with background tabs
document.addEventListener('visibilitychange', () => {
if (document.hidden && isRecording) {
stopRecording();
}
});
// Clean disconnection when page is closed
window.addEventListener('beforeunload', () => {
if (socket && socket.connected) {
socket.disconnect();
}
});
</script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/socket.io/4.6.1/socket.io.min.js"></script>
<script src="./voice-chat.js"></script>
</body>
</html>


@@ -8,15 +8,14 @@ import numpy as np
from flask import Flask, render_template, request
from flask_socketio import SocketIO, emit
from transformers import AutoModelForCausalLM, AutoTokenizer
from collections import deque
import threading
import queue
import requests
import huggingface_hub
from generator import load_csm_1b, Segment
# Force CPU mode regardless of what's available
# This bypasses the CUDA/cuDNN library requirements
os.environ["CUDA_VISIBLE_DEVICES"] = "" # Hide all CUDA devices
torch.backends.cudnn.enabled = False # Disable cuDNN
from collections import deque
import json
import webrtcvad # For voice activity detection
# Configure environment with longer timeouts
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600" # 10 minutes timeout for downloads
@@ -27,28 +26,92 @@ os.makedirs("models", exist_ok=True)
app = Flask(__name__)
app.config['SECRET_KEY'] = 'your-secret-key'
socketio = SocketIO(app, cors_allowed_origins="*")
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')
# Force CPU regardless of what hardware is available
device = "cuda" if torch.cuda.is_available() else "cpu"
whisper_compute_type = "int8"
print(f"Forcing CPU mode for all models")
# Explicitly check for CUDA and print detailed info
print("\n=== CUDA Information ===")
if torch.cuda.is_available():
print(f"CUDA is available")
print(f"CUDA version: {torch.version.cuda}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
print("CUDA is not available")
# Check for cuDNN
try:
import ctypes
ctypes.CDLL("libcudnn_ops_infer.so.8")
print("cuDNN is available")
except:
print("cuDNN is not available (libcudnn_ops_infer.so.8 not found)")
# Determine compute device
try:
if torch.cuda.is_available():
device = "cuda"
whisper_compute_type = "float16"
print("🟢 CUDA is available and initialized successfully")
elif torch.backends.mps.is_available():
device = "mps"
whisper_compute_type = "float32"
print("🟢 MPS is available (Apple Silicon)")
else:
device = "cpu"
whisper_compute_type = "int8"
print("🟡 Using CPU (CUDA/MPS not available)")
except Exception as e:
print(f"🔴 Error initializing CUDA: {e}")
print("🔴 Falling back to CPU")
device = "cpu"
whisper_compute_type = "int8"
print(f"Using device: {device}")
# Initialize models with proper error handling
whisper_model = None
csm_generator = None
llm_model = None
llm_tokenizer = None
vad = None
# Constants
SAMPLE_RATE = 16000 # For VAD
VAD_FRAME_SIZE = 480 # 30ms at 16kHz for VAD
VAD_MODE = 3 # Aggressiveness 0-3; 3 filters out non-speech most aggressively
AUDIO_CHUNK_SIZE = 2400 # 100ms chunks when streaming AI voice
# Audio sample rates
CLIENT_SAMPLE_RATE = 44100 # Browser WebAudio default
WHISPER_SAMPLE_RATE = 16000 # Whisper expects 16kHz
# Session data structures
user_sessions = {} # session_id -> complete session data
# WebRTC ICE servers (STUN/TURN servers for NAT traversal)
ICE_SERVERS = [
{"urls": "stun:stun.l.google.com:19302"},
{"urls": "stun:stun1.l.google.com:19302"}
]
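# Sanity-check sketch for the framing constants above (not part of the original
# commit): webrtcvad only accepts 10/20/30 ms frames of 16-bit mono PCM, and the
# AI voice is streamed in 100 ms chunks at the CSM model's assumed 24 kHz output rate.
assert VAD_FRAME_SIZE == int(SAMPLE_RATE * 0.030)   # 480 samples == 30 ms at 16 kHz
assert AUDIO_CHUNK_SIZE == int(24000 * 0.100)       # 2400 samples == 100 ms at 24 kHz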
def load_models():
global whisper_model, csm_generator, llm_model, llm_tokenizer
"""Load all necessary models"""
global whisper_model, csm_generator, llm_model, llm_tokenizer, vad
# Initialize Voice Activity Detector
try:
vad = webrtcvad.Vad(VAD_MODE)
print("Voice Activity Detector initialized")
except Exception as e:
print(f"Error initializing VAD: {e}")
vad = None
# Initialize Faster-Whisper for transcription
try:
print("Loading Whisper model on CPU...")
# Import here to avoid immediate import errors if package is missing
print("Loading Whisper model...")
from faster_whisper import WhisperModel
whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8", download_root="./models/whisper")
whisper_model = WhisperModel("base", device=device, compute_type=whisper_compute_type, download_root="./models/whisper")
print("Whisper model loaded successfully")
except Exception as e:
print(f"Error loading Whisper model: {e}")
@@ -56,8 +119,8 @@ def load_models():
# Initialize CSM model for audio generation
try:
print("Loading CSM model on CPU...")
csm_generator = load_csm_1b(device="cpu")
print("Loading CSM model...")
csm_generator = load_csm_1b(device=device)
print("CSM model loaded successfully")
except Exception as e:
print(f"Error loading CSM model: {e}")
@@ -65,13 +128,14 @@ def load_models():
# Initialize Llama 3.2 model for response generation
try:
print("Loading Llama 3.2 model on CPU...")
llm_model_id = "meta-llama/Llama-3.2-1B" # Choose appropriate size based on resources
print("Loading Llama 3.2 model...")
llm_model_id = "meta-llama/Llama-3.2-1B"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id, cache_dir="./models/llama")
dtype = torch.bfloat16 if device != "cpu" else torch.float32
llm_model = AutoModelForCausalLM.from_pretrained(
llm_model_id,
torch_dtype=torch.float32, # Use float32 on CPU
device_map="cpu",
torch_dtype=dtype,
device_map=device,
cache_dir="./models/llama",
low_cpu_mem_usage=True
)
@@ -80,168 +144,344 @@ def load_models():
print(f"Error loading Llama 3.2 model: {e}")
print("Will use a fallback response generation method")
# Store conversation context
conversation_context = {} # session_id -> context
@app.route('/')
def index():
"""Serve the main interface"""
return render_template('index.html')
@app.route('/voice-chat.js')
def voice_chat_js():
"""Serve the JavaScript for voice chat"""
return app.send_static_file('voice-chat.js')
@socketio.on('connect')
def handle_connect():
print(f"Client connected: {request.sid}")
conversation_context[request.sid] = {
"""Handle new client connection"""
session_id = request.sid
print(f"Client connected: {session_id}")
# Initialize session data
user_sessions[session_id] = {
# Conversation context
'segments': [],
'speakers': [0, 1], # 0 = user, 1 = bot
'audio_buffer': deque(maxlen=10), # Store recent audio chunks
'is_speaking': False,
'silence_start': None
'conversation_history': [],
'is_turn_active': False,
# Audio buffers and state
'vad_buffer': deque(maxlen=30), # recent audio chunks kept for VAD lookback
'audio_buffer': bytearray(),
'is_user_speaking': False,
'last_vad_active': time.time(),
'silence_duration': 0,
'speech_frames': 0,
# AI state
'is_ai_speaking': False,
'should_interrupt_ai': False,
'ai_stream_queue': queue.Queue(),
# WebRTC status
'webrtc_connected': False,
'webrtc_peer_id': None,
# Processing flags
'is_processing': False,
'pending_user_audio': None
}
emit('ready', {'message': 'Connection established'})
# Send config to client
emit('session_ready', {
'whisper_available': whisper_model is not None,
'csm_available': csm_generator is not None,
'llm_available': llm_model is not None,
'client_sample_rate': CLIENT_SAMPLE_RATE,
'server_sample_rate': getattr(csm_generator, 'sample_rate', 24000) if csm_generator else 24000,
'ice_servers': ICE_SERVERS
})
@socketio.on('disconnect')
def handle_disconnect():
print(f"Client disconnected: {request.sid}")
if request.sid in conversation_context:
del conversation_context[request.sid]
"""Handle client disconnection"""
session_id = request.sid
print(f"Client disconnected: {session_id}")
@socketio.on('start_speaking')
def handle_start_speaking():
if request.sid in conversation_context:
conversation_context[request.sid]['is_speaking'] = True
conversation_context[request.sid]['audio_buffer'].clear()
print(f"User {request.sid} started speaking")
# Clean up resources
if session_id in user_sessions:
# Signal any running threads to stop
user_sessions[session_id]['should_interrupt_ai'] = True
@socketio.on('audio_chunk')
def handle_audio_chunk(data):
if request.sid not in conversation_context:
# Clean up resources
del user_sessions[session_id]
@socketio.on('webrtc_signal')
def handle_webrtc_signal(data):
"""Handle WebRTC signaling for P2P connection establishment"""
session_id = request.sid
if session_id not in user_sessions:
return
context = conversation_context[request.sid]
# Simply relay the signal to the client
# In a multi-user app, we would route this to the correct peer
emit('webrtc_signal', data)
@socketio.on('webrtc_connected')
def handle_webrtc_connected(data):
"""Client notifies that WebRTC connection is established"""
session_id = request.sid
if session_id not in user_sessions:
return
user_sessions[session_id]['webrtc_connected'] = True
print(f"WebRTC connected for session {session_id}")
emit('ready_for_speech', {'message': 'Ready to start conversation'})
@socketio.on('audio_stream')
def handle_audio_stream(data):
"""Process incoming audio stream packets from client"""
session_id = request.sid
if session_id not in user_sessions:
return
session = user_sessions[session_id]
try:
# Decode audio data
audio_data = base64.b64decode(data['audio'])
audio_numpy = np.frombuffer(audio_data, dtype=np.float32)
audio_tensor = torch.tensor(audio_numpy)
# Add to buffer
context['audio_buffer'].append(audio_tensor)
# Check for silence to detect end of speech
if context['is_speaking'] and is_silence(audio_tensor):
if context['silence_start'] is None:
context['silence_start'] = time.time()
elif time.time() - context['silence_start'] > 1.0: # 1 second of silence
# Process the complete utterance
process_user_utterance(request.sid)
else:
context['silence_start'] = None
@socketio.on('stop_speaking')
def handle_stop_speaking():
if request.sid in conversation_context:
conversation_context[request.sid]['is_speaking'] = False
process_user_utterance(request.sid)
print(f"User {request.sid} stopped speaking")
def is_silence(audio_tensor, threshold=0.02):
"""Check if an audio chunk is silence based on amplitude threshold"""
return torch.mean(torch.abs(audio_tensor)) < threshold
def process_user_utterance(session_id):
"""Process completed user utterance, generate response and send audio back"""
context = conversation_context[session_id]
if not context['audio_buffer']:
audio_bytes = base64.b64decode(data.get('audio', ''))
if not audio_bytes or len(audio_bytes) < 2: # Need at least one sample
return
# Combine audio chunks
full_audio = torch.cat(list(context['audio_buffer']), dim=0)
context['audio_buffer'].clear()
context['is_speaking'] = False
context['silence_start'] = None
# Add to current audio buffer
session['audio_buffer'] += audio_bytes
# Save audio to temporary WAV file for transcription
# Check for speech using VAD
has_speech = detect_speech(audio_bytes, session_id)
# Handle speech state machine
if has_speech:
# Reset silence tracking when speech is detected
session['last_vad_active'] = time.time()
session['silence_duration'] = 0
session['speech_frames'] += 1
# If not already marked as speaking and we have enough speech frames
if not session['is_user_speaking'] and session['speech_frames'] >= 5:
on_speech_started(session_id)
else:
# No speech detected in this frame
if session['is_user_speaking']:
# Calculate silence duration
now = time.time()
session['silence_duration'] = now - session['last_vad_active']
# If silent for more than 0.8 seconds (with enough prior speech), end the segment
if session['silence_duration'] > 0.8 and session['speech_frames'] > 8:
on_speech_ended(session_id)
else:
# Not speaking and no speech, just a silent frame
session['speech_frames'] = max(0, session['speech_frames'] - 1)
except Exception as e:
print(f"Error processing audio stream: {e}")
def detect_speech(audio_bytes, session_id):
"""Use VAD to check if audio contains speech"""
if session_id not in user_sessions:
return False
session = user_sessions[session_id]
# Store in VAD buffer for history
session['vad_buffer'].append(audio_bytes)
if vad is None:
# Fallback to simple energy detection
audio_data = np.frombuffer(audio_bytes, dtype=np.int16)
energy = np.mean(np.abs(audio_data)) / 32768.0
return energy > 0.015 # Simple threshold
try:
# Ensure we have the right amount of data for VAD
audio_data = np.frombuffer(audio_bytes, dtype=np.int16)
# If we have too much data, use just the right amount
if len(audio_data) >= VAD_FRAME_SIZE:
frame = audio_data[:VAD_FRAME_SIZE].tobytes()
return vad.is_speech(frame, SAMPLE_RATE)
# If too little data, accumulate in the VAD buffer and check periodically
if len(session['vad_buffer']) >= 3:
# Combine recent chunks to get enough data
combined = bytearray()
for chunk in list(session['vad_buffer'])[-3:]:
combined.extend(chunk)
# Extract the right amount of data
if len(combined) >= VAD_FRAME_SIZE:
frame = combined[:VAD_FRAME_SIZE]
return vad.is_speech(bytes(frame), SAMPLE_RATE)
return False
except Exception as e:
print(f"VAD error: {e}")
return False
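def _vad_smoke_test():
    """Sketch, not part of the original commit: the minimal webrtcvad call that
    detect_speech() wraps, assuming the 16 kHz / 30 ms framing configured above.
    A frame of pure silence is expected to come back as non-speech."""
    silent_frame = b"\x00\x00" * VAD_FRAME_SIZE   # 480 int16 samples == 30 ms
    return vad is not None and vad.is_speech(silent_frame, SAMPLE_RATE)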
def on_speech_started(session_id):
"""Handle start of user speech"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
# Reset audio buffer
session['audio_buffer'] = bytearray()
session['is_user_speaking'] = True
session['is_turn_active'] = True
# If AI is speaking, we need to interrupt it
if session['is_ai_speaking']:
session['should_interrupt_ai'] = True
emit('ai_interrupted_by_user', room=session_id)
# Notify client that we detected speech
emit('user_speech_start', room=session_id)
def on_speech_ended(session_id):
"""Handle end of user speech segment"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
# Mark as not speaking anymore
session['is_user_speaking'] = False
session['speech_frames'] = 0
# If no audio or already processing, skip
if len(session['audio_buffer']) < 4000 or session['is_processing']: # Skip very short clips or re-entrant processing
session['audio_buffer'] = bytearray()
return
# Mark as processing to prevent multiple processes
session['is_processing'] = True
# Create a copy of the audio buffer
audio_copy = session['audio_buffer']
session['audio_buffer'] = bytearray()
# Convert audio to the format needed for processing
try:
# Convert to float32 between -1 and 1
audio_np = np.frombuffer(audio_copy, dtype=np.int16).astype(np.float32) / 32768.0
audio_tensor = torch.from_numpy(audio_np)
# Resample to Whisper's expected sample rate if necessary
if CLIENT_SAMPLE_RATE != WHISPER_SAMPLE_RATE:
audio_tensor = torchaudio.functional.resample(
audio_tensor,
orig_freq=CLIENT_SAMPLE_RATE,
new_freq=WHISPER_SAMPLE_RATE
)
# Save as WAV for transcription
temp_audio_path = f"temp_audio_{session_id}.wav"
torchaudio.save(
temp_audio_path,
full_audio.unsqueeze(0),
44100 # Assuming 44.1kHz from client
audio_tensor.unsqueeze(0),
WHISPER_SAMPLE_RATE
)
try:
# Try using Whisper first if available
if whisper_model is not None:
user_text = transcribe_with_whisper(temp_audio_path)
else:
# Fallback to Google's speech recognition
user_text = transcribe_with_google(temp_audio_path)
# Start transcription and response process in a thread
threading.Thread(
target=process_user_utterance,
args=(session_id, temp_audio_path, audio_tensor),
daemon=True
).start()
if not user_text:
print("No speech detected.")
emit('error', {'message': 'No speech detected. Please try again.'}, room=session_id)
# Notify client that processing has started
emit('processing_speech', room=session_id)
except Exception as e:
print(f"Error preparing audio: {e}")
session['is_processing'] = False
emit('error', {'message': f'Error processing audio: {str(e)}'}, room=session_id)
def process_user_utterance(session_id, audio_path, audio_tensor):
"""Process user utterance, transcribe and generate response"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
try:
# Transcribe audio
if whisper_model is not None:
user_text = transcribe_with_whisper(audio_path)
else:
# Fallback to another transcription service
user_text = transcribe_fallback(audio_path)
# Clean up temp file
if os.path.exists(audio_path):
os.remove(audio_path)
# Check if we got meaningful text
if not user_text or len(user_text.strip()) < 2:
socketio.emit('no_speech_detected', room=session_id)  # plain emit() needs a request context; this runs in a background thread
session['is_processing'] = False
return
print(f"Transcribed: {user_text}")
# Add to conversation segments
# Create user segment
user_segment = Segment(
text=user_text,
speaker=0, # User is speaker 0
audio=full_audio
audio=audio_tensor
)
context['segments'].append(user_segment)
session['segments'].append(user_segment)
# Generate bot response
bot_response = generate_llm_response(user_text, context['segments'])
print(f"Bot response: {bot_response}")
# Update conversation history
session['conversation_history'].append({
'role': 'user',
'text': user_text
})
# Send transcribed text to client
# Send transcription to client
socketio.emit('transcription', {'text': user_text}, room=session_id)
# Generate and send audio response if CSM is available
# Generate AI response
ai_response = generate_ai_response(user_text, session_id)
# Send text response to client
socketio.emit('ai_response_text', {'text': ai_response}, room=session_id)
# Update conversation history
session['conversation_history'].append({
'role': 'assistant',
'text': ai_response
})
# Generate voice response if CSM is available
if csm_generator is not None:
# Convert to audio using CSM
bot_audio = generate_audio_response(bot_response, context['segments'])
session['is_ai_speaking'] = True
session['should_interrupt_ai'] = False
# Convert audio to base64 for sending over websocket
audio_bytes = io.BytesIO()
torchaudio.save(audio_bytes, bot_audio.unsqueeze(0).cpu(), csm_generator.sample_rate, format="wav")
audio_bytes.seek(0)
audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8')
# Add bot response to conversation history
bot_segment = Segment(
text=bot_response,
speaker=1, # Bot is speaker 1
audio=bot_audio
)
context['segments'].append(bot_segment)
# Send audio response to client
emit('audio_response', {
'audio': audio_b64,
'text': bot_response
}, room=session_id)
else:
# Send text-only response if audio generation isn't available
emit('text_response', {'text': bot_response}, room=session_id)
# Add text-only bot response to conversation history
bot_segment = Segment(
text=bot_response,
speaker=1, # Bot is speaker 1
audio=torch.zeros(1) # Placeholder empty audio
)
context['segments'].append(bot_segment)
# Begin streaming audio response
threading.Thread(
target=stream_ai_response,
args=(ai_response, session_id),
daemon=True
).start()
except Exception as e:
print(f"Error processing speech: {e}")
emit('error', {'message': f'Error processing speech: {str(e)}'}, room=session_id)
print(f"Error processing utterance: {e}")
socketio.emit('error', {'message': f'Error: {str(e)}'}, room=session_id)
finally:
# Cleanup temp file
if os.path.exists(temp_audio_path):
os.remove(temp_audio_path)
# Clear processing flag
if session_id in user_sessions:
session['is_processing'] = False
def transcribe_with_whisper(audio_path):
"""Transcribe audio using Faster-Whisper"""
@@ -250,14 +490,13 @@ def transcribe_with_whisper(audio_path):
# Collect all text from segments
user_text = ""
for segment in segments:
segment_text = segment.text.strip()
print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment_text}")
user_text += segment_text + " "
user_text += segment.text.strip() + " "
return user_text.strip()
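def _transcribe_sketch(audio_path):
    """Sketch, not part of the original commit: the bare Faster-Whisper call that
    transcribe_with_whisper() is built around, assuming the global whisper_model
    loaded in load_models(). transcribe() returns a lazy segment generator plus info."""
    segments, _info = whisper_model.transcribe(audio_path)
    return " ".join(segment.text.strip() for segment in segments)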
def transcribe_with_google(audio_path):
def transcribe_fallback(audio_path):
"""Fallback transcription using Google's speech recognition"""
try:
import speech_recognition as sr
recognizer = sr.Recognizer()
@@ -269,28 +508,40 @@ def transcribe_with_google(audio_path):
except sr.UnknownValueError:
return ""
except sr.RequestError:
# If Google API fails, try a basic energy-based VAD approach
# This is a very basic fallback and won't give good results
return "[Speech detected but transcription failed]"
return "[Speech recognition service unavailable]"
except ImportError:
return "[Speech recognition not available]"
def generate_ai_response(user_text, session_id):
"""Generate text response using available LLM"""
if session_id not in user_sessions:
return "I'm sorry, your session has expired."
session = user_sessions[session_id]
def generate_llm_response(user_text, conversation_segments):
"""Generate text response using available model"""
if llm_model is not None and llm_tokenizer is not None:
# Format conversation history for the LLM
conversation_history = ""
for segment in conversation_segments[-5:]: # Use last 5 utterances for context
speaker_name = "User" if segment.speaker == 0 else "Assistant"
conversation_history += f"{speaker_name}: {segment.text}\n"
prompt = "You are a helpful, friendly voice assistant. Keep your responses brief and conversational.\n\n"
# Add the current user query
conversation_history += f"User: {user_text}\nAssistant:"
# Add recent conversation history (last 6 turns maximum)
for entry in session['conversation_history'][-6:]:
if entry['role'] == 'user':
prompt += f"User: {entry['text']}\n"
else:
prompt += f"Assistant: {entry['text']}\n"
# Add current query if not already in history
if not session['conversation_history'] or session['conversation_history'][-1]['role'] != 'user':
prompt += f"User: {user_text}\n"
prompt += "Assistant: "
try:
# Generate response
inputs = llm_tokenizer(conversation_history, return_tensors="pt").to(device)
inputs = llm_tokenizer(prompt, return_tensors="pt").to(device)
output = llm_model.generate(
inputs.input_ids,
max_new_tokens=150,
max_new_tokens=100, # Keep responses shorter for voice
temperature=0.7,
top_p=0.9,
do_sample=True
@@ -298,40 +549,48 @@ def generate_llm_response(user_text, conversation_segments):
response = llm_tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
return response.strip()
except Exception as e:
print(f"Error generating response with LLM: {e}")
print(f"Error generating LLM response: {e}")
return fallback_response(user_text)
else:
return fallback_response(user_text)
def fallback_response(user_text):
"""Generate a simple fallback response when LLM is not available"""
# Simple rule-based responses
"""Generate simple fallback responses when LLM is unavailable"""
user_text_lower = user_text.lower()
if "hello" in user_text_lower or "hi" in user_text_lower:
return "Hello! I'm a simple fallback assistant. The main language model couldn't be loaded, so I have limited capabilities."
return "Hello! How can I help you today?"
elif "how are you" in user_text_lower:
return "I'm functioning within my limited capabilities. How can I assist you today?"
return "I'm doing well, thanks for asking! How about you?"
elif "thank" in user_text_lower:
return "You're welcome! Let me know if there's anything else I can help with."
return "You're welcome! Happy to help."
elif "bye" in user_text_lower or "goodbye" in user_text_lower:
return "Goodbye! Have a great day!"
elif any(q in user_text_lower for q in ["what", "who", "where", "when", "why", "how"]):
return "I'm running in fallback mode and can't answer complex questions. Please try again when the main language model is available."
return "That's an interesting question. I wish I could provide a better answer in my current fallback mode."
else:
return "I understand you said something about that. Unfortunately, I'm running in fallback mode with limited capabilities. Please try again later when the main model is available."
return "I see. Tell me more about that."
def stream_ai_response(text, session_id):
"""Generate and stream audio response in real-time chunks"""
if session_id not in user_sessions:
return
session = user_sessions[session_id]
def generate_audio_response(text, conversation_segments):
"""Generate audio response using CSM"""
try:
# Use the last few conversation segments as context
context_segments = conversation_segments[-4:] if len(conversation_segments) > 4 else conversation_segments
# Signal start of AI speech
socketio.emit('ai_speech_start', room=session_id)  # plain emit() needs a request context; this runs in a background thread
# Use the last few conversation segments as context (up to 4)
context_segments = session['segments'][-4:] if len(session['segments']) > 4 else session['segments']
# Generate audio for bot response
audio = csm_generator.generate(
@@ -343,11 +602,77 @@ def generate_audio_response(text, conversation_segments):
topk=50
)
return audio
# Create and store bot segment
bot_segment = Segment(
text=text,
speaker=1,
audio=audio
)
if session_id in user_sessions:
session['segments'].append(bot_segment)
# Stream audio in small chunks for more responsive playback
chunk_size = AUDIO_CHUNK_SIZE # Size defined in constants
for i in range(0, len(audio), chunk_size):
# Check if we should stop (user interrupted)
if session_id not in user_sessions or session['should_interrupt_ai']:
print("AI speech interrupted")
break
# Get next chunk
chunk = audio[i:i+chunk_size]
# Convert audio chunk to base64 for streaming
audio_bytes = io.BytesIO()
torchaudio.save(audio_bytes, chunk.unsqueeze(0).cpu(), csm_generator.sample_rate, format="wav")
audio_bytes.seek(0)
audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8')
# Send chunk to client
socketio.emit('ai_speech_chunk', {
'audio': audio_b64,
'is_last': i + chunk_size >= len(audio)
}, room=session_id)
# Small sleep for more natural pacing
time.sleep(0.06) # Slight delay for smoother playback
# Signal end of AI speech
if session_id in user_sessions:
session['is_ai_speaking'] = False
session['is_turn_active'] = False # End conversation turn
socketio.emit('ai_speech_end', room=session_id)
except Exception as e:
print(f"Error generating audio: {e}")
# Return silence as fallback
return torch.zeros(csm_generator.sample_rate * 3) # 3 seconds of silence
print(f"Error streaming AI response: {e}")
if session_id in user_sessions:
session['is_ai_speaking'] = False
session['is_turn_active'] = False
socketio.emit('error', {'message': f'Error generating audio: {str(e)}'}, room=session_id)
socketio.emit('ai_speech_end', room=session_id)
@socketio.on('interrupt_ai')
def handle_interrupt():
"""Handle explicit AI interruption request from client"""
session_id = request.sid
if session_id in user_sessions:
user_sessions[session_id]['should_interrupt_ai'] = True
emit('ai_interrupted', room=session_id)
@socketio.on('get_config')
def handle_get_config():
"""Send configuration to client"""
session_id = request.sid
if session_id in user_sessions:
emit('config', {
'client_sample_rate': CLIENT_SAMPLE_RATE,
'server_sample_rate': getattr(csm_generator, 'sample_rate', 24000) if csm_generator else 24000,
'whisper_available': whisper_model is not None,
'csm_available': csm_generator is not None,
'ice_servers': ICE_SERVERS
})
if __name__ == '__main__':
# Ensure the existing index.html file is in the correct location
@@ -357,9 +682,8 @@ if __name__ == '__main__':
if os.path.exists('index.html') and not os.path.exists('templates/index.html'):
os.rename('index.html', 'templates/index.html')
# Load models asynchronously before starting the server
print("Starting CPU-only model loading...")
# In a production environment, you could load models in a separate thread
# Load models before starting the server
print("Starting model loading...")
load_models()
# Start the server

Backend/voice-chat.js Normal file

@@ -0,0 +1,560 @@
document.addEventListener('DOMContentLoaded', () => {
// DOM Elements
const startButton = document.getElementById('start-button');
const interruptButton = document.getElementById('interrupt-button');
const conversationDiv = document.getElementById('conversation');
const connectionDot = document.getElementById('connection-dot');
const connectionStatus = document.getElementById('connection-status');
const whisperStatus = document.getElementById('whisper-status');
const csmStatus = document.getElementById('csm-status');
const llmStatus = document.getElementById('llm-status');
const webrtcStatus = document.getElementById('webrtc-status');
const micAnimation = document.getElementById('mic-animation');
const loadingDiv = document.getElementById('loading');
const loadingText = document.getElementById('loading-text');
// State variables
let socket;
let isConnected = false;
let isListening = false;
let isAiSpeaking = false;
let audioContext;
let mediaStream;
let audioRecorder;
let audioProcessor;
const audioChunks = [];
// WebRTC variables
let peerConnection;
let dataChannel;
let hasActiveConnection = false;
// Audio playback
let audioQueue = [];
let isPlaying = false;
// Configuration variables
let serverSampleRate = 24000;
let clientSampleRate = 44100;
let iceServers = [];
// Initialize the application
initApp();
// Main initialization function
function initApp() {
updateConnectionStatus('connecting');
setupSocketConnection();
setupEventListeners();
}
// Set up Socket.IO connection with server
function setupSocketConnection() {
socket = io();
socket.on('connect', () => {
console.log('Connected to server');
updateConnectionStatus('connected');
isConnected = true;
});
socket.on('disconnect', () => {
console.log('Disconnected from server');
updateConnectionStatus('disconnected');
isConnected = false;
cleanupAudio();
cleanupWebRTC();
});
socket.on('session_ready', (data) => {
console.log('Session ready:', data);
updateModelStatus(data);
clientSampleRate = data.client_sample_rate;
serverSampleRate = data.server_sample_rate;
iceServers = data.ice_servers;
// Initialize WebRTC if models are available
if (data.whisper_available && data.llm_available) {
initializeWebRTC();
}
});
socket.on('ready_for_speech', (data) => {
console.log('Ready for speech:', data);
startButton.disabled = false;
addInfoMessage('Ready for conversation. Click "Start Listening" to begin.');
});
socket.on('webrtc_signal', (data) => {
handleWebRTCSignal(data);
});
socket.on('transcription', (data) => {
console.log('Transcription:', data);
addUserMessage(data.text);
loadingDiv.style.display = 'none';
});
socket.on('ai_response_text', (data) => {
console.log('AI response text:', data);
addAIMessage(data.text);
loadingDiv.style.display = 'none';
});
socket.on('ai_speech_start', () => {
console.log('AI started speaking');
isAiSpeaking = true;
interruptButton.disabled = false;
});
socket.on('ai_speech_chunk', (data) => {
console.log('Received AI speech chunk');
playAudioChunk(data.audio, data.is_last);
});
socket.on('ai_speech_end', () => {
console.log('AI stopped speaking');
isAiSpeaking = false;
interruptButton.disabled = true;
});
socket.on('user_speech_start', () => {
console.log('User speech detected');
showSpeakingIndicator(true);
});
socket.on('processing_speech', () => {
console.log('Processing speech');
showSpeakingIndicator(false);
showLoadingIndicator('Processing your speech...');
});
socket.on('no_speech_detected', () => {
console.log('No speech detected');
hideLoadingIndicator();
addInfoMessage('No speech detected. Please try again.');
});
socket.on('ai_interrupted', () => {
console.log('AI interrupted');
clearAudioQueue();
isAiSpeaking = false;
interruptButton.disabled = true;
});
socket.on('ai_interrupted_by_user', () => {
console.log('AI interrupted by user');
clearAudioQueue();
isAiSpeaking = false;
interruptButton.disabled = true;
addInfoMessage('AI interrupted by your speech');
});
socket.on('error', (data) => {
console.error('Server error:', data);
hideLoadingIndicator();
addInfoMessage(`Error: ${data.message}`);
});
}
// Set up UI event listeners
function setupEventListeners() {
startButton.addEventListener('click', toggleListening);
interruptButton.addEventListener('click', interruptAI);
}
// Update UI connection status
function updateConnectionStatus(status) {
connectionDot.className = 'status-dot ' + status;
switch (status) {
case 'connected':
connectionStatus.textContent = 'Connected';
break;
case 'connecting':
connectionStatus.textContent = 'Connecting...';
break;
case 'disconnected':
connectionStatus.textContent = 'Disconnected';
startButton.disabled = true;
interruptButton.disabled = true;
break;
}
}
// Update model status indicators
function updateModelStatus(data) {
whisperStatus.textContent = data.whisper_available ? 'Available' : 'Not Available';
whisperStatus.style.color = data.whisper_available ? 'green' : 'red';
csmStatus.textContent = data.csm_available ? 'Available' : 'Not Available';
csmStatus.style.color = data.csm_available ? 'green' : 'red';
llmStatus.textContent = data.llm_available ? 'Available' : 'Not Available';
llmStatus.style.color = data.llm_available ? 'green' : 'red';
}
// Initialize WebRTC connection
function initializeWebRTC() {
if (!isConnected) return;
const configuration = {
iceServers: iceServers
};
peerConnection = new RTCPeerConnection(configuration);
// Create data channel for WebRTC communication
dataChannel = peerConnection.createDataChannel('audioData', {
ordered: true
});
dataChannel.onopen = () => {
console.log('WebRTC data channel open');
hasActiveConnection = true;
webrtcStatus.textContent = 'Connected';
webrtcStatus.style.color = 'green';
socket.emit('webrtc_connected', { status: 'connected' });
};
dataChannel.onclose = () => {
console.log('WebRTC data channel closed');
hasActiveConnection = false;
webrtcStatus.textContent = 'Disconnected';
webrtcStatus.style.color = 'red';
};
// Handle ICE candidates
peerConnection.onicecandidate = (event) => {
if (event.candidate) {
socket.emit('webrtc_signal', {
type: 'ice_candidate',
candidate: event.candidate
});
}
};
// Log ICE connection state changes
peerConnection.oniceconnectionstatechange = () => {
console.log('ICE connection state:', peerConnection.iceConnectionState);
};
// Create offer
peerConnection.createOffer()
.then(offer => peerConnection.setLocalDescription(offer))
.then(() => {
socket.emit('webrtc_signal', {
type: 'offer',
sdp: peerConnection.localDescription
});
})
.catch(error => {
console.error('Error creating WebRTC offer:', error);
webrtcStatus.textContent = 'Failed to Connect';
webrtcStatus.style.color = 'red';
});
}
// Handle WebRTC signals from the server
function handleWebRTCSignal(data) {
if (!peerConnection) return;
if (data.type === 'answer') {
peerConnection.setRemoteDescription(new RTCSessionDescription(data.sdp))
.catch(error => console.error('Error setting remote description:', error));
}
else if (data.type === 'ice_candidate') {
peerConnection.addIceCandidate(new RTCIceCandidate(data.candidate))
.catch(error => console.error('Error adding ICE candidate:', error));
}
}
// Clean up WebRTC connection
function cleanupWebRTC() {
if (dataChannel) {
dataChannel.close();
}
if (peerConnection) {
peerConnection.close();
}
dataChannel = null;
peerConnection = null;
hasActiveConnection = false;
webrtcStatus.textContent = 'Not Connected';
webrtcStatus.style.color = 'red';
}
// Toggle audio listening
function toggleListening() {
if (isListening) {
stopListening();
} else {
startListening();
}
}
// Start listening for audio
async function startListening() {
if (!isConnected) return;
try {
await initAudio();
isListening = true;
startButton.innerHTML = `
<svg class="button-icon" viewBox="0 0 24 24" fill="white">
<path d="M6 6h12v12H6z"></path>
</svg>
Stop Listening
`;
} catch (error) {
console.error('Error starting audio:', error);
addInfoMessage('Error accessing microphone. Please check permissions.');
}
}
// Stop listening for audio
function stopListening() {
cleanupAudio();
isListening = false;
startButton.innerHTML = `
<svg class="button-icon" viewBox="0 0 24 24" fill="white">
<path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm5.91-3c-.49 0-.9.36-.98.85C16.52 14.2 14.47 16 12 16s-4.52-1.8-4.93-4.15c-.08-.49-.49-.85-.98-.85-.61 0-1.09.54-1 1.14.49 3 2.89 5.35 5.91 5.78V20c0 .55.45 1 1 1s1-.45 1-1v-2.08c3.02-.43 5.42-2.78 5.91-5.78.1-.6-.39-1.14-1-1.14z"></path>
</svg>
Start Listening
`;
showSpeakingIndicator(false);
}
// Initialize audio capture
async function initAudio() {
// Request microphone access
mediaStream = await navigator.mediaDevices.getUserMedia({
audio: {
sampleRate: clientSampleRate,
channelCount: 1,
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true
}
});
// Initialize AudioContext
audioContext = new (window.AudioContext || window.webkitAudioContext)({
sampleRate: clientSampleRate
});
// Create audio source from stream
const source = audioContext.createMediaStreamSource(mediaStream);
// Create ScriptProcessor for audio processing
const bufferSize = 4096;
audioProcessor = audioContext.createScriptProcessor(bufferSize, 1, 1);
// Process audio data
audioProcessor.onaudioprocess = (event) => {
if (!isListening || isAiSpeaking) return;
const input = event.inputBuffer.getChannelData(0);
const audioData = convertFloat32ToInt16(input);
sendAudioChunk(audioData);
};
// Connect the nodes
source.connect(audioProcessor);
audioProcessor.connect(audioContext.destination);
}
// Clean up audio resources
function cleanupAudio() {
if (audioProcessor) {
audioProcessor.disconnect();
audioProcessor = null;
}
if (mediaStream) {
mediaStream.getTracks().forEach(track => track.stop());
mediaStream = null;
}
if (audioContext && audioContext.state !== 'closed') {
audioContext.close().catch(error => console.error('Error closing AudioContext:', error));
}
audioChunks.length = 0;
}
// Convert Float32Array to Int16Array for sending to server
function convertFloat32ToInt16(float32Array) {
const int16Array = new Int16Array(float32Array.length);
for (let i = 0; i < float32Array.length; i++) {
// Convert float [-1.0, 1.0] to int16 [-32768, 32767]
int16Array[i] = Math.max(-32768, Math.min(32767, Math.floor(float32Array[i] * 32768)));
}
return int16Array;
}
// Send audio chunk to server
function sendAudioChunk(audioData) {
if (!isConnected || !isListening) return;
// Convert to base64 for transmission
const base64Audio = arrayBufferToBase64(audioData.buffer);
// Send via Socket.IO (could use WebRTC's DataChannel for lower latency in production)
socket.emit('audio_stream', { audio: base64Audio });
}
// Play audio chunk received from server
function playAudioChunk(base64Audio, isLast) {
const audioData = base64ToArrayBuffer(base64Audio);
// Add to queue
audioQueue.push({
data: audioData,
isLast: isLast
});
// Start playing if not already playing
if (!isPlaying) {
playNextAudioChunk();
}
}
// Play the next audio chunk in the queue
function playNextAudioChunk() {
if (audioQueue.length === 0) {
isPlaying = false;
return;
}
isPlaying = true;
const chunk = audioQueue.shift();
try {
// Create audio context if needed
if (!audioContext || audioContext.state === 'closed') {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
}
// Resume audio context if suspended
if (audioContext.state === 'suspended') {
audioContext.resume();
}
// Decode the WAV data
audioContext.decodeAudioData(chunk.data, (buffer) => {
const source = audioContext.createBufferSource();
source.buffer = buffer;
source.connect(audioContext.destination);
// When playback ends, play the next chunk
source.onended = () => {
playNextAudioChunk();
};
source.start(0);
// If it's the last chunk, update UI
if (chunk.isLast) {
setTimeout(() => {
isAiSpeaking = false;
interruptButton.disabled = true;
}, buffer.duration * 1000);
}
}, (error) => {
console.error('Error decoding audio data:', error);
playNextAudioChunk(); // Skip this chunk and try the next
});
} catch (error) {
console.error('Error playing audio chunk:', error);
playNextAudioChunk(); // Try the next chunk
}
}
// Clear the audio queue (used when interrupting)
function clearAudioQueue() {
audioQueue.length = 0;
isPlaying = false;
// Stop any currently playing audio
if (audioContext) {
audioContext.suspend();
}
}
// Send interrupt signal to server
function interruptAI() {
if (!isConnected || !isAiSpeaking) return;
socket.emit('interrupt_ai');
clearAudioQueue();
}
// Convert ArrayBuffer to Base64 string
function arrayBufferToBase64(buffer) {
const binary = new Uint8Array(buffer);
let base64 = '';
const len = binary.byteLength;
for (let i = 0; i < len; i++) {
base64 += String.fromCharCode(binary[i]);
}
return window.btoa(base64);
}
// Convert Base64 string to ArrayBuffer
function base64ToArrayBuffer(base64) {
const binaryString = window.atob(base64);
const len = binaryString.length;
const bytes = new Uint8Array(len);
for (let i = 0; i < len; i++) {
bytes[i] = binaryString.charCodeAt(i);
}
return bytes.buffer;
}
// Add user message to conversation
function addUserMessage(text) {
const messageDiv = document.createElement('div');
messageDiv.className = 'message user-message';
messageDiv.textContent = text;
conversationDiv.appendChild(messageDiv);
conversationDiv.scrollTop = conversationDiv.scrollHeight;
}
// Add AI message to conversation
function addAIMessage(text) {
const messageDiv = document.createElement('div');
messageDiv.className = 'message ai-message';
messageDiv.textContent = text;
conversationDiv.appendChild(messageDiv);
conversationDiv.scrollTop = conversationDiv.scrollHeight;
}
// Add info message to conversation
function addInfoMessage(text) {
const messageDiv = document.createElement('div');
messageDiv.className = 'info-message';
messageDiv.textContent = text;
conversationDiv.appendChild(messageDiv);
conversationDiv.scrollTop = conversationDiv.scrollHeight;
}
// Show/hide speaking indicator
function showSpeakingIndicator(show) {
micAnimation.style.display = show ? 'flex' : 'none';
}
// Show loading indicator
function showLoadingIndicator(text) {
loadingText.textContent = text || 'Processing...';
loadingDiv.style.display = 'block';
}
// Hide loading indicator
function hideLoadingIndicator() {
loadingDiv.style.display = 'none';
}
});

(Two new binary image files were added in this commit, 17 KiB and 87 KiB; their contents are not shown.)

@@ -13,8 +13,8 @@ const geistMono = Geist_Mono({
});
export const metadata: Metadata = {
title: "Create Next App",
description: "Generated by create next app",
title: "Fauxcall",
description: "Fauxcall is a fake call app that helps you get out of awkward situations.",
};
export default function RootLayout({

React/src/app/manifest.ts Normal file

@@ -0,0 +1,25 @@
import type { MetadataRoute } from 'next'
export default function manifest(): MetadataRoute.Manifest {
return {
name: 'Fauxcall',
short_name: 'Fauxcall',
description: 'A fake call app that helps you get out of awkward and dangerous situations.',
start_url: '/',
display: 'standalone',
background_color: '#ffffff',
theme_color: '#000000',
icons: [
{
src: '/icon-192x192.png',
sizes: '192x192',
type: 'image/png',
},
{
src: '/icon-512x512.png',
sizes: '512x512',
type: 'image/png',
},
],
}
}


@@ -4,7 +4,7 @@ import { useRouter } from "next/navigation";
import './styles.css';
export default function Home() {
const [contacts, setContacts] = useState<string[]>([]);
const [contacts, setContacts] = useState<string[]>([""]);
const [codeword, setCodeword] = useState("");
const [session, setSession] = useState<any>(null);
const [loading, setLoading] = useState(true);
@@ -26,6 +26,16 @@ export default function Home() {
});
}, []);
const handleInputChange = (index: number, value: string) => {
const updatedContacts = [...contacts];
updatedContacts[index] = value; // Update the specific input value
setContacts(updatedContacts);
};
const addContactInput = () => {
setContacts([...contacts, ""]); // Add a new empty input
};
function saveToDB() {
alert("Saving contacts...");
const contactInputs = document.querySelectorAll(
@@ -144,27 +154,20 @@ export default function Home() {
className="space-y-5 flex flex-col gap-[32px] row-start-2 items-center sm:items-start"
onSubmit={(e) => e.preventDefault()}
>
{contacts.map((contact, index) => (
<input
key={index}
type="text"
value={contacts}
onChange={(e) => setContacts(e.target.value.split(","))}
placeholder="Write down an emergency contact"
value={contact}
onChange={(e) => handleInputChange(index, e.target.value)}
placeholder={`Contact ${index + 1}`}
className="border border-gray-300 rounded-md p-2"
/>
))}
<button
onClick={() => {
alert("Adding contact...");
let elem = document.getElementsByClassName(
"text-input"
)[0] as HTMLElement;
console.log("Element:", elem);
let d = elem.cloneNode(true) as HTMLElement;
document.getElementById("Contacts")?.appendChild(d);
}}
className="bg-emerald-500 text-fuchsia-300"
onClick={addContactInput}
className="bg-emerald-500 text-white
font-semibold text-lg rounded-md p-2"
type="button"
>
Add