<!-- HooHacks-12/Backend/index.html -->
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Sesame AI Voice Chat</title>
  <style>
    body {
      font-family: 'Arial', sans-serif;
      max-width: 800px;
      margin: 0 auto;
      padding: 20px;
    }
    .conversation {
      border: 1px solid #ccc;
      border-radius: 8px;
      padding: 15px;
      height: 300px;
      overflow-y: auto;
      margin-bottom: 15px;
    }
    .message {
      margin-bottom: 10px;
      padding: 8px;
      border-radius: 8px;
    }
    .user {
      background-color: #e3f2fd;
      text-align: right;
    }
    .ai {
      background-color: #f1f1f1;
    }
    .controls {
      display: flex;
      flex-direction: column;
      gap: 10px;
    }
    .input-row {
      display: flex;
      gap: 10px;
    }
    input[type="text"] {
      flex-grow: 1;
      padding: 8px;
      border-radius: 4px;
      border: 1px solid #ccc;
    }
    button {
      padding: 8px 16px;
      border-radius: 4px;
      border: none;
      background-color: #4CAF50;
      color: white;
      cursor: pointer;
    }
    button:hover {
      background-color: #45a049;
    }
    .recording {
      background-color: #f44336;
    }
    select {
      padding: 8px;
      border-radius: 4px;
      border: 1px solid #ccc;
    }
  </style>
</head>
<body>
  <h1>Sesame AI Voice Chat</h1>
  <div class="conversation" id="conversation"></div>
  <div class="controls">
    <div class="input-row">
      <input type="text" id="textInput" placeholder="Type your message...">
      <select id="speakerSelect">
        <option value="0">Speaker 0</option>
        <option value="1">Speaker 1</option>
      </select>
      <button id="sendText">Send</button>
    </div>
    <div class="input-row">
      <button id="recordAudio">Record Audio</button>
      <button id="clearContext">Clear Context</button>
    </div>
  </div>
  <script>
    let ws;
    let mediaRecorder;
    let audioChunks = [];
    let isRecording = false;
    let audioContext;
    let streamProcessor;
    let micStream = null; // live microphone stream, kept so its tracks can be stopped later
    let isStreaming = false;
    let streamButton;
    let isSpeaking = false;
    let silenceTimer = null;
    let energyWindow = [];
    const ENERGY_WINDOW_SIZE = 10;
    const CLIENT_SILENCE_THRESHOLD = 0.01;
    const CLIENT_SILENCE_DURATION_MS = 1000; // 1 second
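    // Client-side voice-activity detection: each audio-processing callback
    // contributes one frame energy to a sliding window of ENERGY_WINDOW_SIZE
    // frames; the input counts as silent when the window's average falls below
    // CLIENT_SILENCE_THRESHOLD, and an utterance is treated as finished after
    // CLIENT_SILENCE_DURATION_MS of continuous silence. The 0.01 threshold is
    // a rough default; quiet microphones may need a lower value.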
    // DOM elements
    const conversationEl = document.getElementById('conversation');
    const textInputEl = document.getElementById('textInput');
    const speakerSelectEl = document.getElementById('speakerSelect');
    const sendTextBtn = document.getElementById('sendText');
    const recordAudioBtn = document.getElementById('recordAudio');
    const clearContextBtn = document.getElementById('clearContext');
    // Single page initializer: add the streaming button to the second input
    // row, then connect to the server and set up audio
    window.addEventListener('load', () => {
      const inputRow = document.querySelector('.input-row:nth-child(2)');
      streamButton = document.createElement('button');
      streamButton.id = 'streamAudio';
      streamButton.textContent = 'Start Streaming';
      streamButton.addEventListener('click', toggleStreaming);
      inputRow.appendChild(streamButton);
      connectWebSocket();
      setupRecording();
      setupAudioContext();
    });
    // Setup audio context for streaming
    function setupAudioContext() {
      try {
        audioContext = new (window.AudioContext || window.webkitAudioContext)();
        console.log('Audio context setup completed');
      } catch (err) {
        console.error('Error setting up audio context:', err);
        addSystemMessage(`Audio context error: ${err.message}`);
      }
    }
    // Toggle audio streaming
    async function toggleStreaming() {
      if (isStreaming) {
        stopStreaming();
      } else {
        startStreaming();
      }
    }
    // Start audio streaming with silence detection
    async function startStreaming() {
      try {
        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
        micStream = stream;
        const speaker = parseInt(speakerSelectEl.value);
        // Browsers may create the AudioContext in a 'suspended' state until a
        // user gesture; resume it here (this handler runs from a button click)
        if (audioContext.state === 'suspended') {
          await audioContext.resume();
        }
        isStreaming = true;
        isSpeaking = false;
        energyWindow = [];
        streamButton.textContent = 'Speaking...';
        streamButton.classList.add('recording');
        // Create audio processor node. ScriptProcessorNode is deprecated in
        // favor of AudioWorklet, but it is still widely supported and keeps
        // this page self-contained.
        const source = audioContext.createMediaStreamSource(stream);
        streamProcessor = audioContext.createScriptProcessor(4096, 1, 1);
        // Process and send audio data
        streamProcessor.onaudioprocess = function(e) {
          const audioData = e.inputBuffer.getChannelData(0);
          // Calculate energy (volume) for silence detection
          const energy = calculateAudioEnergy(audioData);
          updateEnergyWindow(energy);
          // Check if currently silent
          const avgEnergy = calculateAverageEnergy();
          const isSilent = avgEnergy < CLIENT_SILENCE_THRESHOLD;
          // Handle silence/speech transitions for visual feedback
          handleSpeechState(isSilent);
          // Continue processing audio regardless of silence state
          const downsampled = downsampleBuffer(audioData, audioContext.sampleRate, 24000);
          sendAudioChunk(downsampled, speaker);
        };
        // Connect the nodes
        source.connect(streamProcessor);
        streamProcessor.connect(audioContext.destination);
        addSystemMessage('Audio streaming started - speak naturally and pause when finished');
      } catch (err) {
        console.error('Error starting audio stream:', err);
        addSystemMessage(`Streaming error: ${err.message}`);
        isStreaming = false;
        streamButton.textContent = 'Start Streaming';
        streamButton.classList.remove('recording');
      }
    }
    // Calculate audio energy (volume)
    function calculateAudioEnergy(buffer) {
      let sum = 0;
      for (let i = 0; i < buffer.length; i++) {
        sum += Math.abs(buffer[i]);
      }
      return sum / buffer.length;
    }
    // Update the sliding energy window
    function updateEnergyWindow(energy) {
      energyWindow.push(energy);
      if (energyWindow.length > ENERGY_WINDOW_SIZE) {
        energyWindow.shift();
      }
    }
    // Calculate average energy from the window
    function calculateAverageEnergy() {
      if (energyWindow.length === 0) return 0;
      return energyWindow.reduce((sum, val) => sum + val, 0) / energyWindow.length;
    }
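    // Speech/silence state machine driving the button UI:
    //   silence -> speech : mark isSpeaking, show red "Speaking...", cancel
    //                       any pending silence timer;
    //   speech  -> silence: arm a one-shot timer; if silence persists for
    //                       CLIENT_SILENCE_DURATION_MS, show orange
    //                       "Processing..." while the server builds a reply;
    //   speech  -> speech : keep the timer cleared so brief dips in volume
    //                       do not end the utterance.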
    // Handle speech state changes and visual feedback
    function handleSpeechState(isSilent) {
      if (isSpeaking && isSilent) {
        // Transition from speaking to silence
        if (!silenceTimer) {
          silenceTimer = setTimeout(() => {
            // Silence persisted long enough: treat the utterance as finished
            silenceTimer = null;
            isSpeaking = false;
            streamButton.textContent = 'Processing...';
            streamButton.style.backgroundColor = '#FFA500'; // Orange
            addSystemMessage('Detected pause in speech, processing response...');
          }, CLIENT_SILENCE_DURATION_MS);
        }
      } else if (!isSpeaking && !isSilent) {
        // Transition from silence to speaking
        isSpeaking = true;
        streamButton.textContent = 'Speaking...';
        streamButton.style.backgroundColor = '#f44336'; // Red
        // Clear any pending silence timer
        if (silenceTimer) {
          clearTimeout(silenceTimer);
          silenceTimer = null;
        }
      } else if (isSpeaking && !isSilent) {
        // Still speaking, reset any silence timer
        if (silenceTimer) {
          clearTimeout(silenceTimer);
          silenceTimer = null;
        }
      }
      // Update speaking state
      if (!isSilent) {
        isSpeaking = true;
      }
    }
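    // Each processor buffer (4096 samples at the context rate, downsampled to
    // 24 kHz) is wrapped in its own self-contained WAV file and shipped as a
    // base64 data URL. That is simple but adds a 44-byte header plus roughly
    // 33% base64 overhead per chunk; the server side is assumed to strip the
    // headers and concatenate the PCM when reassembling the utterance.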
    // Send audio chunk to server
    function sendAudioChunk(audioData, speaker) {
      const wavData = createWavBlob(audioData, 24000);
      const reader = new FileReader();
      reader.onloadend = function() {
        const base64data = reader.result;
        // Send to server; drop the chunk if the socket is not open
        // (e.g. while reconnecting) instead of throwing
        if (ws && ws.readyState === WebSocket.OPEN) {
          ws.send(JSON.stringify({
            action: 'stream_audio',
            speaker: speaker,
            audio: base64data
          }));
        }
      };
      reader.readAsDataURL(wavData);
    }
    // Stop audio streaming
    function stopStreaming() {
      if (streamProcessor) {
        streamProcessor.disconnect();
        streamProcessor = null;
      }
      // Release the microphone so the browser's recording indicator turns off
      if (micStream) {
        micStream.getTracks().forEach(track => track.stop());
        micStream = null;
      }
      // Clear any pending silence timer
      if (silenceTimer) {
        clearTimeout(silenceTimer);
        silenceTimer = null;
      }
      isStreaming = false;
      isSpeaking = false;
      energyWindow = [];
      streamButton.textContent = 'Start Streaming';
      streamButton.classList.remove('recording');
      streamButton.style.backgroundColor = ''; // Reset to default
      addSystemMessage('Audio streaming stopped');
      // Send stop streaming signal to server
      if (ws && ws.readyState === WebSocket.OPEN) {
        ws.send(JSON.stringify({
          action: 'stop_streaming',
          speaker: parseInt(speakerSelectEl.value)
        }));
      }
    }
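    // Naive decimation by averaging: every output sample is the mean of the
    // input samples that fall in its span. The averaging acts as a crude
    // low-pass filter, which keeps aliasing tolerable for speech; a proper
    // resampler would apply a real anti-aliasing filter first.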
    // Downsample audio buffer to target sample rate
    function downsampleBuffer(buffer, sampleRate, targetSampleRate) {
      if (targetSampleRate === sampleRate) {
        return buffer;
      }
      const sampleRateRatio = sampleRate / targetSampleRate;
      const newLength = Math.round(buffer.length / sampleRateRatio);
      const result = new Float32Array(newLength);
      let offsetResult = 0;
      let offsetBuffer = 0;
      while (offsetResult < result.length) {
        const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
        let accum = 0, count = 0;
        for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
          accum += buffer[i];
          count++;
        }
        // Guard against a zero-sample span at the very end of the buffer
        result[offsetResult] = count > 0 ? accum / count : 0;
        offsetResult++;
        offsetBuffer = nextOffsetBuffer;
      }
      return result;
    }
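    // Minimal 44-byte WAV header (RIFF/WAVE, PCM), as written below:
    //   bytes  0-11  "RIFF" + overall size + "WAVE"
    //   bytes 12-35  "fmt " chunk: PCM (1), mono (1), sample rate,
    //                byte rate (sampleRate * blockAlign), blockAlign (2),
    //                bits per sample (16)
    //   bytes 36-43  "data" + payload size, followed by little-endian
    //                16-bit samples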
    // Create WAV blob from Float32Array
    function createWavBlob(samples, sampleRate) {
      const buffer = new ArrayBuffer(44 + samples.length * 2);
      const view = new DataView(buffer);
      // RIFF chunk descriptor
      writeString(view, 0, 'RIFF');
      view.setUint32(4, 36 + samples.length * 2, true);
      writeString(view, 8, 'WAVE');
      // fmt sub-chunk
      writeString(view, 12, 'fmt ');
      view.setUint32(16, 16, true);
      view.setUint16(20, 1, true); // PCM format
      view.setUint16(22, 1, true); // Mono channel
      view.setUint32(24, sampleRate, true);
      view.setUint32(28, sampleRate * 2, true); // byte rate = sampleRate * blockAlign
      view.setUint16(32, 2, true); // block align (1 channel * 2 bytes)
      view.setUint16(34, 16, true); // bits per sample
      // data sub-chunk
      writeString(view, 36, 'data');
      view.setUint32(40, samples.length * 2, true);
      // Write the PCM samples, clamped to [-1, 1] and scaled to 16-bit range
      for (let i = 0; i < samples.length; i++) {
        const sample = Math.max(-1, Math.min(1, samples[i]));
        view.setInt16(44 + i * 2, sample < 0 ? sample * 0x8000 : sample * 0x7FFF, true);
      }
      return new Blob([buffer], { type: 'audio/wav' });
    }
    function writeString(view, offset, string) {
      for (let i = 0; i < string.length; i++) {
        view.setUint8(offset + i, string.charCodeAt(i));
      }
    }
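    // Wire protocol (inferred from this file; the authoritative contract is
    // the backend's WebSocket handler):
    //   client -> server actions: 'generate' {text, speaker},
    //     'stream_audio' {speaker, audio: base64 WAV data URL},
    //     'stop_streaming' {speaker}, 'add_to_context' {text, speaker, audio},
    //     'clear_context' {}
    //   server -> client types: 'audio_response' {audio}, 'error' {message},
    //     'context_updated' {message}, 'streaming_status' {status}
    // Example outbound frame:
    //   {"action": "generate", "text": "Hello", "speaker": 0}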
    // Connect to WebSocket
    function connectWebSocket() {
      const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
      const wsUrl = `${wsProtocol}//${window.location.hostname}:8000/ws`;
      ws = new WebSocket(wsUrl);
      ws.onopen = () => {
        console.log('WebSocket connected');
        addSystemMessage('Connected to server');
      };
      ws.onmessage = (event) => {
        const response = JSON.parse(event.data);
        console.log('Received:', response);
        if (response.type === 'audio_response') {
          // Play audio response; play() returns a promise that rejects if the
          // browser's autoplay policy blocks it, so log instead of throwing
          const audio = new Audio(response.audio);
          audio.play().catch(err => console.error('Playback failed:', err));
          // Add message to conversation
          addAIMessage(response.audio);
          // Reset the streaming button if we're still in streaming mode
          if (isStreaming) {
            streamButton.textContent = 'Speaking...';
            streamButton.style.backgroundColor = '#f44336'; // Back to red
            isSpeaking = false; // Reset speaking state
          }
        } else if (response.type === 'error') {
          addSystemMessage(`Error: ${response.message}`);
        } else if (response.type === 'context_updated') {
          addSystemMessage(response.message);
        } else if (response.type === 'streaming_status') {
          addSystemMessage(`Streaming ${response.status}`);
        }
      };
      ws.onclose = () => {
        console.log('WebSocket disconnected');
        addSystemMessage('Disconnected from server. Reconnecting...');
        setTimeout(connectWebSocket, 3000);
      };
      ws.onerror = (error) => {
        console.error('WebSocket error:', error);
        addSystemMessage('Connection error');
      };
    }
    // Add message to conversation
    function addUserMessage(text) {
      const messageEl = document.createElement('div');
      messageEl.classList.add('message', 'user');
      messageEl.textContent = text;
      conversationEl.appendChild(messageEl);
      conversationEl.scrollTop = conversationEl.scrollHeight;
    }
    function addAIMessage(audioSrc) {
      const messageEl = document.createElement('div');
      messageEl.classList.add('message', 'ai');
      const audioEl = document.createElement('audio');
      audioEl.controls = true;
      audioEl.src = audioSrc;
      messageEl.appendChild(audioEl);
      conversationEl.appendChild(messageEl);
      conversationEl.scrollTop = conversationEl.scrollHeight;
    }
    function addSystemMessage(text) {
      const messageEl = document.createElement('div');
      messageEl.classList.add('message');
      messageEl.textContent = text;
      conversationEl.appendChild(messageEl);
      conversationEl.scrollTop = conversationEl.scrollHeight;
    }
    // Send text for audio generation
    function sendTextForGeneration() {
      const text = textInputEl.value.trim();
      const speaker = parseInt(speakerSelectEl.value);
      if (!text) return;
      // Avoid throwing if the socket is still connecting or reconnecting
      if (!ws || ws.readyState !== WebSocket.OPEN) {
        addSystemMessage('Not connected yet - please wait');
        return;
      }
      addUserMessage(text);
      textInputEl.value = '';
      const request = {
        action: 'generate',
        text: text,
        speaker: speaker
      };
      ws.send(JSON.stringify(request));
    }
    // Audio recording functions
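    // Note: MediaRecorder emits a compressed container chosen by the browser
    // (typically audio/webm or audio/ogg with Opus), never raw WAV, so the
    // blob below is labeled with mediaRecorder.mimeType. The server is
    // assumed to decode whatever container it receives.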
    async function setupRecording() {
      try {
        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
        mediaRecorder = new MediaRecorder(stream);
        mediaRecorder.ondataavailable = (event) => {
          if (event.data.size > 0) {
            audioChunks.push(event.data);
          }
        };
        mediaRecorder.onstop = async () => {
          // Label the blob with the recorder's actual MIME type so the data
          // URL prefix matches the recorded container (see note above)
          const audioBlob = new Blob(audioChunks, { type: mediaRecorder.mimeType || 'audio/webm' });
          const audioUrl = URL.createObjectURL(audioBlob);
          // Add audio to conversation
          addUserMessage('Recorded audio:');
          const messageEl = document.createElement('div');
          messageEl.classList.add('message', 'user');
          const audioEl = document.createElement('audio');
          audioEl.controls = true;
          audioEl.src = audioUrl;
          messageEl.appendChild(audioEl);
          conversationEl.appendChild(messageEl);
          // Convert to base64
          const reader = new FileReader();
          reader.readAsDataURL(audioBlob);
          reader.onloadend = () => {
            const base64Audio = reader.result;
            const text = textInputEl.value.trim() || "Recorded audio";
            const speaker = parseInt(speakerSelectEl.value);
            // Send to server
            const request = {
              action: 'add_to_context',
              text: text,
              speaker: speaker,
              audio: base64Audio
            };
            ws.send(JSON.stringify(request));
            textInputEl.value = '';
          };
          audioChunks = [];
          recordAudioBtn.textContent = 'Record Audio';
          recordAudioBtn.classList.remove('recording');
        };
        console.log('Recording setup completed');
        return true;
      } catch (err) {
        console.error('Error setting up recording:', err);
        addSystemMessage(`Microphone access error: ${err.message}`);
        return false;
      }
    }
    function toggleRecording() {
      if (isRecording) {
        mediaRecorder.stop();
        isRecording = false;
      } else {
        if (!mediaRecorder) {
          setupRecording().then(success => {
            if (success) startRecording();
          });
        } else {
          startRecording();
        }
      }
    }
    function startRecording() {
      audioChunks = [];
      mediaRecorder.start();
      isRecording = true;
      recordAudioBtn.textContent = 'Stop Recording';
      recordAudioBtn.classList.add('recording');
    }
    // Event listeners
    sendTextBtn.addEventListener('click', sendTextForGeneration);
    // 'keydown' rather than the deprecated 'keypress' event
    textInputEl.addEventListener('keydown', (e) => {
      if (e.key === 'Enter') sendTextForGeneration();
    });
    recordAudioBtn.addEventListener('click', toggleRecording);
    clearContextBtn.addEventListener('click', () => {
      if (ws && ws.readyState === WebSocket.OPEN) {
        ws.send(JSON.stringify({ action: 'clear_context' }));
      }
    });
    // Initialization is handled by the single 'load' listener near the top of
    // this script; a second listener would open a duplicate WebSocket and
    // request the microphone twice.
  </script>
</body>
</html>