<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sesame AI Voice Chat</title>
    <style>
        body {
            font-family: 'Arial', sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f9f9f9;
        }
        .conversation {
            border: 1px solid #ddd;
            border-radius: 12px;
            padding: 20px;
            height: 400px;
            overflow-y: auto;
            margin-bottom: 20px;
            background-color: white;
            box-shadow: 0 2px 10px rgba(0,0,0,0.05);
        }
        .message {
            margin-bottom: 15px;
            padding: 12px;
            border-radius: 12px;
            max-width: 80%;
            line-height: 1.4;
        }
        .user {
            background-color: #e3f2fd;
            text-align: right;
            margin-left: auto;
            border-bottom-right-radius: 4px;
        }
        .ai {
            background-color: #f1f1f1;
            margin-right: auto;
            border-bottom-left-radius: 4px;
        }
        .system {
            background-color: #f8f9fa;
            font-style: italic;
            text-align: center;
            font-size: 0.9em;
            color: #666;
            padding: 8px;
            margin: 10px auto;
            max-width: 90%;
        }
        .controls {
            display: flex;
            gap: 15px;
            justify-content: center;
            align-items: center;
        }
        button {
            padding: 12px 24px;
            border-radius: 24px;
            border: none;
            background-color: #4CAF50;
            color: white;
            cursor: pointer;
            font-weight: bold;
            transition: all 0.2s ease;
            box-shadow: 0 2px 5px rgba(0,0,0,0.1);
        }
        button:hover {
            background-color: #45a049;
            box-shadow: 0 4px 8px rgba(0,0,0,0.15);
        }
        .recording {
            background-color: #f44336;
            animation: pulse 1.5s infinite;
        }
        .processing {
            background-color: #FFA500;
        }
        select {
            padding: 10px;
            border-radius: 24px;
            border: 1px solid #ddd;
            background-color: white;
        }
        .transcript {
            font-style: italic;
            color: #666;
            margin-top: 5px;
        }
        @keyframes pulse {
            0% { opacity: 1; }
            50% { opacity: 0.7; }
            100% { opacity: 1; }
        }
        .status-indicator {
            display: flex;
            align-items: center;
            justify-content: center;
            margin-top: 10px;
            gap: 5px;
        }
        .status-dot {
            width: 10px;
            height: 10px;
            border-radius: 50%;
            background-color: #ccc;
        }
        .status-dot.active {
            background-color: #4CAF50;
        }
        .status-text {
            font-size: 0.9em;
            color: #666;
        }
        audio {
            width: 100%;
            margin-top: 5px;
        }
    </style>
</head>
<body>
    <h1>Sesame AI Voice Chat</h1>
    <div class="conversation" id="conversation"></div>

    <div class="controls">
        <select id="speakerSelect">
            <option value="0">Speaker 0</option>
            <option value="1">Speaker 1</option>
        </select>
        <button id="streamButton">Start Conversation</button>
        <button id="clearButton">Clear Chat</button>
    </div>

    <div class="status-indicator">
        <div class="status-dot" id="statusDot"></div>
        <div class="status-text" id="statusText">Not connected</div>
    </div>

    <script>
        // Variables
        let ws;
        let audioContext;
        let streamProcessor;
        let isStreaming = false;
        let isSpeaking = false;
        let silenceTimer = null;
        let energyWindow = [];
        const ENERGY_WINDOW_SIZE = 10;
        const CLIENT_SILENCE_THRESHOLD = 0.01;
        const CLIENT_SILENCE_DURATION_MS = 1000; // 1 second

        // DOM elements
        const conversationEl = document.getElementById('conversation');
        const speakerSelectEl = document.getElementById('speakerSelect');
        const streamButton = document.getElementById('streamButton');
        const clearButton = document.getElementById('clearButton');
        const statusDot = document.getElementById('statusDot');
        const statusText = document.getElementById('statusText');

        // Initialize on page load
        window.addEventListener('load', () => {
            connectWebSocket();
            setupAudioContext();

            // Event listeners
            streamButton.addEventListener('click', toggleStreaming);
            clearButton.addEventListener('click', clearConversation);
        });

        // Setup audio context for streaming
        function setupAudioContext() {
            try {
                audioContext = new (window.AudioContext || window.webkitAudioContext)();
                console.log('Audio context setup completed');
            } catch (err) {
                console.error('Error setting up audio context:', err);
                addSystemMessage(`Audio context error: ${err.message}`);
            }
        }

        // Connect to WebSocket server
        function connectWebSocket() {
            const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
            const wsUrl = `${wsProtocol}//${window.location.hostname}:8000/ws`;

            ws = new WebSocket(wsUrl);

            ws.onopen = () => {
                console.log('WebSocket connected');
                statusDot.classList.add('active');
                statusText.textContent = 'Connected';
                addSystemMessage('Connected to server');
            };

            ws.onmessage = (event) => {
                const response = JSON.parse(event.data);
                console.log('Received:', response);

                if (response.type === 'audio_response') {
                    // Play audio response
                    const audio = new Audio(response.audio);
                    audio.play();

                    // Add message to conversation
                    addAIMessage(response.text || 'AI response', response.audio);

                    // Reset to speaking state after AI response
                    if (isStreaming) {
                        streamButton.textContent = 'Listening...';
                        streamButton.style.backgroundColor = '#f44336'; // Back to red
                        streamButton.classList.add('recording');
                        isSpeaking = false; // Reset speaking state
                    }
                } else if (response.type === 'error') {
                    addSystemMessage(`Error: ${response.message}`);
                } else if (response.type === 'context_updated') {
                    addSystemMessage(response.message);
                } else if (response.type === 'streaming_status') {
                    addSystemMessage(`Streaming ${response.status}`);
                } else if (response.type === 'transcription') {
                    addUserTranscription(response.text);
                }
            };

            ws.onclose = () => {
                console.log('WebSocket disconnected');
                statusDot.classList.remove('active');
                statusText.textContent = 'Disconnected';
                addSystemMessage('Disconnected from server. Reconnecting...');
                setTimeout(connectWebSocket, 3000);
            };

            ws.onerror = (error) => {
                console.error('WebSocket error:', error);
                statusDot.classList.remove('active');
                statusText.textContent = 'Error';
                addSystemMessage('Connection error');
            };
        }
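
        // Message protocol summary (as implemented in this client, not a server spec):
        // the client sends JSON with an `action` of 'stream_audio', 'stop_streaming',
        // or 'clear_context', and expects JSON replies whose `type` is one of
        // 'audio_response' (with `audio` as a playable URL/data URL and optional `text`),
        // 'transcription', 'context_updated', 'streaming_status', or 'error'.
        // The backend is assumed to listen on port 8000 at the /ws path.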

        // Toggle streaming
        function toggleStreaming() {
            if (isStreaming) {
                stopStreaming();
            } else {
                startStreaming();
            }
        }

        // Start streaming
        async function startStreaming() {
            try {
                const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
                const speaker = parseInt(speakerSelectEl.value);

                // Browsers may create the AudioContext in a suspended state until a user
                // gesture; this runs from a click handler, so resume it here if needed.
                if (audioContext.state === 'suspended') {
                    await audioContext.resume();
                }

                isStreaming = true;
                isSpeaking = false;
                energyWindow = [];

                streamButton.textContent = 'Listening...';
                streamButton.classList.add('recording');

                // Create audio processor node
                const source = audioContext.createMediaStreamSource(stream);
                streamProcessor = audioContext.createScriptProcessor(4096, 1, 1);

                // Process and send audio data
                streamProcessor.onaudioprocess = function(e) {
                    const audioData = e.inputBuffer.getChannelData(0);

                    // Calculate energy (volume) for silence detection
                    const energy = calculateAudioEnergy(audioData);
                    updateEnergyWindow(energy);

                    // Check if currently silent
                    const avgEnergy = calculateAverageEnergy();
                    const isSilent = avgEnergy < CLIENT_SILENCE_THRESHOLD;

                    // Handle silence/speech transitions for visual feedback
                    handleSpeechState(isSilent);

                    // Continue processing audio regardless of silence state
                    const downsampled = downsampleBuffer(audioData, audioContext.sampleRate, 24000);
                    sendAudioChunk(downsampled, speaker);
                };

                // Connect the nodes
                source.connect(streamProcessor);
                streamProcessor.connect(audioContext.destination);

                addSystemMessage('Listening - speak naturally and pause when finished');

            } catch (err) {
                console.error('Error starting audio stream:', err);
                addSystemMessage(`Microphone error: ${err.message}`);
                isStreaming = false;
                streamButton.textContent = 'Start Conversation';
                streamButton.classList.remove('recording');
            }
        }
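
        // Client-side silence detection: each chunk's mean absolute amplitude is averaged
        // over a sliding window of ENERGY_WINDOW_SIZE chunks and compared against
        // CLIENT_SILENCE_THRESHOLD; roughly CLIENT_SILENCE_DURATION_MS of sustained
        // silence switches the UI into the "Processing" state (see handleSpeechState).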

        // Calculate audio energy (volume)
        function calculateAudioEnergy(buffer) {
            let sum = 0;
            for (let i = 0; i < buffer.length; i++) {
                sum += Math.abs(buffer[i]);
            }
            return sum / buffer.length;
        }

        // Update the sliding energy window
        function updateEnergyWindow(energy) {
            energyWindow.push(energy);
            if (energyWindow.length > ENERGY_WINDOW_SIZE) {
                energyWindow.shift();
            }
        }

        // Calculate average energy from the window
        function calculateAverageEnergy() {
            if (energyWindow.length === 0) return 0;
            return energyWindow.reduce((sum, val) => sum + val, 0) / energyWindow.length;
        }

        // Handle speech state changes and visual feedback
        function handleSpeechState(isSilent) {
            if (isSpeaking && isSilent) {
                // Transition from speaking to silence
                if (!silenceTimer) {
                    silenceTimer = setTimeout(() => {
                        // Silence persisted long enough
                        streamButton.textContent = 'Processing...';
                        streamButton.classList.remove('recording');
                        streamButton.classList.add('processing');
                        addSystemMessage('Detected pause in speech, processing response...');
                    }, CLIENT_SILENCE_DURATION_MS);
                }
            } else if (!isSpeaking && !isSilent) {
                // Transition from silence to speaking
                isSpeaking = true;
                streamButton.textContent = 'Listening...';
                streamButton.classList.add('recording');
                streamButton.classList.remove('processing');

                // Clear any pending silence timer
                if (silenceTimer) {
                    clearTimeout(silenceTimer);
                    silenceTimer = null;
                }
            } else if (isSpeaking && !isSilent) {
                // Still speaking, reset any silence timer
                if (silenceTimer) {
                    clearTimeout(silenceTimer);
                    silenceTimer = null;
                }
            }

            // Update speaking state
            if (!isSilent) {
                isSpeaking = true;
            }
        }

        // Send audio chunk to server
        function sendAudioChunk(audioData, speaker) {
            const wavData = createWavBlob(audioData, 24000);
            const reader = new FileReader();

            reader.onloadend = function() {
                const base64data = reader.result;

                // Send to server (skipped if the socket is closed or still reconnecting)
                if (ws && ws.readyState === WebSocket.OPEN) {
                    ws.send(JSON.stringify({
                        action: 'stream_audio',
                        speaker: speaker,
                        audio: base64data
                    }));
                }
            };

            reader.readAsDataURL(wavData);
        }

        // Stop streaming
        function stopStreaming() {
            if (streamProcessor) {
                streamProcessor.disconnect();
                streamProcessor = null;
            }

            // Clear any pending silence timer
            if (silenceTimer) {
                clearTimeout(silenceTimer);
                silenceTimer = null;
            }

            isStreaming = false;
            isSpeaking = false;
            energyWindow = [];

            streamButton.textContent = 'Start Conversation';
            streamButton.classList.remove('recording', 'processing');
            streamButton.style.backgroundColor = ''; // Reset to default

            addSystemMessage('Conversation paused');

            // Send stop streaming signal to server
            ws.send(JSON.stringify({
                action: 'stop_streaming',
                speaker: parseInt(speakerSelectEl.value)
            }));
        }

        // Clear conversation
        function clearConversation() {
            // Clear conversation history
            ws.send(JSON.stringify({
                action: 'clear_context'
            }));

            // Clear the UI
            conversationEl.innerHTML = '';
            addSystemMessage('Conversation cleared');
        }
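
        // Note: downsampleBuffer below averages the raw input samples that map to each
        // output frame (a simple box average rather than a proper low-pass filter), and
        // assumes the capture rate is at or above the 24 kHz target used by sendAudioChunk.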

        // Downsample audio buffer to target sample rate
        function downsampleBuffer(buffer, sampleRate, targetSampleRate) {
            if (targetSampleRate === sampleRate) {
                return buffer;
            }

            const sampleRateRatio = sampleRate / targetSampleRate;
            const newLength = Math.round(buffer.length / sampleRateRatio);
            const result = new Float32Array(newLength);

            let offsetResult = 0;
            let offsetBuffer = 0;

            while (offsetResult < result.length) {
                const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
                let accum = 0, count = 0;

                for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
                    accum += buffer[i];
                    count++;
                }

                result[offsetResult] = accum / count;
                offsetResult++;
                offsetBuffer = nextOffsetBuffer;
            }

            return result;
        }

        // Create WAV blob from Float32Array
        function createWavBlob(samples, sampleRate) {
            const buffer = new ArrayBuffer(44 + samples.length * 2);
            const view = new DataView(buffer);

            // RIFF chunk descriptor
            writeString(view, 0, 'RIFF');
            view.setUint32(4, 36 + samples.length * 2, true);
            writeString(view, 8, 'WAVE');

            // fmt sub-chunk
            writeString(view, 12, 'fmt ');
            view.setUint32(16, 16, true);
            view.setUint16(20, 1, true); // PCM format
            view.setUint16(22, 1, true); // Mono channel
            view.setUint32(24, sampleRate, true);
            view.setUint32(28, sampleRate * 2, true);
            view.setUint16(32, 2, true);
            view.setUint16(34, 16, true);

            // data sub-chunk
            writeString(view, 36, 'data');
            view.setUint32(40, samples.length * 2, true);

            // Write the PCM samples as 16-bit signed integers
            for (let i = 0; i < samples.length; i++) {
                const sample = Math.max(-1, Math.min(1, samples[i]));
                view.setInt16(44 + i * 2, sample < 0 ? sample * 0x8000 : sample * 0x7FFF, true);
            }

            return new Blob([buffer], { type: 'audio/wav' });
        }

        function writeString(view, offset, string) {
            for (let i = 0; i < string.length; i++) {
                view.setUint8(offset + i, string.charCodeAt(i));
            }
        }

        // Message display functions
        function addUserTranscription(text) {
            // Find if there's already a pending user message
            let pendingMessage = document.querySelector('.message.user.pending');

            if (!pendingMessage) {
                // Create a new message
                pendingMessage = document.createElement('div');
                pendingMessage.classList.add('message', 'user', 'pending');
                conversationEl.appendChild(pendingMessage);
            }

            pendingMessage.textContent = text;
            pendingMessage.classList.remove('pending');
            conversationEl.scrollTop = conversationEl.scrollHeight;
        }

        function addAIMessage(text, audioSrc) {
            const messageEl = document.createElement('div');
            messageEl.classList.add('message', 'ai');

            if (text) {
                const textDiv = document.createElement('div');
                textDiv.textContent = text;
                messageEl.appendChild(textDiv);
            }

            const audioEl = document.createElement('audio');
            audioEl.controls = true;
            audioEl.src = audioSrc;
            messageEl.appendChild(audioEl);

            conversationEl.appendChild(messageEl);
            conversationEl.scrollTop = conversationEl.scrollHeight;
        }

        function addSystemMessage(text) {
            const messageEl = document.createElement('div');
            messageEl.classList.add('message', 'system');
            messageEl.textContent = text;
            conversationEl.appendChild(messageEl);
            conversationEl.scrollTop = conversationEl.scrollHeight;
        }
    </script>
</body>
</html>