Demo Update 3
@@ -55,27 +55,71 @@ active_clients = {} # Map client_id to client context
 def decode_audio_data(audio_data: str) -> torch.Tensor:
     """Decode base64 audio data to a torch tensor"""
     try:
+        # Skip empty audio data
+        if not audio_data:
+            print("Empty audio data received")
+            return torch.zeros(generator.sample_rate // 2)  # 0.5 seconds of silence
+
         # Extract the actual base64 content
         if ',' in audio_data:
             audio_data = audio_data.split(',')[1]
 
         # Decode base64 audio data
-        binary_data = base64.b64decode(audio_data)
+        try:
+            binary_data = base64.b64decode(audio_data)
+            print(f"Decoded base64 data: {len(binary_data)} bytes")
+        except Exception as e:
+            print(f"Base64 decoding error: {str(e)}")
+            return torch.zeros(generator.sample_rate // 2)
+
+        # Debug: save the raw binary data to examine with external tools
+        debug_path = os.path.join(base_dir, "debug_incoming.wav")
+        with open(debug_path, 'wb') as f:
+            f.write(binary_data)
+        print(f"Saved debug file to {debug_path}")
 
         # Load audio from binary data
-        with BytesIO(binary_data) as temp_file:
-            audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")
+        try:
+            with BytesIO(binary_data) as temp_file:
+                audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")
+            print(f"Loaded audio: shape={audio_tensor.shape}, sample_rate={sample_rate}Hz")
+
+            # Check if audio is valid
+            if audio_tensor.numel() == 0 or torch.isnan(audio_tensor).any():
+                print("Warning: Empty or invalid audio data detected")
+                return torch.zeros(generator.sample_rate // 2)
+        except Exception as e:
+            print(f"Audio loading error: {str(e)}")
+            # Try saving to a temporary file instead of loading from BytesIO
+            try:
+                temp_path = os.path.join(base_dir, "temp_incoming.wav")
+                with open(temp_path, 'wb') as f:
+                    f.write(binary_data)
+                print(f"Trying to load from file: {temp_path}")
+                audio_tensor, sample_rate = torchaudio.load(temp_path, format="wav")
+                print(f"Loaded from file: shape={audio_tensor.shape}, sample_rate={sample_rate}Hz")
+                os.remove(temp_path)
+            except Exception as e2:
+                print(f"Secondary audio loading error: {str(e2)}")
+                return torch.zeros(generator.sample_rate // 2)
 
         # Resample if needed
         if sample_rate != generator.sample_rate:
-            audio_tensor = torchaudio.functional.resample(
-                audio_tensor.squeeze(0),
-                orig_freq=sample_rate,
-                new_freq=generator.sample_rate
-            )
+            try:
+                print(f"Resampling from {sample_rate}Hz to {generator.sample_rate}Hz")
+                audio_tensor = torchaudio.functional.resample(
+                    audio_tensor.squeeze(0),
+                    orig_freq=sample_rate,
+                    new_freq=generator.sample_rate
+                )
+                print(f"Resampled audio shape: {audio_tensor.shape}")
+            except Exception as e:
+                print(f"Resampling error: {str(e)}")
+                return torch.zeros(generator.sample_rate // 2)
         else:
             audio_tensor = audio_tensor.squeeze(0)
 
+        print(f"Final audio tensor shape: {audio_tensor.shape}")
         return audio_tensor
     except Exception as e:
         print(f"Error decoding audio: {str(e)}")
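For context on the payload this function handles: the client sends a data URL whose tail after the comma is base64-encoded WAV bytes, which decode_audio_data turns back into a tensor. Below is a minimal round-trip sketch of that happy path (not part of the commit; the 24 kHz constant stands in for generator.sample_rate, and all names are illustrative):

import base64
import io

import torch
import torchaudio

SAMPLE_RATE = 24000  # stands in for generator.sample_rate

# Build one second of a 440 Hz sine and encode it the way the browser would
t = torch.arange(SAMPLE_RATE, dtype=torch.float32) / SAMPLE_RATE
wave = torch.sin(2 * torch.pi * 440.0 * t).unsqueeze(0)  # shape (1, N)

buf = io.BytesIO()
torchaudio.save(buf, wave, SAMPLE_RATE, format="wav")
payload = "data:audio/wav;base64," + base64.b64encode(buf.getvalue()).decode("ascii")

# The decoder's essential steps: strip the data-URL prefix, decode, load
raw = base64.b64decode(payload.split(",")[1])
tensor, sr = torchaudio.load(io.BytesIO(raw), format="wav")
print(tensor.shape, sr)  # torch.Size([1, 24000]) 24000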
@@ -70,88 +70,18 @@ function initializeApp() {
 
 // Initialize UI elements
 function initializeUIElements() {
-    // Main UI containers
-    const chatContainer = document.querySelector('.chat-container');
-    const controlPanel = document.querySelector('.control-panel');
-
-    // Create conversation section
-    chatContainer.innerHTML = `
-        <div class="chat-header">
-            <h2>Conversation</h2>
-            <div class="status-indicator">
-                <div class="status-dot"></div>
-                <span class="status-text">Disconnected</span>
-            </div>
-        </div>
-        <div class="conversation"></div>
-    `;
-
-    // Create control panel
-    controlPanel.innerHTML = `
-        <div class="visualizer-section">
-            <div class="visualizer-container">
-                <canvas id="audioVisualizer"></canvas>
-                <div class="visualizer-label">Speak to see audio visualization</div>
-            </div>
-        </div>
-
-        <div class="controls">
-            <div class="control-group">
-                <div class="control-label">Voice Controls</div>
-
-                <div class="volume-indicator">
-                    <div class="volume-level" style="width:0%"></div>
-                </div>
-
-                <div class="slider-container">
-                    <div class="slider-label">
-                        <span>Silence Threshold</span>
-                        <span id="thresholdValue">0.01</span>
-                    </div>
-                    <input type="range" id="thresholdSlider" min="0.001" max="0.05" step="0.001" value="0.01">
-                </div>
-
-                <select id="speakerSelection">
-                    <option value="0">Speaker 1 (You)</option>
-                    <option value="1">Speaker 2 (Alternative)</option>
-                </select>
-
-                <div class="button-row">
-                    <button id="streamButton"><i class="fas fa-microphone"></i> Start Conversation</button>
-                    <button id="clearButton"><i class="fas fa-trash"></i> Clear</button>
-                </div>
-            </div>
-
-            <div class="control-group settings-panel">
-                <div class="control-label">Settings</div>
-
-                <div class="settings-toggles">
-                    <div class="toggle-switch">
-                        <input type="checkbox" id="autoPlayResponses" checked>
-                        <label for="autoPlayResponses">Auto-play AI responses</label>
-                    </div>
-
-                    <div class="toggle-switch">
-                        <input type="checkbox" id="showVisualizer" checked>
-                        <label for="showVisualizer">Show audio visualizer</label>
-                    </div>
-                </div>
-            </div>
-        </div>
-    `;
-
     // Store references to UI elements
-    elements.conversation = document.querySelector('.conversation');
+    elements.conversation = document.getElementById('conversation');
     elements.streamButton = document.getElementById('streamButton');
     elements.clearButton = document.getElementById('clearButton');
     elements.thresholdSlider = document.getElementById('thresholdSlider');
     elements.thresholdValue = document.getElementById('thresholdValue');
     elements.visualizerCanvas = document.getElementById('audioVisualizer');
-    elements.visualizerLabel = document.querySelector('.visualizer-label');
-    elements.volumeLevel = document.querySelector('.volume-level');
-    elements.statusDot = document.querySelector('.status-dot');
-    elements.statusText = document.querySelector('.status-text');
-    elements.speakerSelection = document.getElementById('speakerSelection');
+    elements.visualizerLabel = document.getElementById('visualizerLabel');
+    elements.volumeLevel = document.getElementById('volumeLevel');
+    elements.statusDot = document.getElementById('statusDot');
+    elements.statusText = document.getElementById('statusText');
+    elements.speakerSelection = document.getElementById('speakerSelect'); // Changed to match HTML
     elements.autoPlayResponses = document.getElementById('autoPlayResponses');
     elements.showVisualizer = document.getElementById('showVisualizer');
 }
@@ -364,8 +294,12 @@ function stopStreaming(notifyServer = true) {
 function handleAudioProcess(event) {
     const inputData = event.inputBuffer.getChannelData(0);
 
+    // Log audio buffer statistics
+    console.log(`Audio buffer: length=${inputData.length}, sample rate=${state.audioContext.sampleRate}Hz`);
+
     // Calculate audio energy (volume level)
     const energy = calculateAudioEnergy(inputData);
+    console.log(`Energy: ${energy.toFixed(6)}, threshold: ${state.silenceThreshold}`);
 
     // Update energy window for averaging
     updateEnergyWindow(energy);
@@ -375,6 +309,7 @@ function handleAudioProcess(event) {
 
     // Determine if audio is silent
     const isSilent = avgEnergy < state.silenceThreshold;
+    console.log(`Silent: ${isSilent ? 'Yes' : 'No'}, avg energy: ${avgEnergy.toFixed(6)}`);
 
     // Handle speech state based on silence
     handleSpeechState(isSilent);
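The silence gate above compares a rolling average of per-buffer energy against the slider threshold (default 0.01). The bodies of calculateAudioEnergy and updateEnergyWindow sit outside this diff, so the following is only a sketch of that logic, assuming RMS energy and a 10-buffer window:

from collections import deque

import numpy as np

SILENCE_THRESHOLD = 0.01          # thresholdSlider default
energy_window = deque(maxlen=10)  # rolling window of recent energies (size assumed)

def rms_energy(buffer: np.ndarray) -> float:
    """Root-mean-square energy of one audio buffer."""
    return float(np.sqrt(np.mean(buffer ** 2)))

def is_silent(buffer: np.ndarray) -> bool:
    energy_window.append(rms_energy(buffer))
    avg_energy = sum(energy_window) / len(energy_window)
    return avg_energy < SILENCE_THRESHOLD

# Quiet noise reads as silence; a half-scale 440 Hz tone does not
quiet = (0.001 * np.random.randn(1024)).astype(np.float32)
tone = (0.5 * np.sin(2 * np.pi * 440 * np.arange(1024) / 48000)).astype(np.float32)
print(is_silent(quiet), is_silent(tone))  # True False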
@@ -384,6 +319,7 @@ function handleAudioProcess(event) {
     // Create a resampled version at 24kHz for the server
     // Most WebRTC audio is 48kHz, but we want 24kHz for the model
     const resampledData = downsampleBuffer(inputData, state.audioContext.sampleRate, 24000);
+    console.log(`Resampled audio: ${state.audioContext.sampleRate}Hz → 24000Hz, new length: ${resampledData.length}`);
 
     // Send the audio chunk to the server
     sendAudioChunk(resampledData, state.currentSpeaker);
@@ -530,20 +466,132 @@ function sendAudioChunk(audioData, speaker) {
         return;
     }
 
-    const wavData = createWavBlob(audioData, 24000);
-    const reader = new FileReader();
+    console.log(`Creating WAV from audio data: length=${audioData.length}`);
 
-    reader.onloadend = function() {
-        const base64data = reader.result;
+    // Check for NaN or invalid values
+    let hasNaN = false;
+    let min = Infinity;
+    let max = -Infinity;
+    let sum = 0;
 
-        // Send the audio chunk to the server
-        state.socket.emit('stream_audio', {
-            audio: base64data,
-            speaker: speaker
-        });
-    };
+    for (let i = 0; i < audioData.length; i++) {
+        if (isNaN(audioData[i]) || !isFinite(audioData[i])) {
+            hasNaN = true;
+            console.warn(`Invalid audio value at index ${i}: ${audioData[i]}`);
+            break;
+        }
+        min = Math.min(min, audioData[i]);
+        max = Math.max(max, audioData[i]);
+        sum += audioData[i];
+    }
 
-    reader.readAsDataURL(wavData);
+    if (hasNaN) {
+        console.warn('Audio data contains NaN or Infinity values. Creating silent audio instead.');
+        audioData = new Float32Array(audioData.length).fill(0);
+    } else {
+        const avg = sum / audioData.length;
+        console.log(`Audio stats: min=${min.toFixed(4)}, max=${max.toFixed(4)}, avg=${avg.toFixed(4)}`);
+    }
+
+    try {
+        // Create WAV blob with proper format
+        const wavData = createWavBlob(audioData, 24000);
+        console.log(`WAV blob created: size=${wavData.size} bytes, type=${wavData.type}`);
+
+        const reader = new FileReader();
+
+        reader.onloadend = function() {
+            try {
+                // Get base64 data
+                const base64data = reader.result;
+                console.log(`Base64 data created: length=${base64data.length}`);
+
+                // Validate the base64 data before sending
+                if (!base64data || base64data.length < 100) {
+                    console.warn('Generated base64 data is too small or invalid');
+                    return;
+                }
+
+                // Send the audio chunk to the server
+                console.log('Sending audio data to server...');
+                state.socket.emit('stream_audio', {
+                    audio: base64data,
+                    speaker: speaker
+                });
+                console.log('Audio data sent successfully');
+            } catch (err) {
+                console.error('Error preparing audio data:', err);
+            }
+        };
+
+        reader.onerror = function(err) {
+            console.error('Error reading audio data:', err);
+        };
+
+        reader.readAsDataURL(wavData);
+    } catch (err) {
+        console.error('Error creating WAV data:', err);
+    }
+}
+
+// Create WAV blob from audio data with validation
+function createWavBlob(audioData, sampleRate) {
+    // Check if audio data is valid
+    if (!audioData || audioData.length === 0) {
+        console.warn('Empty audio data received');
+        // Return a tiny silent audio snippet instead
+        audioData = new Float32Array(100).fill(0);
+    }
+
+    // Function to convert Float32Array to Int16Array for WAV format
+    function floatTo16BitPCM(output, offset, input) {
+        for (let i = 0; i < input.length; i++, offset += 2) {
+            const s = Math.max(-1, Math.min(1, input[i]));
+            output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
+        }
+    }
+
+    // Create WAV header
+    function writeString(view, offset, string) {
+        for (let i = 0; i < string.length; i++) {
+            view.setUint8(offset + i, string.charCodeAt(i));
+        }
+    }
+
+    // Create WAV file with header
+    function encodeWAV(samples) {
+        const buffer = new ArrayBuffer(44 + samples.length * 2);
+        const view = new DataView(buffer);
+
+        // RIFF chunk descriptor
+        writeString(view, 0, 'RIFF');
+        view.setUint32(4, 36 + samples.length * 2, true);
+        writeString(view, 8, 'WAVE');
+
+        // fmt sub-chunk
+        writeString(view, 12, 'fmt ');
+        view.setUint32(16, 16, true);
+        view.setUint16(20, 1, true); // PCM format
+        view.setUint16(22, 1, true); // Mono channel
+        view.setUint32(24, sampleRate, true);
+        view.setUint32(28, sampleRate * 2, true); // Byte rate
+        view.setUint16(32, 2, true); // Block align
+        view.setUint16(34, 16, true); // Bits per sample
+
+        // data sub-chunk
+        writeString(view, 36, 'data');
+        view.setUint32(40, samples.length * 2, true);
+        floatTo16BitPCM(view, 44, samples);
+
+        return buffer;
+    }
+
+    // Convert audio data to TypedArray if it's a regular Array
+    const samples = Array.isArray(audioData) ? new Float32Array(audioData) : audioData;
+
+    // Create WAV blob
+    const wavBuffer = encodeWAV(samples);
+    return new Blob([wavBuffer], { type: 'audio/wav' });
 }
 
 // Draw audio visualizer
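The 44-byte header built by encodeWAV is the standard RIFF/WAVE layout for 16-bit mono PCM, little-endian throughout. Here is a Python equivalent of just the header construction, for cross-checking the server side (illustrative, not part of the commit; byte offsets match the setUint16/setUint32 calls above):

import struct

def wav_header(num_samples: int, sample_rate: int) -> bytes:
    data_size = num_samples * 2  # 2 bytes per 16-bit sample
    return (
        b"RIFF"
        + struct.pack("<I", 36 + data_size)  # offset 4: remaining chunk size
        + b"WAVE"
        + b"fmt "
        + struct.pack(
            "<IHHIIHH",
            16,               # offset 16: fmt sub-chunk size
            1,                # offset 20: PCM format
            1,                # offset 22: mono channel
            sample_rate,      # offset 24: sample rate
            sample_rate * 2,  # offset 28: byte rate
            2,                # offset 32: block align
            16,               # offset 34: bits per sample
        )
        + b"data"
        + struct.pack("<I", data_size)  # offset 40: data sub-chunk size
    )

header = wav_header(num_samples=24000, sample_rate=24000)
assert len(header) == 44 and header[:4] == b"RIFF"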
@@ -757,59 +805,6 @@ function addSystemMessage(message) {
     elements.conversation.scrollTop = elements.conversation.scrollHeight;
 }
 
-// Create WAV blob from audio data
-function createWavBlob(audioData, sampleRate) {
-    // Function to convert Float32Array to Int16Array for WAV format
-    function floatTo16BitPCM(output, offset, input) {
-        for (let i = 0; i < input.length; i++, offset += 2) {
-            const s = Math.max(-1, Math.min(1, input[i]));
-            output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
-        }
-    }
-
-    // Create WAV header
-    function writeString(view, offset, string) {
-        for (let i = 0; i < string.length; i++) {
-            view.setUint8(offset + i, string.charCodeAt(i));
-        }
-    }
-
-    // Create WAV file with header
-    function encodeWAV(samples) {
-        const buffer = new ArrayBuffer(44 + samples.length * 2);
-        const view = new DataView(buffer);
-
-        // RIFF chunk descriptor
-        writeString(view, 0, 'RIFF');
-        view.setUint32(4, 36 + samples.length * 2, true);
-        writeString(view, 8, 'WAVE');
-
-        // fmt sub-chunk
-        writeString(view, 12, 'fmt ');
-        view.setUint32(16, 16, true);
-        view.setUint16(20, 1, true); // PCM format
-        view.setUint16(22, 1, true); // Mono channel
-        view.setUint32(24, sampleRate, true);
-        view.setUint32(28, sampleRate * 2, true); // Byte rate
-        view.setUint16(32, 2, true); // Block align
-        view.setUint16(34, 16, true); // Bits per sample
-
-        // data sub-chunk
-        writeString(view, 36, 'data');
-        view.setUint32(40, samples.length * 2, true);
-        floatTo16BitPCM(view, 44, samples);
-
-        return buffer;
-    }
-
-    // Convert audio data to TypedArray if it's a regular Array
-    const samples = Array.isArray(audioData) ? new Float32Array(audioData) : audioData;
-
-    // Create WAV blob
-    const wavBuffer = encodeWAV(samples);
-    return new Blob([wavBuffer], { type: 'audio/wav' });
-}
-
 // Downsample audio buffer to target sample rate
 function downsampleBuffer(buffer, originalSampleRate, targetSampleRate) {
     if (originalSampleRate === targetSampleRate) {
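The diff is cut off inside downsampleBuffer, so its body is not shown here. As a sketch of what such a helper commonly does for the 48 kHz → 24 kHz path (an assumption, not the commit's actual implementation), block-averaging decimation looks like:

import numpy as np

def downsample_buffer(buffer: np.ndarray,
                      original_sample_rate: int,
                      target_sample_rate: int) -> np.ndarray:
    """Hypothetical stand-in for the truncated downsampleBuffer."""
    if original_sample_rate == target_sample_rate:
        return buffer
    ratio = original_sample_rate / target_sample_rate
    new_length = int(len(buffer) / ratio)
    out = np.empty(new_length, dtype=np.float32)
    for i in range(new_length):
        start = int(i * ratio)
        end = min(int((i + 1) * ratio), len(buffer))
        out[i] = buffer[start:end].mean()  # average each block of source samples
    return out

chunk = np.random.randn(4096).astype(np.float32)  # e.g. one 48 kHz ScriptProcessor buffer
print(len(downsample_buffer(chunk, 48000, 24000)))  # 2048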