Demo Update 3

2025-03-29 23:54:02 -04:00
parent b74ae2dbfc
commit eef7da454a
2 changed files with 320 additions and 112 deletions

View File

@@ -16,6 +16,28 @@ import gc
from collections import deque
from threading import Lock
# Add these lines right after your imports
import torch
import os
# Handle CUDA issues
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit to first GPU only
torch.backends.cudnn.benchmark = True
# Set CUDA settings to avoid TF32 warnings
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Set compute type based on available hardware
if torch.cuda.is_available():
device = "cuda"
compute_type = "float16" # Faster for CUDA
else:
device = "cpu"
compute_type = "int8" # Better for CPU
print(f"Using device: {device} with compute type: {compute_type}")
# Select device
if torch.cuda.is_available():
device = "cuda"
@@ -28,9 +50,22 @@ generator = load_csm_1b(device=device)
# Initialize WhisperX for ASR
print("Loading WhisperX model...")
try:
# Try to load a smaller model for faster response times
asr_model = whisperx.load_model("small", device, compute_type=compute_type)
print("WhisperX 'small' model loaded successfully")
except Exception as e:
print(f"Error loading 'small' model: {str(e)}")
try:
# Fall back to tiny model if small fails
asr_model = whisperx.load_model("tiny", device, compute_type=compute_type)
print("WhisperX 'tiny' model loaded as fallback")
except Exception as e2:
print(f"Error loading fallback model: {str(e2)}")
print("Trying CPU model as last resort")
# Last resort - try CPU
asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8")
print("WhisperX loaded on CPU as last resort")
# Silence detection parameters
SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization
@@ -53,76 +88,130 @@ active_clients = {} # Map client_id to client context
# Helper function to convert audio data
def decode_audio_data(audio_data: str) -> torch.Tensor:
"""Decode base64 audio data to a torch tensor with improved error handling"""
try:
# Skip empty audio data
if not audio_data or len(audio_data) < 100:
print("Empty or too short audio data received")
return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence
# Extract the actual base64 content
if ',' in audio_data:
# Handle data URL format (data:audio/wav;base64,...)
audio_data = audio_data.split(',')[1]
# Decode base64 audio data
try:
binary_data = base64.b64decode(audio_data)
print(f"Decoded base64 data: {len(binary_data)} bytes")
# Check if we have enough data for a valid WAV
if len(binary_data) < 44: # WAV header is 44 bytes
print("Data too small to be a valid WAV file")
return torch.zeros(generator.sample_rate // 2)
except Exception as e:
print(f"Base64 decoding error: {str(e)}")
return torch.zeros(generator.sample_rate // 2)
# Save for debugging
debug_path = os.path.join(base_dir, "debug_incoming.wav")
with open(debug_path, 'wb') as f:
f.write(binary_data)
print(f"Saved debug file: {debug_path}")
# Approach 1: Load directly with torchaudio
try:
with BytesIO(binary_data) as temp_file:
temp_file.seek(0) # Ensure we're at the start of the buffer
audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")
print(f"Direct loading success: shape={audio_tensor.shape}, rate={sample_rate}Hz")
# Check if audio is valid
if audio_tensor.numel() == 0 or torch.isnan(audio_tensor).any():
raise ValueError("Empty or invalid audio tensor detected")
except Exception as e:
print(f"Direct loading failed: {str(e)}")
# Approach 2: Try to fix/normalize the WAV data
try:
# Sometimes WAV headers can be malformed, attempt to fix
temp_path = os.path.join(base_dir, "temp_fixing.wav")
with open(temp_path, 'wb') as f:
f.write(binary_data)
# Use a simpler numpy approach as backup
import numpy as np
import wave
try:
with wave.open(temp_path, 'rb') as wf:
n_channels = wf.getnchannels()
sample_width = wf.getsampwidth()
sample_rate = wf.getframerate()
n_frames = wf.getnframes()
# Read the frames
frames = wf.readframes(n_frames)
print(f"Wave reading: channels={n_channels}, rate={sample_rate}Hz, frames={n_frames}")
# Convert to numpy and then to torch
if sample_width == 2: # 16-bit audio
data = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0
elif sample_width == 1: # 8-bit audio
data = np.frombuffer(frames, dtype=np.uint8).astype(np.float32) / 128.0 - 1.0
else:
raise ValueError(f"Unsupported sample width: {sample_width}")
# Convert to mono if needed
if n_channels > 1:
data = data.reshape(-1, n_channels)
data = data.mean(axis=1)
# Convert to torch tensor
audio_tensor = torch.from_numpy(data)
print(f"Successfully converted with numpy: shape={audio_tensor.shape}")
except Exception as wave_error:
print(f"Wave processing failed: {str(wave_error)}")
# Try with torchaudio as last resort
audio_tensor, sample_rate = torchaudio.load(temp_path, format="wav")
# Clean up
if os.path.exists(temp_path):
os.remove(temp_path)
except Exception as e2:
print(f"All WAV loading methods failed: {str(e2)}")
print("Returning silence as fallback")
return torch.zeros(generator.sample_rate // 2)
# Ensure audio is the right shape (mono)
if len(audio_tensor.shape) > 1 and audio_tensor.shape[0] > 1:
audio_tensor = torch.mean(audio_tensor, dim=0)
# Ensure we have a 1D tensor
audio_tensor = audio_tensor.squeeze()
# Resample if needed
if sample_rate != generator.sample_rate:
try:
print(f"Resampling from {sample_rate}Hz to {generator.sample_rate}Hz")
resampler = torchaudio.transforms.Resample(
orig_freq=sample_rate,
new_freq=generator.sample_rate
)
audio_tensor = resampler(audio_tensor)
except Exception as e:
print(f"Resampling error: {str(e)}")
# If resampling fails, just return the original audio
# The model can often handle different sample rates
# Normalize audio to avoid issues
if torch.abs(audio_tensor).max() > 0:
audio_tensor = audio_tensor / torch.abs(audio_tensor).max()
print(f"Final audio tensor: shape={audio_tensor.shape}, min={audio_tensor.min().item():.4f}, max={audio_tensor.max().item():.4f}")
return audio_tensor
except Exception as e:
print(f"Unhandled error in decode_audio_data: {str(e)}")
# Return a small silent audio segment as fallback
return torch.zeros(generator.sample_rate // 2) # 0.5 seconds of silence
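
For reference, the happy path of decode_audio_data can be exercised end to end without a browser. The sketch below is standalone and assumes only torchaudio: it builds a short sine wave, encodes it as a base64 data URL the way the client would, and decodes it back with the same BytesIO + torchaudio.load approach, followed by the mono collapse and peak normalization used above.

import base64
from io import BytesIO

import torch
import torchaudio

sample_rate = 24000
t = torch.arange(sample_rate) / sample_rate
tone = 0.5 * torch.sin(2 * torch.pi * 440.0 * t)  # 1 second of a 440 Hz tone

# Encode: tensor -> WAV bytes -> base64 data URL (what the client sends)
buf = BytesIO()
torchaudio.save(buf, tone.unsqueeze(0), sample_rate, format="wav")
data_url = "data:audio/wav;base64," + base64.b64encode(buf.getvalue()).decode("ascii")

# Decode: mirrors the happy path of decode_audio_data
payload = data_url.split(",")[1]
wav_bytes = base64.b64decode(payload)
decoded, rate = torchaudio.load(BytesIO(wav_bytes), format="wav")
decoded = decoded.mean(dim=0)            # collapse to mono
decoded = decoded / decoded.abs().max()  # peak-normalize
print(decoded.shape, rate)               # ~24000 samples at 24000 Hz
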
@@ -143,6 +232,8 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str:
temp_path = os.path.join(base_dir, "temp_audio.wav")
torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate)
print(f"Transcribing audio file: {temp_path} (size: {os.path.getsize(temp_path)} bytes)")
# Load and transcribe the audio
audio = whisperx.load_audio(temp_path)
result = asr_model.transcribe(audio, batch_size=16)
@@ -155,11 +246,15 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str:
if result["segments"] and len(result["segments"]) > 0: if result["segments"] and len(result["segments"]) > 0:
# Combine all segments # Combine all segments
transcription = " ".join([segment["text"] for segment in result["segments"]]) transcription = " ".join([segment["text"] for segment in result["segments"]])
print(f"Transcription successful: '{transcription.strip()}'")
return transcription.strip()
else:
print("Transcription returned no segments")
return "" return ""
except Exception as e: except Exception as e:
print(f"Error in transcription: {str(e)}") print(f"Error in transcription: {str(e)}")
import traceback
traceback.print_exc()
if os.path.exists("temp_audio.wav"):
os.remove("temp_audio.wav")
return ""
@@ -385,43 +480,73 @@ def handle_stream_audio(data):
# Log the transcription
print(f"[{client_id}] Transcribed text: '{transcribed_text}'")
# Handle the transcription result
if transcribed_text:
# Add user message to context
user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
client['context_segments'].append(user_segment)
# Send the transcribed text to client
emit('transcription', {
'type': 'transcription',
'text': transcribed_text
})
# Generate a contextual response
response_text = generate_response(transcribed_text, client['context_segments'])
print(f"[{client_id}] Generating audio response: '{response_text}'")
# Let the client know we're processing
emit('processing_status', {
'type': 'processing_status',
'status': 'generating_audio',
'message': 'Generating audio response...'
})
# Generate audio for the response
try:
# Use a different speaker than the user
ai_speaker_id = 1 if speaker_id == 0 else 0
# Start audio generation with streaming (chunk by chunk)
audio_chunks = []
# This version tries to stream the audio generation in smaller chunks
# Note: CSM model doesn't natively support incremental generation,
# so we're simulating it here for a more responsive UI experience
# Generate the full response
audio_tensor = generator.generate(
text=response_text,
speaker=ai_speaker_id,
context=client['context_segments'],
max_audio_length_ms=10_000,
)
# Add response to context
ai_segment = Segment(
text=response_text,
speaker=ai_speaker_id,
audio=audio_tensor
)
client['context_segments'].append(ai_segment)
# Convert audio to base64 and send back to client
audio_base64 = encode_audio_data(audio_tensor)
emit('audio_response', {
'type': 'audio_response',
'text': response_text,
'audio': audio_base64
})
print(f"[{client_id}] Audio response sent: {len(audio_base64)} bytes")
except Exception as gen_error:
print(f"Error generating audio response: {str(gen_error)}")
emit('error', {
'type': 'error',
'message': "Sorry, there was an error generating the audio response."
})
else:
# If transcription failed, send a generic response
emit('error', {
@@ -437,6 +562,7 @@ def handle_stream_audio(data):
# If buffer gets too large without silence, process it anyway
elif len(client['streaming_buffer']) >= 30: # ~6 seconds of audio at 5 chunks/sec
print(f"[{client_id}] Processing long audio segment without silence")
full_audio = torch.cat(client['streaming_buffer'], dim=0)
# Process with WhisperX speech-to-text
@@ -453,7 +579,9 @@ def handle_stream_audio(data):
'text': transcribed_text + " (processing continued speech...)"
})
# Keep half of the buffer for context (sliding window approach)
half_point = len(client['streaming_buffer']) // 2
client['streaming_buffer'] = client['streaming_buffer'][half_point:]
except Exception as e:
import traceback
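
The sliding-window trim above keeps the most recent half of the chunk buffer so the next transcription still carries some leading context. A tiny sketch of the same idea, using integers in place of audio chunk tensors.

buffer = list(range(30))        # stand-in for ~6 seconds of audio chunks
half_point = len(buffer) // 2
buffer = buffer[half_point:]    # keep the most recent half as context
print(len(buffer))              # 15 chunks carried into the next window
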
@@ -497,6 +625,62 @@ def handle_stop_streaming(data):
'status': 'stopped'
})
def stream_audio_to_client(client_id, audio_tensor, text, speaker_id, chunk_size_ms=500):
"""Stream audio to client in chunks to simulate real-time generation"""
try:
if client_id not in active_clients:
print(f"Client {client_id} not found for streaming")
return
# Calculate chunk size in samples
chunk_size = int(generator.sample_rate * chunk_size_ms / 1000)
total_chunks = math.ceil(audio_tensor.size(0) / chunk_size)
print(f"Streaming audio in {total_chunks} chunks of {chunk_size_ms}ms each")
# Send initial response with text but no audio yet
socketio.emit('audio_response_start', {
'type': 'audio_response_start',
'text': text,
'total_chunks': total_chunks
}, room=client_id)
# Stream each chunk
for i in range(total_chunks):
start_idx = i * chunk_size
end_idx = min(start_idx + chunk_size, audio_tensor.size(0))
# Extract chunk
chunk = audio_tensor[start_idx:end_idx]
# Encode chunk
chunk_base64 = encode_audio_data(chunk)
# Send chunk
socketio.emit('audio_response_chunk', {
'type': 'audio_response_chunk',
'chunk_index': i,
'total_chunks': total_chunks,
'audio': chunk_base64,
'is_last': i == total_chunks - 1
}, room=client_id)
# Brief pause between chunks to simulate streaming
time.sleep(0.1)
# Send completion message
socketio.emit('audio_response_complete', {
'type': 'audio_response_complete',
'text': text
}, room=client_id)
print(f"Audio streaming complete: {total_chunks} chunks sent")
except Exception as e:
print(f"Error streaming audio to client: {str(e)}")
import traceback
traceback.print_exc()
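
The chunk arithmetic in stream_audio_to_client is worth a quick worked example. Assuming the 24 kHz output rate used elsewhere in this demo and the default 500 ms chunks, a 3.2 s response splits into 7 chunks, the last one partial.

import math
import torch

sample_rate = 24000                                    # assumed CSM output rate
chunk_size_ms = 500
chunk_size = int(sample_rate * chunk_size_ms / 1000)   # 12000 samples per chunk
audio = torch.zeros(int(sample_rate * 3.2))            # a 3.2 s response = 76800 samples
total_chunks = math.ceil(audio.size(0) / chunk_size)   # ceil(76800 / 12000) = 7
chunks = [audio[i * chunk_size:(i + 1) * chunk_size] for i in range(total_chunks)]
print(total_chunks, [c.size(0) for c in chunks])       # 7, last chunk has 4800 samples
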
if __name__ == "__main__":
print(f"\n{'='*60}")
print(f"🔊 Sesame AI Voice Chat Server (Flask Implementation)")

View File

@@ -466,37 +466,27 @@ function sendAudioChunk(audioData, speaker) {
return;
}
console.log(`Preparing audio chunk: length=${audioData.length}, speaker=${speaker}`);
// Check for NaN or invalid values
let hasInvalidValues = false;
for (let i = 0; i < audioData.length; i++) {
if (isNaN(audioData[i]) || !isFinite(audioData[i])) {
hasInvalidValues = true;
console.warn(`Invalid audio value at index ${i}: ${audioData[i]}`);
break;
}
}
if (hasInvalidValues) {
console.warn('Audio data contains invalid values. Creating silent audio.');
audioData = new Float32Array(audioData.length).fill(0);
}
try {
// Create WAV blob
const wavData = createWavBlob(audioData, 24000);
console.log(`WAV blob created: ${wavData.size} bytes`);
const reader = new FileReader();
@@ -504,28 +494,21 @@ function sendAudioChunk(audioData, speaker) {
try {
// Get base64 data
const base64data = reader.result;
console.log(`Base64 data created: ${base64data.length} bytes`);
// Send to server
state.socket.emit('stream_audio', {
audio: base64data,
speaker: speaker
});
console.log('Audio chunk sent to server');
} catch (err) {
console.error('Error preparing audio data:', err);
}
};
reader.onerror = function() {
console.error('Error reading audio data as base64');
};
reader.readAsDataURL(wavData);
@@ -534,19 +517,20 @@ function sendAudioChunk(audioData, speaker) {
}
}
// Create WAV blob from audio data with improved error handling
function createWavBlob(audioData, sampleRate) {
// Validate input
if (!audioData || audioData.length === 0) {
console.warn('Empty audio data provided to createWavBlob');
audioData = new Float32Array(1024).fill(0); // Create 1024 samples of silence
}
// Function to convert Float32Array to Int16Array for WAV format
function floatTo16BitPCM(output, offset, input) {
for (let i = 0; i < input.length; i++, offset += 2) {
// Ensure values are in -1 to 1 range
const s = Math.max(-1, Math.min(1, input[i]));
// Convert to 16-bit PCM
output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
}
}
@@ -558,40 +542,80 @@ function createWavBlob(audioData, sampleRate) {
}
}
try {
// Create WAV file with header - careful with buffer sizes
const buffer = new ArrayBuffer(44 + audioData.length * 2);
const view = new DataView(buffer);
// RIFF identifier
writeString(view, 0, 'RIFF');
// File length (will be filled later)
view.setUint32(4, 36 + audioData.length * 2, true);
// WAVE identifier
writeString(view, 8, 'WAVE');
// fmt chunk identifier
writeString(view, 12, 'fmt ');
// fmt chunk length
view.setUint32(16, 16, true);
// Sample format (1 is PCM)
view.setUint16(20, 1, true);
// Mono channel
view.setUint16(22, 1, true);
// Sample rate
view.setUint32(24, sampleRate, true);
// Byte rate (sample rate * block align)
view.setUint32(28, sampleRate * 2, true);
// Block align (channels * bytes per sample)
view.setUint16(32, 2, true);
// Bits per sample
view.setUint16(34, 16, true);
// data chunk identifier
writeString(view, 36, 'data');
// data chunk length
view.setUint32(40, audioData.length * 2, true);
// Write the PCM samples
floatTo16BitPCM(view, 44, audioData);
// Create and return blob
return new Blob([view], { type: 'audio/wav' });
} catch (err) {
console.error('Error in createWavBlob:', err);
// Create a minimal valid WAV file with silence as fallback
const fallbackSamples = new Float32Array(1024).fill(0);
const fallbackBuffer = new ArrayBuffer(44 + fallbackSamples.length * 2);
const fallbackView = new DataView(fallbackBuffer);
writeString(fallbackView, 0, 'RIFF');
fallbackView.setUint32(4, 36 + fallbackSamples.length * 2, true);
writeString(fallbackView, 8, 'WAVE');
writeString(fallbackView, 12, 'fmt ');
fallbackView.setUint32(16, 16, true);
fallbackView.setUint16(20, 1, true);
fallbackView.setUint16(22, 1, true);
fallbackView.setUint32(24, sampleRate, true);
fallbackView.setUint32(28, sampleRate * 2, true);
fallbackView.setUint16(32, 2, true);
fallbackView.setUint16(34, 16, true);
writeString(fallbackView, 36, 'data');
fallbackView.setUint32(40, fallbackSamples.length * 2, true);
floatTo16BitPCM(fallbackView, 44, fallbackSamples);
return new Blob([fallbackView], { type: 'audio/wav' });
}
}
// Draw audio visualizer