Demo Update 3
@@ -16,6 +16,28 @@ import gc
 from collections import deque
 from threading import Lock
 
+# Add these lines right after your imports
+import torch
+import os
+
+# Handle CUDA issues
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Limit to first GPU only
+torch.backends.cudnn.benchmark = True
+
+# Set CUDA settings to avoid TF32 warnings
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+
+# Set compute type based on available hardware
+if torch.cuda.is_available():
+    device = "cuda"
+    compute_type = "float16"  # Faster for CUDA
+else:
+    device = "cpu"
+    compute_type = "int8"  # Better for CPU
+
+print(f"Using device: {device} with compute type: {compute_type}")
+
 # Select device
 if torch.cuda.is_available():
     device = "cuda"
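A quick sanity check of the block above (a minimal sketch, not part of the commit; it assumes a CUDA-enabled PyTorch build, and note that CUDA_VISIBLE_DEVICES generally only takes effect if it is set before CUDA is first initialized):

    # Illustrative check: confirm device selection and TF32 flags after the setup above has run.
    import torch

    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"Visible GPU: {torch.cuda.get_device_name(0)}")
    print(f"matmul TF32 enabled: {torch.backends.cuda.matmul.allow_tf32}")
    print(f"cuDNN TF32 enabled:  {torch.backends.cudnn.allow_tf32}")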
@@ -28,9 +50,22 @@ generator = load_csm_1b(device=device)
 
 # Initialize WhisperX for ASR
 print("Loading WhisperX model...")
-# Use a smaller model for faster response times
-asr_model = whisperx.load_model("medium", device, compute_type="float16")
-print("WhisperX model loaded!")
+try:
+    # Try to load a smaller model for faster response times
+    asr_model = whisperx.load_model("small", device, compute_type=compute_type)
+    print("WhisperX 'small' model loaded successfully")
+except Exception as e:
+    print(f"Error loading 'small' model: {str(e)}")
+    try:
+        # Fall back to tiny model if small fails
+        asr_model = whisperx.load_model("tiny", device, compute_type=compute_type)
+        print("WhisperX 'tiny' model loaded as fallback")
+    except Exception as e2:
+        print(f"Error loading fallback model: {str(e2)}")
+        print("Trying CPU model as last resort")
+        # Last resort - try CPU
+        asr_model = whisperx.load_model("tiny", "cpu", compute_type="int8")
+        print("WhisperX loaded on CPU as last resort")
 
 # Silence detection parameters
 SILENCE_THRESHOLD = 0.01 # Adjust based on your audio normalization
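The nested try/except above can also be written as a loop over candidate configurations. A minimal sketch under the same assumptions (whisperx, device, and compute_type as defined earlier); the helper name and candidate list are illustrative, not part of the commit:

    def load_asr_model_with_fallback():
        """Try progressively cheaper WhisperX configurations until one loads."""
        candidates = [
            ("small", device, compute_type),  # preferred
            ("tiny", device, compute_type),   # smaller model, same device
            ("tiny", "cpu", "int8"),          # last resort: CPU
        ]
        last_error = None
        for model_name, dev, ctype in candidates:
            try:
                model = whisperx.load_model(model_name, dev, compute_type=ctype)
                print(f"WhisperX '{model_name}' loaded on {dev} ({ctype})")
                return model
            except Exception as exc:
                print(f"Failed to load '{model_name}' on {dev}: {exc}")
                last_error = exc
        raise RuntimeError("Could not load any WhisperX model") from last_error

    asr_model = load_asr_model_with_fallback()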
@@ -53,76 +88,130 @@ active_clients = {} # Map client_id to client context
 
 # Helper function to convert audio data
 def decode_audio_data(audio_data: str) -> torch.Tensor:
-    """Decode base64 audio data to a torch tensor"""
+    """Decode base64 audio data to a torch tensor with improved error handling"""
     try:
         # Skip empty audio data
-        if not audio_data:
-            print("Empty audio data received")
+        if not audio_data or len(audio_data) < 100:
+            print("Empty or too short audio data received")
             return torch.zeros(generator.sample_rate // 2)  # 0.5 seconds of silence
 
         # Extract the actual base64 content
         if ',' in audio_data:
+            # Handle data URL format (data:audio/wav;base64,...)
            audio_data = audio_data.split(',')[1]
 
         # Decode base64 audio data
         try:
             binary_data = base64.b64decode(audio_data)
             print(f"Decoded base64 data: {len(binary_data)} bytes")
+
+            # Check if we have enough data for a valid WAV
+            if len(binary_data) < 44:  # WAV header is 44 bytes
+                print("Data too small to be a valid WAV file")
+                return torch.zeros(generator.sample_rate // 2)
         except Exception as e:
             print(f"Base64 decoding error: {str(e)}")
             return torch.zeros(generator.sample_rate // 2)
 
-        # Debug: save the raw binary data to examine with external tools
+        # Save for debugging
         debug_path = os.path.join(base_dir, "debug_incoming.wav")
         with open(debug_path, 'wb') as f:
             f.write(binary_data)
-        print(f"Saved debug file to {debug_path}")
+        print(f"Saved debug file: {debug_path}")
 
-        # Load audio from binary data
+        # Approach 1: Load directly with torchaudio
         try:
             with BytesIO(binary_data) as temp_file:
+                temp_file.seek(0)  # Ensure we're at the start of the buffer
                 audio_tensor, sample_rate = torchaudio.load(temp_file, format="wav")
-                print(f"Loaded audio: shape={audio_tensor.shape}, sample_rate={sample_rate}Hz")
+                print(f"Direct loading success: shape={audio_tensor.shape}, rate={sample_rate}Hz")
 
                 # Check if audio is valid
                 if audio_tensor.numel() == 0 or torch.isnan(audio_tensor).any():
-                    print("Warning: Empty or invalid audio data detected")
-                    return torch.zeros(generator.sample_rate // 2)
+                    raise ValueError("Empty or invalid audio tensor detected")
         except Exception as e:
-            print(f"Audio loading error: {str(e)}")
-            # Try saving to a temporary file instead of loading from BytesIO
+            print(f"Direct loading failed: {str(e)}")
+
+            # Approach 2: Try to fix/normalize the WAV data
            try:
-                temp_path = os.path.join(base_dir, "temp_incoming.wav")
+                # Sometimes WAV headers can be malformed, attempt to fix
+                temp_path = os.path.join(base_dir, "temp_fixing.wav")
                with open(temp_path, 'wb') as f:
                    f.write(binary_data)
-                print(f"Trying to load from file: {temp_path}")
+
+                # Use a simpler numpy approach as backup
+                import numpy as np
+                import wave
+
+                try:
+                    with wave.open(temp_path, 'rb') as wf:
+                        n_channels = wf.getnchannels()
+                        sample_width = wf.getsampwidth()
+                        sample_rate = wf.getframerate()
+                        n_frames = wf.getnframes()
+
+                        # Read the frames
+                        frames = wf.readframes(n_frames)
+                        print(f"Wave reading: channels={n_channels}, rate={sample_rate}Hz, frames={n_frames}")
+
+                    # Convert to numpy and then to torch
+                    if sample_width == 2:  # 16-bit audio
+                        data = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0
+                    elif sample_width == 1:  # 8-bit audio
+                        data = np.frombuffer(frames, dtype=np.uint8).astype(np.float32) / 128.0 - 1.0
+                    else:
+                        raise ValueError(f"Unsupported sample width: {sample_width}")
+
+                    # Convert to mono if needed
+                    if n_channels > 1:
+                        data = data.reshape(-1, n_channels)
+                        data = data.mean(axis=1)
+
+                    # Convert to torch tensor
+                    audio_tensor = torch.from_numpy(data)
+                    print(f"Successfully converted with numpy: shape={audio_tensor.shape}")
+                except Exception as wave_error:
+                    print(f"Wave processing failed: {str(wave_error)}")
+                    # Try with torchaudio as last resort
                    audio_tensor, sample_rate = torchaudio.load(temp_path, format="wav")
-                print(f"Loaded from file: shape={audio_tensor.shape}, sample_rate={sample_rate}Hz")
+
+                # Clean up
+                if os.path.exists(temp_path):
                    os.remove(temp_path)
            except Exception as e2:
-                print(f"Secondary audio loading error: {str(e2)}")
+                print(f"All WAV loading methods failed: {str(e2)}")
+                print("Returning silence as fallback")
                return torch.zeros(generator.sample_rate // 2)
 
+        # Ensure audio is the right shape (mono)
+        if len(audio_tensor.shape) > 1 and audio_tensor.shape[0] > 1:
+            audio_tensor = torch.mean(audio_tensor, dim=0)
+
+        # Ensure we have a 1D tensor
+        audio_tensor = audio_tensor.squeeze()
+
         # Resample if needed
         if sample_rate != generator.sample_rate:
             try:
                 print(f"Resampling from {sample_rate}Hz to {generator.sample_rate}Hz")
-                audio_tensor = torchaudio.functional.resample(
-                    audio_tensor.squeeze(0),
+                resampler = torchaudio.transforms.Resample(
                     orig_freq=sample_rate,
                     new_freq=generator.sample_rate
                 )
-                print(f"Resampled audio shape: {audio_tensor.shape}")
+                audio_tensor = resampler(audio_tensor)
             except Exception as e:
                 print(f"Resampling error: {str(e)}")
-                return torch.zeros(generator.sample_rate // 2)
-        else:
-            audio_tensor = audio_tensor.squeeze(0)
+                # If resampling fails, just return the original audio
+                # The model can often handle different sample rates
 
-        print(f"Final audio tensor shape: {audio_tensor.shape}")
+        # Normalize audio to avoid issues
+        if torch.abs(audio_tensor).max() > 0:
+            audio_tensor = audio_tensor / torch.abs(audio_tensor).max()
+
+        print(f"Final audio tensor: shape={audio_tensor.shape}, min={audio_tensor.min().item():.4f}, max={audio_tensor.max().item():.4f}")
         return audio_tensor
     except Exception as e:
-        print(f"Error decoding audio: {str(e)}")
+        print(f"Unhandled error in decode_audio_data: {str(e)}")
         # Return a small silent audio segment as fallback
         return torch.zeros(generator.sample_rate // 2)  # 0.5 seconds of silence
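A round-trip usage sketch for decode_audio_data (illustrative, not part of the commit): it synthesizes a short sine wave, encodes it the way the browser client does (WAV bytes in a base64 data URL), and feeds it back through the decoder. It assumes the module-level generator and base_dir defined earlier, and a torchaudio version that can save to a file-like object.

    import base64
    from io import BytesIO

    import torch
    import torchaudio

    def _make_test_payload(freq_hz=440.0, seconds=0.5, sample_rate=24000):
        """Build a base64 data-URL WAV payload like the browser client sends."""
        t = torch.arange(int(seconds * sample_rate)) / sample_rate
        wave_tensor = (0.5 * torch.sin(2 * torch.pi * freq_hz * t)).unsqueeze(0)
        buf = BytesIO()
        torchaudio.save(buf, wave_tensor, sample_rate, format="wav")
        b64 = base64.b64encode(buf.getvalue()).decode("ascii")
        return f"data:audio/wav;base64,{b64}"

    payload = _make_test_payload()
    decoded = decode_audio_data(payload)
    print(decoded.shape, decoded.abs().max())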
@@ -143,6 +232,8 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str:
         temp_path = os.path.join(base_dir, "temp_audio.wav")
         torchaudio.save(temp_path, audio_tensor.unsqueeze(0).cpu(), generator.sample_rate)
 
+        print(f"Transcribing audio file: {temp_path} (size: {os.path.getsize(temp_path)} bytes)")
+
         # Load and transcribe the audio
         audio = whisperx.load_audio(temp_path)
         result = asr_model.transcribe(audio, batch_size=16)
@@ -155,11 +246,15 @@ def transcribe_audio(audio_tensor: torch.Tensor) -> str:
         if result["segments"] and len(result["segments"]) > 0:
             # Combine all segments
             transcription = " ".join([segment["text"] for segment in result["segments"]])
+            print(f"Transcription successful: '{transcription.strip()}'")
             return transcription.strip()
         else:
+            print("Transcription returned no segments")
             return ""
     except Exception as e:
         print(f"Error in transcription: {str(e)}")
+        import traceback
+        traceback.print_exc()
         if os.path.exists("temp_audio.wav"):
             os.remove("temp_audio.wav")
         return ""
@@ -385,24 +480,45 @@ def handle_stream_audio(data):
             # Log the transcription
             print(f"[{client_id}] Transcribed text: '{transcribed_text}'")
 
-            # Add to conversation context
+            # Handle the transcription result
             if transcribed_text:
+                # Add user message to context
                 user_segment = Segment(text=transcribed_text, speaker=speaker_id, audio=full_audio)
                 client['context_segments'].append(user_segment)
 
-                # Generate a contextual response
-                response_text = generate_response(transcribed_text, client['context_segments'])
-
                 # Send the transcribed text to client
                 emit('transcription', {
                     'type': 'transcription',
                     'text': transcribed_text
                 })
 
+                # Generate a contextual response
+                response_text = generate_response(transcribed_text, client['context_segments'])
+                print(f"[{client_id}] Generating audio response: '{response_text}'")
+
+                # Let the client know we're processing
+                emit('processing_status', {
+                    'type': 'processing_status',
+                    'status': 'generating_audio',
+                    'message': 'Generating audio response...'
+                })
+
                 # Generate audio for the response
+                try:
+                    # Use a different speaker than the user
+                    ai_speaker_id = 1 if speaker_id == 0 else 0
+
+                    # Start audio generation with streaming (chunk by chunk)
+                    audio_chunks = []
+
+                    # This version tries to stream the audio generation in smaller chunks
+                    # Note: CSM model doesn't natively support incremental generation,
+                    # so we're simulating it here for a more responsive UI experience
+
+                    # Generate the full response
                    audio_tensor = generator.generate(
                        text=response_text,
-                        speaker=1 if speaker_id == 0 else 0, # Use opposite speaker
+                        speaker=ai_speaker_id,
                        context=client['context_segments'],
                        max_audio_length_ms=10_000,
                    )
@@ -410,7 +526,7 @@ def handle_stream_audio(data):
                     # Add response to context
                     ai_segment = Segment(
                         text=response_text,
-                        speaker=1 if speaker_id == 0 else 0,
+                        speaker=ai_speaker_id,
                         audio=audio_tensor
                     )
                     client['context_segments'].append(ai_segment)
@@ -422,6 +538,15 @@ def handle_stream_audio(data):
                         'text': response_text,
                         'audio': audio_base64
                     })
 
+                    print(f"[{client_id}] Audio response sent: {len(audio_base64)} bytes")
+
+                except Exception as gen_error:
+                    print(f"Error generating audio response: {str(gen_error)}")
+                    emit('error', {
+                        'type': 'error',
+                        'message': "Sorry, there was an error generating the audio response."
+                    })
             else:
                 # If transcription failed, send a generic response
                 emit('error', {
|
|||||||
|
|
||||||
# If buffer gets too large without silence, process it anyway
|
# If buffer gets too large without silence, process it anyway
|
||||||
elif len(client['streaming_buffer']) >= 30: # ~6 seconds of audio at 5 chunks/sec
|
elif len(client['streaming_buffer']) >= 30: # ~6 seconds of audio at 5 chunks/sec
|
||||||
|
print(f"[{client_id}] Processing long audio segment without silence")
|
||||||
full_audio = torch.cat(client['streaming_buffer'], dim=0)
|
full_audio = torch.cat(client['streaming_buffer'], dim=0)
|
||||||
|
|
||||||
# Process with WhisperX speech-to-text
|
# Process with WhisperX speech-to-text
|
||||||
@@ -453,7 +579,9 @@ def handle_stream_audio(data):
                 'text': transcribed_text + " (processing continued speech...)"
             })
 
-            client['streaming_buffer'] = []
+            # Keep half of the buffer for context (sliding window approach)
+            half_point = len(client['streaming_buffer']) // 2
+            client['streaming_buffer'] = client['streaming_buffer'][half_point:]
 
     except Exception as e:
         import traceback
|
|||||||
'status': 'stopped'
|
'status': 'stopped'
|
||||||
})
|
})
|
||||||
|
|
||||||
|
def stream_audio_to_client(client_id, audio_tensor, text, speaker_id, chunk_size_ms=500):
|
||||||
|
"""Stream audio to client in chunks to simulate real-time generation"""
|
||||||
|
try:
|
||||||
|
if client_id not in active_clients:
|
||||||
|
print(f"Client {client_id} not found for streaming")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Calculate chunk size in samples
|
||||||
|
chunk_size = int(generator.sample_rate * chunk_size_ms / 1000)
|
||||||
|
total_chunks = math.ceil(audio_tensor.size(0) / chunk_size)
|
||||||
|
|
||||||
|
print(f"Streaming audio in {total_chunks} chunks of {chunk_size_ms}ms each")
|
||||||
|
|
||||||
|
# Send initial response with text but no audio yet
|
||||||
|
socketio.emit('audio_response_start', {
|
||||||
|
'type': 'audio_response_start',
|
||||||
|
'text': text,
|
||||||
|
'total_chunks': total_chunks
|
||||||
|
}, room=client_id)
|
||||||
|
|
||||||
|
# Stream each chunk
|
||||||
|
for i in range(total_chunks):
|
||||||
|
start_idx = i * chunk_size
|
||||||
|
end_idx = min(start_idx + chunk_size, audio_tensor.size(0))
|
||||||
|
|
||||||
|
# Extract chunk
|
||||||
|
chunk = audio_tensor[start_idx:end_idx]
|
||||||
|
|
||||||
|
# Encode chunk
|
||||||
|
chunk_base64 = encode_audio_data(chunk)
|
||||||
|
|
||||||
|
# Send chunk
|
||||||
|
socketio.emit('audio_response_chunk', {
|
||||||
|
'type': 'audio_response_chunk',
|
||||||
|
'chunk_index': i,
|
||||||
|
'total_chunks': total_chunks,
|
||||||
|
'audio': chunk_base64,
|
||||||
|
'is_last': i == total_chunks - 1
|
||||||
|
}, room=client_id)
|
||||||
|
|
||||||
|
# Brief pause between chunks to simulate streaming
|
||||||
|
time.sleep(0.1)
|
||||||
|
|
||||||
|
# Send completion message
|
||||||
|
socketio.emit('audio_response_complete', {
|
||||||
|
'type': 'audio_response_complete',
|
||||||
|
'text': text
|
||||||
|
}, room=client_id)
|
||||||
|
|
||||||
|
print(f"Audio streaming complete: {total_chunks} chunks sent")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error streaming audio to client: {str(e)}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
print(f"\n{'='*60}")
|
print(f"\n{'='*60}")
|
||||||
print(f"🔊 Sesame AI Voice Chat Server (Flask Implementation)")
|
print(f"🔊 Sesame AI Voice Chat Server (Flask Implementation)")
|
||||||
|
|||||||
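Because stream_audio_to_client sleeps between chunks, calling it directly from a Socket.IO event handler would block that handler. One way to use it (a sketch, not part of the commit; it assumes the socketio object created earlier and that math and time are imported at module level) is to hand it off to a background task:

    # Inside the handler, after generator.generate(...) has produced audio_tensor:
    socketio.start_background_task(
        stream_audio_to_client,
        client_id,
        audio_tensor,
        response_text,
        speaker_id,
    )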
@@ -466,37 +466,27 @@ function sendAudioChunk(audioData, speaker) {
         return;
     }
 
-    console.log(`Creating WAV from audio data: length=${audioData.length}`);
+    console.log(`Preparing audio chunk: length=${audioData.length}, speaker=${speaker}`);
 
     // Check for NaN or invalid values
-    let hasNaN = false;
-    let min = Infinity;
-    let max = -Infinity;
-    let sum = 0;
+    let hasInvalidValues = false;
 
     for (let i = 0; i < audioData.length; i++) {
         if (isNaN(audioData[i]) || !isFinite(audioData[i])) {
-            hasNaN = true;
+            hasInvalidValues = true;
             console.warn(`Invalid audio value at index ${i}: ${audioData[i]}`);
             break;
         }
-        min = Math.min(min, audioData[i]);
-        max = Math.max(max, audioData[i]);
-        sum += audioData[i];
     }
 
-    if (hasNaN) {
-        console.warn('Audio data contains NaN or Infinity values. Creating silent audio instead.');
+    if (hasInvalidValues) {
+        console.warn('Audio data contains invalid values. Creating silent audio.');
         audioData = new Float32Array(audioData.length).fill(0);
-    } else {
-        const avg = sum / audioData.length;
-        console.log(`Audio stats: min=${min.toFixed(4)}, max=${max.toFixed(4)}, avg=${avg.toFixed(4)}`);
     }
 
     try {
-        // Create WAV blob with proper format
+        // Create WAV blob
         const wavData = createWavBlob(audioData, 24000);
-        console.log(`WAV blob created: size=${wavData.size} bytes, type=${wavData.type}`);
+        console.log(`WAV blob created: ${wavData.size} bytes`);
 
         const reader = new FileReader();
@@ -504,28 +494,21 @@ function sendAudioChunk(audioData, speaker) {
             try {
                 // Get base64 data
                 const base64data = reader.result;
-                console.log(`Base64 data created: length=${base64data.length}`);
+                console.log(`Base64 data created: ${base64data.length} bytes`);
 
-                // Validate the base64 data before sending
-                if (!base64data || base64data.length < 100) {
-                    console.warn('Generated base64 data is too small or invalid');
-                    return;
-                }
-
-                // Send the audio chunk to the server
-                console.log('Sending audio data to server...');
+                // Send to server
                 state.socket.emit('stream_audio', {
                     audio: base64data,
                     speaker: speaker
                 });
-                console.log('Audio data sent successfully');
+                console.log('Audio chunk sent to server');
             } catch (err) {
                 console.error('Error preparing audio data:', err);
             }
         };
 
-        reader.onerror = function(err) {
-            console.error('Error reading audio data:', err);
+        reader.onerror = function() {
+            console.error('Error reading audio data as base64');
         };
 
         reader.readAsDataURL(wavData);
@@ -534,19 +517,20 @@ function sendAudioChunk(audioData, speaker) {
     }
 }
 
-// Create WAV blob from audio data with validation
+// Create WAV blob from audio data with improved error handling
 function createWavBlob(audioData, sampleRate) {
-    // Check if audio data is valid
+    // Validate input
     if (!audioData || audioData.length === 0) {
-        console.warn('Empty audio data received');
-        // Return a tiny silent audio snippet instead
-        audioData = new Float32Array(100).fill(0);
+        console.warn('Empty audio data provided to createWavBlob');
+        audioData = new Float32Array(1024).fill(0); // Create 1024 samples of silence
     }
 
     // Function to convert Float32Array to Int16Array for WAV format
     function floatTo16BitPCM(output, offset, input) {
         for (let i = 0; i < input.length; i++, offset += 2) {
+            // Ensure values are in -1 to 1 range
             const s = Math.max(-1, Math.min(1, input[i]));
+            // Convert to 16-bit PCM
             output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
         }
     }
@@ -558,40 +542,80 @@ function createWavBlob(audioData, sampleRate) {
         }
     }
 
-    // Create WAV file with header
-    function encodeWAV(samples) {
-        const buffer = new ArrayBuffer(44 + samples.length * 2);
+    try {
+        // Create WAV file with header - careful with buffer sizes
+        const buffer = new ArrayBuffer(44 + audioData.length * 2);
         const view = new DataView(buffer);
 
-        // RIFF chunk descriptor
+        // RIFF identifier
         writeString(view, 0, 'RIFF');
-        view.setUint32(4, 36 + samples.length * 2, true);
+
+        // File length (will be filled later)
+        view.setUint32(4, 36 + audioData.length * 2, true);
+
+        // WAVE identifier
         writeString(view, 8, 'WAVE');
 
-        // fmt sub-chunk
+        // fmt chunk identifier
         writeString(view, 12, 'fmt ');
+
+        // fmt chunk length
         view.setUint32(16, 16, true);
-        view.setUint16(20, 1, true); // PCM format
-        view.setUint16(22, 1, true); // Mono channel
+
+        // Sample format (1 is PCM)
+        view.setUint16(20, 1, true);
+
+        // Mono channel
+        view.setUint16(22, 1, true);
+
+        // Sample rate
         view.setUint32(24, sampleRate, true);
-        view.setUint32(28, sampleRate * 2, true); // Byte rate
-        view.setUint16(32, 2, true); // Block align
-        view.setUint16(34, 16, true); // Bits per sample
 
-        // data sub-chunk
+        // Byte rate (sample rate * block align)
+        view.setUint32(28, sampleRate * 2, true);
+
+        // Block align (channels * bytes per sample)
+        view.setUint16(32, 2, true);
+
+        // Bits per sample
+        view.setUint16(34, 16, true);
+
+        // data chunk identifier
         writeString(view, 36, 'data');
-        view.setUint32(40, samples.length * 2, true);
-        floatTo16BitPCM(view, 44, samples);
 
-        return buffer;
+        // data chunk length
+        view.setUint32(40, audioData.length * 2, true);
+
+        // Write the PCM samples
+        floatTo16BitPCM(view, 44, audioData);
+
+        // Create and return blob
+        return new Blob([view], { type: 'audio/wav' });
+    } catch (err) {
+        console.error('Error in createWavBlob:', err);
+
+        // Create a minimal valid WAV file with silence as fallback
+        const fallbackSamples = new Float32Array(1024).fill(0);
+        const fallbackBuffer = new ArrayBuffer(44 + fallbackSamples.length * 2);
+        const fallbackView = new DataView(fallbackBuffer);
+
+        writeString(fallbackView, 0, 'RIFF');
+        fallbackView.setUint32(4, 36 + fallbackSamples.length * 2, true);
+        writeString(fallbackView, 8, 'WAVE');
+        writeString(fallbackView, 12, 'fmt ');
+        fallbackView.setUint32(16, 16, true);
+        fallbackView.setUint16(20, 1, true);
+        fallbackView.setUint16(22, 1, true);
+        fallbackView.setUint32(24, sampleRate, true);
+        fallbackView.setUint32(28, sampleRate * 2, true);
+        fallbackView.setUint16(32, 2, true);
+        fallbackView.setUint16(34, 16, true);
+        writeString(fallbackView, 36, 'data');
+        fallbackView.setUint32(40, fallbackSamples.length * 2, true);
+        floatTo16BitPCM(fallbackView, 44, fallbackSamples);
+
+        return new Blob([fallbackView], { type: 'audio/wav' });
     }
-
-    // Convert audio data to TypedArray if it's a regular Array
-    const samples = Array.isArray(audioData) ? new Float32Array(audioData) : audioData;
-
-    // Create WAV blob
-    const wavBuffer = encodeWAV(samples);
-    return new Blob([wavBuffer], { type: 'audio/wav' });
 }
 
 // Draw audio visualizer