Complete Refactor

This commit is contained in:
2025-03-30 01:28:07 -04:00
parent 158afc78c7
commit 46be33b10a
28 changed files with 723 additions and 3081 deletions

View File

@@ -0,0 +1,28 @@
import numpy as np
import torch
import torchaudio
from scipy.io import wavfile
def load_audio(file_path):
    """Read a WAV file and return its sample rate and samples.

    Args:
        file_path: Location of the WAV file, passed straight to
            ``scipy.io.wavfile.read``.

    Returns:
        A ``(sample_rate, audio_data)`` pair exactly as produced by
        ``wavfile.read``.
    """
    rate_and_samples = wavfile.read(file_path)
    return rate_and_samples
def normalize_audio(audio_data):
    """Scale audio so that its largest absolute sample becomes 1.0.

    Args:
        audio_data: Array (or array-like) of samples of any numeric dtype.

    Returns:
        A new ``float32`` array with the same shape as the input. Silent
        (all-zero) and empty inputs are returned unscaled. The input is
        never modified.
    """
    # astype always copies, so the in-place division below cannot touch
    # the caller's buffer.
    samples = np.asarray(audio_data).astype(np.float32)

    # np.max raises ValueError on a zero-size array; there is nothing to scale.
    if samples.size == 0:
        return samples

    peak = np.max(np.abs(samples))
    if peak > 0:
        samples /= peak
    return samples
def reduce_noise(audio_data, noise_factor=0.1):
    """Return the audio with scaled Gaussian noise added to every sample.

    NOTE(review): despite its name, this function *adds* noise drawn from a
    standard normal distribution; it performs no denoising. Confirm whether
    this is an intentional placeholder.

    Args:
        audio_data: Array (or array-like) of samples, of any shape.
        noise_factor: Multiplier applied to the unit-variance noise.

    Returns:
        A new array with the same shape as ``audio_data``.
    """
    samples = np.asarray(audio_data)
    # Draw noise with the full input shape. Using only len() produced a 1-D
    # vector that cannot broadcast against (frames, channels) audio.
    noise = np.random.standard_normal(samples.shape)
    return samples + noise_factor * noise
def save_audio(file_path, sample_rate, audio_data):
    """Write audio samples to ``file_path`` using ``torchaudio.save``.

    Args:
        file_path: Destination path handed to ``torchaudio.save``.
        sample_rate: Sample rate in Hz.
        audio_data: 1-D mono samples, or 2-D samples laid out as
            (frames, channels) as returned by ``scipy.io.wavfile.read``.

    Raises:
        ValueError: If ``audio_data`` is neither 1-D nor 2-D.
    """
    waveform = torch.tensor(np.asarray(audio_data))

    # torchaudio documents float32/int32/int16/uint8 inputs; float64 (which
    # NumPy arithmetic readily produces) is downcast to float32.
    if waveform.dtype == torch.float64:
        waveform = waveform.to(torch.float32)

    # torchaudio.save expects a 2-D (channels, frames) tensor.
    if waveform.ndim == 1:
        waveform = waveform.unsqueeze(0)
    elif waveform.ndim == 2:
        waveform = waveform.t().contiguous()
    else:
        raise ValueError(
            f"audio_data must be 1-D or 2-D, got {waveform.ndim} dimensions"
        )

    torchaudio.save(file_path, waveform, sample_rate)
def process_audio(file_path, output_path=None):
    """Load a WAV file, run it through the processing chain, and return it.

    The chain is: ``load_audio`` -> ``normalize_audio`` -> ``reduce_noise``,
    followed by an optional ``save_audio``.

    Args:
        file_path: Source WAV file, passed to ``load_audio``.
        output_path: Where to write the processed audio. When ``None`` the
            result is only returned and nothing is written, which allows
            single-argument calls that consume the return value.

    Returns:
        The processed samples as returned by ``reduce_noise``.
    """
    sample_rate, audio_data = load_audio(file_path)
    processed = reduce_noise(normalize_audio(audio_data))
    if output_path is not None:
        save_audio(output_path, sample_rate, processed)
    return processed

View File

@@ -0,0 +1,35 @@
from flask import Blueprint, request
from flask_socketio import SocketIO, emit
from src.audio.processor import process_audio
from src.services.transcription_service import TranscriptionService
from src.services.tts_service import TextToSpeechService
# Flask blueprint named 'streaming'; no routes are attached to it in the
# code visible here.
streaming_bp = Blueprint('streaming', __name__)
# Socket.IO server object created without a Flask app.
# NOTE(review): presumably bound to the app elsewhere (e.g. init_app) - confirm.
socketio = SocketIO()
# Service objects are instantiated once at import time and shared by the
# event handler below.
transcription_service = TranscriptionService()
tts_service = TextToSpeechService()
@socketio.on('audio_stream')
def handle_audio_stream(data):
    """Handle an 'audio_stream' event and reply with synthesized audio.

    Pipeline: process the incoming chunk, transcribe it, build a text
    reply, convert the reply to speech, and emit it to the client as an
    'audio_response' event.

    Args:
        data: Event payload; must provide the keys 'audio' and 'speaker'.

    Raises:
        KeyError: If either key is missing from ``data``.
    """
    chunk, speaker = data['audio'], data['speaker']

    # NOTE(review): confirm that process_audio accepts a raw chunk as its
    # only argument and returns the processed audio; its contract lives in
    # src.audio.processor.
    text = transcription_service.transcribe(process_audio(chunk))

    reply_text = generate_response(text, speaker)
    reply_audio = tts_service.convert_text_to_speech(reply_text, speaker)

    emit('audio_response', {'audio': reply_audio})
def generate_response(transcription, speaker_id):
    """Build the reply text for a transcribed utterance.

    This is a stub standing in for the real response-generation logic: it
    echoes the transcription behind a fixed prefix. ``speaker_id`` is
    accepted but not used.

    Args:
        transcription: Text produced by the transcription step.
        speaker_id: Identifier of the speaker (currently ignored).

    Returns:
        The reply string.
    """
    reply = f"Response to: {transcription}"
    return reply