"""Flask-SocketIO voice assistant server: transcribes incoming audio, generates a text reply, and streams synthesized speech back to the client."""
import base64
import logging
import os
import tempfile
from io import BytesIO

import eventlet
import torch
import torchaudio
import whisper
from flask import Flask, render_template, request, jsonify
from flask_socketio import SocketIO, emit
from transformers import AutoTokenizer, AutoModelForCausalLM

from src.audio.streaming import AudioStreamer
from src.llm.generator import generate_llm_response
from src.models.conversation import Segment
from src.services.transcription_service import TranscriptionService
from src.services.tts_service import TextToSpeechService, load_csm_1b
# Configure logging
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
logger = logging.getLogger(__name__)
|
|
|
|
app = Flask(__name__, static_folder='static', template_folder='templates')
|
|
app.config['SECRET_KEY'] = os.getenv('SECRET_KEY', 'your-secret-key')
|
|
socketio = SocketIO(app)
|
|
|
|
# Initialize services
|
|
transcription_service = TranscriptionService()
|
|
tts_service = TextToSpeechService()
|
|
audio_streamer = AudioStreamer()
|
|
|
|
@socketio.on('audio_input')
|
|
def handle_audio_input(data):
|
|
audio_chunk = data['audio']
|
|
speaker_id = data['speaker']
|
|
|
|
# Process audio and convert to text
|
|
text = transcription_service.transcribe(audio_chunk)
|
|
logging.info(f"Transcribed text: {text}")
|
|
|
|
# Generate response using Llama 3.2
|
|
response_text = tts_service.generate_response(text, speaker_id)
|
|
logging.info(f"Generated response: {response_text}")
|
|
|
|
# Convert response text to audio
|
|
audio_response = tts_service.text_to_speech(response_text, speaker_id)
|
|
|
|
# Stream audio response back to client
|
|
socketio.emit('audio_response', {'audio': audio_response})
|
|
|
|
if __name__ == '__main__':
|
|
socketio.run(app, host='0.0.0.0', port=5000) |