idler-wheel
2025-03-30 07:57:47 -04:00


@@ -13,6 +13,11 @@ import requests
 import huggingface_hub
 from generator import load_csm_1b, Segment
 
+# Force CPU mode regardless of what's available
+# This bypasses the CUDA/cuDNN library requirements
+os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Hide all CUDA devices
+torch.backends.cudnn.enabled = False  # Disable cuDNN
+
 # Configure environment with longer timeouts
 os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600"  # 10 minutes timeout for downloads
 requests.adapters.DEFAULT_TIMEOUT = 60  # Increase default requests timeout
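
PyTorch initializes CUDA lazily, so setting CUDA_VISIBLE_DEVICES after `import torch` but before the first torch.cuda call is still enough to hide every GPU. A minimal sanity check of the two settings above (a standalone sketch, not part of the commit):

import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # hide all CUDA devices, as in the commit

import torch
torch.backends.cudnn.enabled = False     # disable cuDNN, as in the commit

print(torch.cuda.device_count())     # expected: 0
print(torch.cuda.is_available())     # expected: False
print(torch.backends.cudnn.enabled)  # expected: False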
@@ -24,30 +29,10 @@ app = Flask(__name__)
 app.config['SECRET_KEY'] = 'your-secret-key'
 socketio = SocketIO(app, cors_allowed_origins="*")
 
-# Check for CUDA availability and handle potential CUDA/cuDNN issues
-try:
-    cuda_available = torch.cuda.is_available()
-    # Try to initialize CUDA to check if libraries are properly loaded
-    if cuda_available:
-        _ = torch.zeros(1).cuda()
-        device = "cuda"
-        whisper_compute_type = "float16"
-        print("CUDA is available and initialized successfully")
-    elif torch.backends.mps.is_available():
-        device = "mps"
-        whisper_compute_type = "float32"
-        print("MPS is available (Apple Silicon)")
-    else:
-        device = "cpu"
-        whisper_compute_type = "int8"
-        print("Using CPU (CUDA/MPS not available)")
-except Exception as e:
-    print(f"Error initializing CUDA: {e}")
-    print("Falling back to CPU")
-    device = "cpu"
-    whisper_compute_type = "int8"
-
-print(f"Using device: {device}")
+# Force CPU regardless of what hardware is available
+device = "cuda" if torch.cuda.is_available() else "cpu"
+whisper_compute_type = "int8"
+print(f"Forcing CPU mode for all models")
 
 # Initialize models with proper error handling
 whisper_model = None
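
With every CUDA device hidden by the earlier hunk, torch.cuda.is_available() returns False here, so the ternary above always resolves to "cpu" and the print statement is accurate. An equivalent, more explicit form (a sketch, not what the commit does):

import torch

assert not torch.cuda.is_available()  # guaranteed once CUDA_VISIBLE_DEVICES is ""
device = "cpu"                        # what the ternary reduces to
whisper_compute_type = "int8"         # practical compute type for CPU inference
print(f"Forcing CPU mode for all models (device={device})")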
@@ -60,12 +45,10 @@ def load_models():
 
     # Initialize Faster-Whisper for transcription
     try:
-        print("Loading Whisper model...")
+        print("Loading Whisper model on CPU...")
         # Import here to avoid immediate import errors if package is missing
         from faster_whisper import WhisperModel
-        # Force CPU for Whisper if we had CUDA issues
-        whisper_device = device if device != "cpu" else "cpu"
-        whisper_model = WhisperModel("base", device=whisper_device, compute_type=whisper_compute_type, download_root="./models/whisper")
+        whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8", download_root="./models/whisper")
         print("Whisper model loaded successfully")
     except Exception as e:
         print(f"Error loading Whisper model: {e}")
@@ -73,10 +56,8 @@ def load_models():
 
     # Initialize CSM model for audio generation
    try:
-        print("Loading CSM model...")
-        # Force CPU for CSM if we had CUDA issues
-        csm_device = device if device != "cpu" else "cpu"
-        csm_generator = load_csm_1b(device=csm_device)
+        print("Loading CSM model on CPU...")
+        csm_generator = load_csm_1b(device="cpu")
         print("CSM model loaded successfully")
     except Exception as e:
         print(f"Error loading CSM model: {e}")
@@ -84,15 +65,13 @@ def load_models():
 
     # Initialize Llama 3.2 model for response generation
     try:
-        print("Loading Llama 3.2 model...")
+        print("Loading Llama 3.2 model on CPU...")
         llm_model_id = "meta-llama/Llama-3.2-1B"  # Choose appropriate size based on resources
         llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id, cache_dir="./models/llama")
-        # Force CPU for LLM if we had CUDA issues
-        llm_device = device if device != "cpu" else "cpu"
         llm_model = AutoModelForCausalLM.from_pretrained(
             llm_model_id,
-            torch_dtype=torch.bfloat16 if llm_device != "cpu" else torch.float32,
-            device_map=llm_device,
+            torch_dtype=torch.float32,  # Use float32 on CPU
+            device_map="cpu",
             cache_dir="./models/llama",
             low_cpu_mem_usage=True
         )
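
A short sketch of CPU inference with the float32 model loaded above, using the standard transformers generate API (the prompt is just a placeholder):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

llm_model_id = "meta-llama/Llama-3.2-1B"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id, cache_dir="./models/llama")
llm_model = AutoModelForCausalLM.from_pretrained(
    llm_model_id,
    torch_dtype=torch.float32,   # float32 on CPU, as in the commit
    device_map="cpu",
    cache_dir="./models/llama",
    low_cpu_mem_usage=True,
)

inputs = llm_tokenizer("Hello, how are you?", return_tensors="pt")
with torch.no_grad():
    output_ids = llm_model.generate(**inputs, max_new_tokens=64)
print(llm_tokenizer.decode(output_ids[0], skip_special_tokens=True))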
@@ -379,7 +358,7 @@ if __name__ == '__main__':
         os.rename('index.html', 'templates/index.html')
 
     # Load models asynchronously before starting the server
-    print("Starting model loading...")
+    print("Starting CPU-only model loading...")
     # In a production environment, you could load models in a separate thread
     load_models()
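
The comment above already hints at loading models in a background thread for production. A minimal sketch of that idea, assuming the server is started with socketio.run (host and port here are placeholders) and that request handlers check whether the model globals are still None while loading:

import threading

# start loading in the background so the server becomes reachable immediately
loader = threading.Thread(target=load_models, daemon=True)
loader.start()

socketio.run(app, host="0.0.0.0", port=5000)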