Demo Fixes 13

2025-03-30 08:04:16 -04:00
parent 647a915953
commit a55b3f52a4
1 changed files with 59 additions and 18 deletions
@@ -13,11 +13,6 @@ import requests
 import huggingface_hub
 from generator import load_csm_1b, Segment
 # Force CPU mode regardless of what's available
 # This bypasses the CUDA/cuDNN library requirements
 os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Hide all CUDA devices
 torch.backends.cudnn.enabled = False  # Disable cuDNN
 # Configure environment with longer timeouts
 os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600"  # 10 minutes timeout for downloads
 requests.adapters.DEFAULT_TIMEOUT = 60  # Increase default requests timeout
@@ -29,10 +24,55 @@ app = Flask(__name__)
 app.config['SECRET_KEY'] = 'your-secret-key'
 socketio = SocketIO(app, cors_allowed_origins="*")
-# Force CPU regardless of what hardware is available
+# Explicitly check for CUDA and print more detailed info
-device = "cuda" if torch.cuda.is_available() else "cpu"
+print("\n=== CUDA Information ===")
-whisper_compute_type = "int8"
+if torch.cuda.is_available():
-print(f"Forcing CPU mode for all models")
+    print(f"CUDA is available")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
 else:
    print("CUDA is not available")
 # Check for cuDNN
 try:
    import ctypes
    ctypes.CDLL("libcudnn_ops_infer.so.8")
    print("cuDNN is available")
 except:
    print("cuDNN is not available (libcudnn_ops_infer.so.8 not found)")
 # Check for other compute platforms
 if torch.backends.mps.is_available():
    print("MPS (Apple Silicon) is available")
 else:
    print("MPS is not available")
 print("========================\n")
 # Check for CUDA availability and handle potential CUDA/cuDNN issues
 try:
    if torch.cuda.is_available():
        # Try to initialize CUDA to check if libraries are properly loaded
        _ = torch.zeros(1).cuda()
        device = "cuda"
        whisper_compute_type = "float16"
        print("🟢 CUDA is available and initialized successfully")
    elif torch.backends.mps.is_available():
        device = "mps"
        whisper_compute_type = "float32"
        print("🟢 MPS is available (Apple Silicon)")
    else:
        device = "cpu"
        whisper_compute_type = "int8"
        print("🟡 Using CPU (CUDA/MPS not available)")
 except Exception as e:
    print(f"🔴 Error initializing CUDA: {e}")
    print("🔴 Falling back to CPU")
    device = "cpu"
    whisper_compute_type = "int8"
 print(f"Using device: {device}")
 # Initialize models with proper error handling
 whisper_model = None
@@ -45,10 +85,10 @@ def load_models():
    # Initialize Faster-Whisper for transcription
    try:
-        print("Loading Whisper model on CPU...")
+        print("Loading Whisper model...")
        # Import here to avoid immediate import errors if package is missing
        from faster_whisper import WhisperModel
-        whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8", download_root="./models/whisper")
+        whisper_model = WhisperModel("base", device=device, compute_type=whisper_compute_type, download_root="./models/whisper")
        print("Whisper model loaded successfully")
    except Exception as e:
        print(f"Error loading Whisper model: {e}")
@@ -56,8 +96,8 @@ def load_models():
    # Initialize CSM model for audio generation
    try:
-        print("Loading CSM model on CPU...")
+        print("Loading CSM model...")
-        csm_generator = load_csm_1b(device="cpu")
+        csm_generator = load_csm_1b(device=device)
        print("CSM model loaded successfully")
    except Exception as e:
        print(f"Error loading CSM model: {e}")
@@ -65,13 +105,15 @@ def load_models():
    # Initialize Llama 3.2 model for response generation
    try:
-        print("Loading Llama 3.2 model on CPU...")
+        print("Loading Llama 3.2 model...")
        llm_model_id = "meta-llama/Llama-3.2-1B"  # Choose appropriate size based on resources
        llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id, cache_dir="./models/llama")
        # Use the right data type based on device
        dtype = torch.bfloat16 if device != "cpu" else torch.float32
        llm_model = AutoModelForCausalLM.from_pretrained(
            llm_model_id,
-            torch_dtype=torch.float32,  # Use float32 on CPU
+            torch_dtype=dtype,
-            device_map="cpu",
+            device_map=device,
            cache_dir="./models/llama",
            low_cpu_mem_usage=True
        )
@@ -358,8 +400,7 @@ if __name__ == '__main__':
        os.rename('index.html', 'templates/index.html')
    # Load models synchronously (blocking) before starting the server
-    print("Starting CPU-only model loading...")
+    print("Starting model loading...")
    # In a production environment, you could load models in a separate thread
    load_models()
    # Start the server