UI and Audio Processing Update

2026-01-07 04:09:35 +00:00
parent 864ccabc6e
commit 585830103b
18 changed files with 2069 additions and 481 deletions
--- a/server/processor.py
+++ b/server/processor.py
@@ -1,4 +1,5 @@
 import os
+import time
 import struct
 import math
 import numpy as np
@@ -36,49 +37,90 @@ class AudioImageProcessor:
        return struct.pack(HEADER_FMT, signature, file_size, len(ext_bytes)) + ext_bytes

    # --- Feature 1: Spectrogram Art ---
-    def generate_spectrogram(self, audio_path):
+    def generate_spectrogram(self, audio_path, min_pixels=0):
        """Generates a visual spectrogram from audio."""
+        try:
+            import torch
+            import torchaudio
+            has_torch = True
+        except ImportError:
+            has_torch = False
+
+        if has_torch and torch.cuda.is_available():
+            try:
+                # GPU Accelerated Path
+                device = "cuda"
+                waveform, sr = torchaudio.load(audio_path)
+                waveform = waveform.to(device)
+                
+                # Create transformation
+                # Roughly mimic librosa's defaults: n_fft=2048, hop_length=512
+                n_fft = 2048
+                win_length = n_fft
+                hop_length = 512
+                n_mels = 128
+                
+                mel_spectrogram = torchaudio.transforms.MelSpectrogram(
+                    sample_rate=sr,
+                    n_fft=n_fft,
+                    win_length=win_length,
+                    hop_length=hop_length,
+                    n_mels=n_mels,
+                    f_max=8000
+                ).to(device)
+                
+                S = mel_spectrogram(waveform)
+                S_dB = torchaudio.transforms.AmplitudeToDB()(S)
+                
+                # Back to CPU for plotting
+                S_dB = S_dB.cpu().numpy()[0] # Take first channel
+                # librosa.display.specshow (used for plotting) expects a NumPy array
+            except Exception as e:
+                # Fall back to the CPU/librosa path if any error occurs
+                print(f"GPU processing failed, falling back to CPU: {e}")
+                return self._generate_spectrogram_cpu(audio_path, min_pixels)
+        else:
+            return self._generate_spectrogram_cpu(audio_path, min_pixels)
+
+        # Plotting (Common)
+        return self._plot_spectrogram(S_dB, sr, min_pixels)
+
+    def _generate_spectrogram_cpu(self, audio_path, min_pixels=0):
        y, sr = librosa.load(audio_path)
-        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=256, fmax=8000)
+        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
        S_dB = librosa.power_to_db(S, ref=np.max)
-
-        plt.figure(figsize=(12, 6))
-        plt.axis('off')
-        plt.margins(0, 0)
-        plt.gca().xaxis.set_major_locator(plt.NullLocator())
-        plt.gca().yaxis.set_major_locator(plt.NullLocator())
+        return self._plot_spectrogram(S_dB, sr, min_pixels)
        
-        # 'magma' is a nice default, but you could parameterize this
-        librosa.display.specshow(S_dB, sr=sr, fmax=8000, cmap='magma')
+    def _plot_spectrogram(self, S_dB, sr, min_pixels=0):
+        # Calculate DPI dynamically to ensure we have enough pixels for steganography
+        dpi = 300
+        if min_pixels > 0:
+            # Figure is 12x6 inches. Area = 72 sq inches.
+            # Total Pixels = 72 * dpi^2
+            required_dpi = math.ceil((min_pixels / 72) ** 0.5)
+            # Add a small buffer
+            dpi = max(dpi, int(required_dpi * 1.05))

-        output_path = os.path.join(self.upload_folder, f"art_{os.path.basename(audio_path)}.png")
-        plt.savefig(output_path, bbox_inches='tight', pad_inches=0, dpi=300)
+        # Use exact dimensions without margins
+        width_in = 12
+        height_in = 6
+        fig = plt.figure(figsize=(width_in, height_in))
+        
+        # Add axes covering the entire figure [left, bottom, width, height]
+        ax = plt.axes([0, 0, 1, 1], frameon=False)
+        ax.set_axis_off()
+        
+        # 'magma' is a nice default
+        librosa.display.specshow(S_dB, sr=sr, fmax=8000, cmap='magma', ax=ax)
+
+        output_path = os.path.join(self.upload_folder, f"art_{int(time.time())}.png")
+        
+        # Save at the computed DPI; bbox_inches='tight' is deliberately omitted because it shrinks the image
+        plt.savefig(output_path, dpi=dpi)
        plt.close()
        return output_path

-    # --- Feature 2: Format Shift (Raw Data to Image) ---
-    def encode_shift(self, file_path):
-        file_data = self._get_bytes(file_path)
-        file_size = len(file_data)
-        
-        header = self._create_header(SIG_SHIFT, file_size, file_path)
-        payload = header + file_data
-        
-        # Calculate size
-        pixels = math.ceil(len(payload) / 3)
-        side = math.ceil(math.sqrt(pixels))
-        padding = (side * side * 3) - len(payload)
-        
-        # Pad and Reshape
-        arr = np.frombuffer(payload, dtype=np.uint8)
-        if padding > 0:
-            arr = np.pad(arr, (0, padding), 'constant')
-            
-        img = Image.fromarray(arr.reshape((side, side, 3)), 'RGB')
-        
-        output_path = os.path.join(self.upload_folder, f"shift_{os.path.basename(file_path)}.png")
-        img.save(output_path, "PNG")
-        return output_path
+

    # --- Feature 3: Steganography (Embed in Host) ---
    def encode_stego(self, data_path, host_path):