Spaces:

michon
/

mrrrme-emotion-ai

Sleeping

App Files Files Community

MichonGoddijn231849 committed on Jan 7

Commit

1af2a91

1 Parent(s): defeda7

fix downloading emotion2vec3

Browse files

Files changed (2) hide show

Dockerfile +13 -38
mrrrme/audio/voice_emotion.py +113 -164

Dockerfile CHANGED Viewed

@@ -1,5 +1,5 @@
 # Hugging Face Spaces - MrrrMe with Coqui XTTS v2 + MODULAR BACKEND
-# FIXED v2.2: emotion2vec downloads WITHOUT loading into RAM (prevents OOM)
 FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
 # Install system dependencies
@@ -36,11 +36,14 @@ RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 &
 WORKDIR /app
 # ============================================================
-# MODEL CACHING - Use persistent storage for large models
 # ============================================================
-# HuggingFace Spaces persistent storage: /data (survives rebuilds!)
 ENV MODELSCOPE_CACHE=/data/modelscope
 ENV MS_CACHE_HOME=/data/modelscope
 # Install PyTorch with CUDA 11.8
 RUN python3.11 -m pip install --no-cache-dir \
@@ -56,43 +59,15 @@ COPY requirements_docker.txt ./
 RUN python3.11 -m pip install --no-cache-dir -r requirements_docker.txt
 # ============================================================
-# PRE-DOWNLOAD MODELS DURING BUILD (for fast startup!)
 # ============================================================
-# PRE-DOWNLOAD XTTS V2 MODEL (Memory Safe)
-# Uses ModelManager to download without loading to RAM (Fixes Exit 137)
-RUN python3.11 -c "from TTS.utils.manage import ModelManager; print('⏳ Downloading XTTS v2 model...'); ModelManager().download_model('tts_models/multilingual/multi-dataset/xtts_v2'); print('✅ XTTS v2 downloaded.')"
 # ============================================================
-# FIXED v2.2: Download emotion2vec WITHOUT loading into memory
 # ============================================================
-# Problem: AutoModel() loads 1.8GB into RAM during build → OOM crash (exit 137)
-# Solution: Use ModelScope SDK to download files only, no model instantiation
-#
-# Strategy:
-# 1. Download model files to /data/modelscope (persistent storage)
-# 2. Model will be loaded at RUNTIME (when container has more memory)
-# 3. Subsequent rebuilds skip download (files already in /data)
-RUN python3.11 -c "\
-import os; \
-os.makedirs('/data/modelscope', exist_ok=True); \
-os.environ['MODELSCOPE_CACHE'] = '/data/modelscope'; \
-os.environ['MS_CACHE_HOME'] = '/data/modelscope'; \
-print('⏳ Downloading emotion2vec+ Large files (1.8GB)...'); \
-print('💡 Files only - model will load at runtime to avoid OOM'); \
-from modelscope.hub.snapshot_download import snapshot_download; \
-model_dir = snapshot_download('iic/emotion2vec_plus_large', cache_dir='/data/modelscope'); \
-print(f'✅ Downloaded to: {model_dir}'); \
-import glob; \
-files = glob.glob(model_dir + '/**/*', recursive=True); \
-print(f'📦 Downloaded {len(files)} files')"
-# Also create fallback cache for local dev
-RUN mkdir -p /home/user/.cache/modelscope && \
-    cp -r /data/modelscope/* /home/user/.cache/modelscope/ 2>/dev/null || true && \
-    chown -R 1000:1000 /home/user && \
-    echo "✅ Model cached in /data/modelscope (PERSISTENT)"
 # Install avatar dependencies
 RUN python3.11 -m pip install --no-cache-dir \
@@ -134,8 +109,8 @@ RUN mkdir -p /etc/nginx/certs && \
     -days 365 \
     -subj "/CN=mrrrme.hf.space"
-# ✅ Create startup script with PERSISTENT CACHE
-RUN printf '#!/bin/bash\nset -e\nexport HOME=/home/user\nmkdir -p /tmp\n\n# Agree to TOS\nexport COQUI_TOS_AGREED=1\n\n# ============================================================\n# emotion2vec cache - Use /data if available (persistent!)\n# ============================================================\nif [ -d "/data/modelscope" ] && [ -n "$(ls -A /data/modelscope 2>/dev/null)" ]; then\n    echo "✅ Using persistent storage: /data/modelscope"\n    export MODELSCOPE_CACHE=/data/modelscope\n    export MS_CACHE_HOME=/data/modelscope\n    du -sh /data/modelscope 2>/dev/null || echo "Calculating size..."\nelse\n    echo "⚠️ Persistent storage not available, using /home/user/.cache/modelscope"\n    export MODELSCOPE_CACHE=/home/user/.cache/modelscope\n    export MS_CACHE_HOME=/home/user/.cache/modelscope\nfi\n\necho "📍 Model cache: $MODELSCOPE_CACHE"\n\npkill -f "backend_new.py" 2>/dev/null || true\npkill -f "speak_server.py" 2>/dev/null || true\npkill -f "node server.js" 2>/dev/null || true\npkill -f "nginx" 2>/dev/null || true\n\nsleep 2\necho "🚀 Starting MrrrMe (XTTS v2 + emotion2vec + Modular Backend v2.2)..."\necho "📦 emotion2vec will load from cache (NO download)"\n\n# Start NEW modular backend\ncd /app && python3.11 mrrrme/backend_new.py &\n\n# Start avatar TTS\ncd /app/avatar && python3.11 speak_server.py &\n\n# Start Next.js frontend\ncd /app/frontend/.next/standalone && HOSTNAME=0.0.0.0 PORT=3001 node server.js &\n\nsleep 10\nnginx -g "daemon off;" &\necho "✅ All services running!"\nwait\n' > /app/start.sh && chmod +x /app/start.sh
 # Set ownership
 RUN chown -R 1000:1000 /app /data

 # Hugging Face Spaces - MrrrMe with Coqui XTTS v2 + MODULAR BACKEND
+# FIXED v2.3: SIMPLE & RELIABLE - Let emotion2vec download at runtime to /data
 FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
 # Install system dependencies
 WORKDIR /app
 # ============================================================
+# PERSISTENT STORAGE - emotion2vec will download here at runtime
 # ============================================================
 ENV MODELSCOPE_CACHE=/data/modelscope
 ENV MS_CACHE_HOME=/data/modelscope
+ENV HF_HOME=/data/huggingface
+# Create cache directories
+RUN mkdir -p /data/modelscope /data/huggingface /home/user/.cache
 # Install PyTorch with CUDA 11.8
 RUN python3.11 -m pip install --no-cache-dir \
 RUN python3.11 -m pip install --no-cache-dir -r requirements_docker.txt
 # ============================================================
+# PRE-DOWNLOAD XTTS V2 ONLY (download-only via ModelManager - model is not loaded into RAM, safe for build)
 # ============================================================
+RUN python3.11 -c "from TTS.utils.manage import ModelManager; print('⏳ Downloading XTTS v2...'); ModelManager().download_model('tts_models/multilingual/multi-dataset/xtts_v2'); print('✅ XTTS v2 cached')"
 # ============================================================
+# emotion2vec (1.8GB) will download at RUNTIME to /data
+# First run: ~2 min download (one-time)
+# Subsequent runs: Instant (loads from /data)
 # ============================================================
 # Install avatar dependencies
 RUN python3.11 -m pip install --no-cache-dir \
     -days 365 \
     -subj "/CN=mrrrme.hf.space"
+# ✅ Startup script - Sets cache to /data (persistent!)
+RUN printf '#!/bin/bash\nset -e\nexport HOME=/home/user\n\n# Agree to TOS\nexport COQUI_TOS_AGREED=1\n\n# ============================================================\n# PERSISTENT CACHE - emotion2vec downloads to /data (survives rebuilds)\n# ============================================================\nexport MODELSCOPE_CACHE=/data/modelscope\nexport MS_CACHE_HOME=/data/modelscope\nexport HF_HOME=/data/huggingface\n\necho "📍 Model cache: $MODELSCOPE_CACHE"\n\n# Clean up old processes\npkill -f "backend_new.py" 2>/dev/null || true\npkill -f "speak_server.py" 2>/dev/null || true\npkill -f "node server.js" 2>/dev/null || true\npkill -f "nginx" 2>/dev/null || true\n\nsleep 2\n\necho "🚀 Starting MrrrMe..."\n\n# Start services (emotion2vec downloads on first backend startup)\ncd /app && python3.11 mrrrme/backend_new.py &\ncd /app/avatar && python3.11 speak_server.py &\ncd /app/frontend/.next/standalone && HOSTNAME=0.0.0.0 PORT=3001 node server.js &\n\nsleep 10\nnginx -g "daemon off;" &\n\necho "✅ All services running!"\nwait\n' > /app/start.sh && chmod +x /app/start.sh
 # Set ownership
 RUN chown -R 1000:1000 /app /data

mrrrme/audio/voice_emotion.py CHANGED Viewed

@@ -5,105 +5,62 @@ Achieves 93.12% accuracy on RAVDESS benchmark (vs 76.8% with HuBERT)
 Model: iic/emotion2vec_plus_large
 Framework: FunASR
-FIXED v2.1:
-- Now uses /data/modelscope for persistent caching (survives rebuilds!)
-- Fallback to /home/user/.cache/modelscope for local dev
-- Intelligent cache detection with file count verification
-- No more 5+ minute re-downloads on code changes!
 """
 import os
 import time
 import threading
 from collections import deque
 import numpy as np
 import webrtcvad
 from ..config import AUDIO_SR, AUDIO_BLOCK, CLIP_SECONDS, VAD_AGGRESSIVENESS, FUSE4
 # ============================================================
-# FIXED v2: Persistent cache priority for HuggingFace Spaces
 # ============================================================
-# Priority order:
-# 1. /data/modelscope (HF Spaces persistent storage - SURVIVES REBUILDS!)
-# 2. /home/user/.cache/modelscope (Docker build cache - rebuilt on code changes)
-# 3. /tmp/modelscope (fallback for local dev)
-def find_model_cache():
-    """Find where emotion2vec model is cached"""
-    possible_locations = [
-        '/data/modelscope',  # HF Spaces persistent (BEST - survives rebuilds)
-        '/home/user/.cache/modelscope',  # Docker build cache
-        '/tmp/modelscope'  # Fallback
     ]
-    for cache_dir in possible_locations:
-        # Check if this directory has the model files
-        if os.path.exists(cache_dir):
-            # Look for model hub structure
-            hub_paths = [
-                os.path.join(cache_dir, 'hub'),
-                os.path.join(cache_dir, 'iic'),
-                cache_dir
-            ]
-            for hub_path in hub_paths:
-                if os.path.exists(hub_path):
-                    # Check if it actually has files
-                    try:
-                        file_count = sum(1 for _ in os.walk(hub_path))
-                        if file_count > 5:  # Has actual content
-                            print(f"[VoiceEmotion] 🎯 Found model at: {cache_dir}")
-                            return cache_dir
-                    except:
-                        pass
-    # No existing cache found - use /data if writable (for first-time download)
-    if os.path.exists('/data') and os.access('/data', os.W_OK):
-        print(f"[VoiceEmotion] 📦 /data writable - will download there (PERSISTENT)")
-        return '/data/modelscope'
-    # Fallback to home
-    print(f"[VoiceEmotion] 📦 Using /home/user/.cache (non-persistent)")
-    return '/home/user/.cache/modelscope'
-CACHE_DIR = find_model_cache()
-# Create cache directory if needed
-os.makedirs(CACHE_DIR, exist_ok=True)
-print(f"[VoiceEmotion] 📁 Cache directory: {CACHE_DIR}")
-# Check if model exists
-model_exists = False
-for check_path in [
-    os.path.join(CACHE_DIR, 'hub'),
-    os.path.join(CACHE_DIR, 'iic'),
-    CACHE_DIR
-]:
-    if os.path.exists(check_path):
-        try:
-            file_count = sum(1 for _ in os.walk(check_path))
-            if file_count > 10:
-                model_exists = True
-                print(f"[VoiceEmotion] ✅ Model found in cache ({file_count} files)")
-                break
-        except:
-            pass
-if not model_exists:
-    print(f"[VoiceEmotion] ⚠️ Model NOT cached - will download (~1.8GB, 5+ min)")
     if CACHE_DIR == '/data/modelscope':
-        print(f"[VoiceEmotion] 💡 First download to /data will persist forever!")
-    else:
-        print(f"[VoiceEmotion] ⚠️ Using non-persistent cache - will re-download on rebuild")
-        print(f"[VoiceEmotion] 💡 Enable persistent storage in HuggingFace Spaces settings")
-# Set ALL cache environment variables BEFORE importing funasr
-os.environ['MODELSCOPE_CACHE'] = CACHE_DIR
-os.environ['MS_CACHE_HOME'] = CACHE_DIR
-os.environ['FUNASR_CACHE'] = CACHE_DIR
-os.environ['HF_HOME'] = os.path.dirname(CACHE_DIR)  # Parent dir for HF
 class VoiceEmotionWorker:
@@ -115,8 +72,7 @@ class VoiceEmotionWorker:
         - VAD-based silence optimization
         - Pause/resume for TTS coordination
         - Thread-safe inference
-        - Compatible with Server Mode (WebSocket audio)
-        - PERSISTENT model caching (no re-downloads!)
     """
     # emotion2vec labels to our 4-class mapping
@@ -130,7 +86,16 @@ class VoiceEmotionWorker:
         'sad': 'Sad',
         'surprised': 'Neutral',
         'unknown': 'Neutral',
-        # Also handle uppercase variants (some versions return these)
         'Angry': 'Angry',
         'Disgusted': 'Angry',
         'Fearful': 'Sad',
@@ -144,32 +109,42 @@ class VoiceEmotionWorker:
     def __init__(self, whisper_worker=None, device=None):
         print("\n[VoiceEmotion] Initializing emotion2vec+ Large...")
-        print(f"[VoiceEmotion] Cache directory: {CACHE_DIR}")
-        # Load emotion2vec model
         try:
             from funasr import AutoModel
             load_start = time.time()
-            # ============================================================
-            # FIXED v2: Model loads from /data if available (persistent!)
-            # ============================================================
-            print(f"[VoiceEmotion] 📦 Loading model from cache...")
             self.model = AutoModel(
                 model="iic/emotion2vec_plus_large",
-                disable_update=True,  # Don't check for updates (we want cached version)
-                hub="ms"  # Explicitly use ModelScope hub
             )
             load_time = time.time() - load_start
-            print(f"[VoiceEmotion] ✅ Model loaded in {load_time:.1f}s")
-            # Show cache effectiveness
-            if CACHE_DIR == '/data/modelscope':
-                print("[VoiceEmotion] 🎉 Using PERSISTENT cache - model will survive rebuilds!")
             else:
-                print("[VoiceEmotion] ⚠️ Using temporary cache - model will reload on rebuild")
             print("[VoiceEmotion] Model: emotion2vec_plus_large (93% accuracy)")
@@ -178,9 +153,12 @@ class VoiceEmotionWorker:
         except ImportError:
             print("[VoiceEmotion] ❌ ERROR: funasr not installed")
-            print("[VoiceEmotion] Install with: pip install funasr modelscope")
             raise
         except Exception as e:
             print(f"[VoiceEmotion] ❌ ERROR: Failed to load model: {e}")
             import traceback
             traceback.print_exc()
@@ -230,7 +208,6 @@ class VoiceEmotionWorker:
         # Log configuration
         print(f"[VoiceEmotion] Configuration:")
         print(f"[VoiceEmotion]   Sample rate: {AUDIO_SR} Hz")
-        print(f"[VoiceEmotion]   Audio block: {AUDIO_BLOCK}s")
         print(f"[VoiceEmotion]   Clip length: {CLIP_SECONDS}s")
         print(f"[VoiceEmotion]   Ring buffer: {int(CLIP_SECONDS / AUDIO_BLOCK)} chunks")
@@ -240,34 +217,31 @@ class VoiceEmotionWorker:
         print("[VoiceEmotion] Ready\n")
     def _verify_model(self):
-        """Verify model works by running a test inference"""
         try:
-            # Create 1 second of white noise for test (not silence)
             test_audio = np.random.randn(AUDIO_SR).astype(np.float32) * 0.1
             result = self.model.generate(test_audio, granularity="utterance")
             if result and len(result) > 0:
                 scores = result[0].get("scores", [])
                 labels = result[0].get("labels", [])
-                if scores and labels:
-                    # Check if scores are NOT uniform (model working correctly)
-                    if len(set(scores)) > 1:
-                        print(f"[VoiceEmotion] ✅ Model verified - outputs {len(labels)} emotion classes")
-                        print(f"[VoiceEmotion]    Labels: {labels[:4]}...")
-                        print(f"[VoiceEmotion]    Sample scores: {[f'{s:.3f}' for s in scores[:4]]}")
-                    else:
-                        print("[VoiceEmotion] ⚠️ WARNING: Model returning uniform scores!")
-                        print("[VoiceEmotion]    This indicates decoder weights may not be loaded correctly")
-                        print("[VoiceEmotion]    Try: rm -rf /data/modelscope && restart")
                 else:
-                    print("[VoiceEmotion] ⚠️ Model loaded but returned empty scores/labels")
-            else:
-                print("[VoiceEmotion] ⚠️ Model loaded but test inference returned empty result")
         except Exception as e:
-            print(f"[VoiceEmotion] ⚠️ Model verification failed: {e}")
-            # Don't raise - model might still work for real audio
     def pause_listening(self):
         """Pause audio processing (called when TTS starts)"""
@@ -279,7 +253,6 @@ class VoiceEmotionWorker:
     def resume_listening(self):
         """Resume audio processing (called when TTS finishes)"""
-        # Clear Whisper buffer
         whisper_cleared = 0
         if self.whisper_worker:
             try:
@@ -289,7 +262,6 @@ class VoiceEmotionWorker:
             except Exception as e:
                 self._log_error(f"Error clearing Whisper buffer: {e}")
-        # Clear emotion buffer
         emotion_cleared = len(self.ring)
         self.ring.clear()
         self.speech_chunks_count = 0
@@ -301,7 +273,7 @@ class VoiceEmotionWorker:
         print(f"[VoiceEmotion] ▶️ RESUMED (cleared {whisper_cleared + emotion_cleared} chunks)")
     def _log_error(self, message):
-        """Log errors with rate limiting to avoid spam"""
         current_time = time.time()
         if message != self.last_error_message or current_time - self.last_error_time > 5.0:
             print(f"[VoiceEmotion] ⚠️ {message}")
@@ -309,15 +281,9 @@ class VoiceEmotionWorker:
             self.last_error_time = current_time
     def add_audio(self, audio_data):
-        """
-        Add audio data for processing.
-        Args:
-            audio_data: float32 numpy array, mono, 16kHz
-        """
-        # Auto-start inference thread on first audio (Server Mode)
         if not self.running:
-            print("[VoiceEmotion] Auto-starting inference thread (Server Mode)")
             self.running = True
             self._start_inference_thread()
@@ -326,8 +292,7 @@ class VoiceEmotionWorker:
         except Exception as e:
             self._log_error(f"Audio processing error: {e}")
-    # Alias for backward compatibility
-    process_external_audio = add_audio
     def _process_audio_chunk(self, mono_data):
         """Process incoming audio chunk with VAD"""
@@ -378,23 +343,20 @@ class VoiceEmotionWorker:
         return chunks
     def _run_inference(self, audio_clip):
-        """
-        Run emotion2vec inference on audio clip.
-        Args:
-            audio_clip: float32 numpy array
-        Returns:
-            (probs_4class, top_emotion, confidence)
-        """
         try:
-            # emotion2vec expects 16kHz audio
             result = self.model.generate(audio_clip, granularity="utterance")
             if not result or len(result) == 0:
                 return None, "Neutral", 0.0
-            # Extract scores from result
             scores = result[0].get("scores", [])
             labels = result[0].get("labels", [])
@@ -405,7 +367,7 @@ class VoiceEmotionWorker:
             probs_4class = np.zeros(len(FUSE4), dtype=np.float32)
             for label, score in zip(labels, scores):
-                # Handle both lowercase and original case
                 label_key = label.lower() if label.lower() in self.EMOTION_MAP else label
                 if label_key in self.EMOTION_MAP:
@@ -420,7 +382,6 @@ class VoiceEmotionWorker:
             else:
                 probs_4class = np.full(len(FUSE4), 0.25, dtype=np.float32)
-            # Get top emotion
             top_idx = int(np.argmax(probs_4class))
             top_emotion = FUSE4[top_idx]
             confidence = float(probs_4class[top_idx])
@@ -428,6 +389,7 @@ class VoiceEmotionWorker:
             return probs_4class, top_emotion, confidence
         except Exception as e:
             self._log_error(f"Inference error: {e}")
             return None, "Neutral", 0.0
@@ -443,7 +405,7 @@ class VoiceEmotionWorker:
             loop_count += 1
             current_time = time.time()
-            # Adaptive interval: faster when speaking, slower when idle
             min_interval = 0.5 if self._is_speech_active() else 1.0
             if current_time - last_inference_time < min_interval:
                 time.sleep(0.05)
@@ -451,22 +413,22 @@ class VoiceEmotionWorker:
             last_inference_time = current_time
-            # Heartbeat logging (every ~10 seconds)
             if loop_count % 200 == 0:
                 with self.lock:
                     emotion = self.current_label
                     conf = self.last_confidence
                 total = self.inference_count + self.skipped_inferences
                 efficiency = (self.inference_count / total * 100) if total > 0 else 0
-                print(f"[VoiceEmotion] 💓 Heartbeat: {emotion} ({conf:.2f}), active={self._is_speech_active()}, efficiency={efficiency:.1f}%")
-            # Check pause state
             with self.pause_lock:
                 if self.paused:
                     time.sleep(0.1)
                     continue
-            # Skip if no speech activity
             if not self._is_speech_active():
                 self.skipped_inferences += 1
                 if not idle_logged:
@@ -485,7 +447,6 @@ class VoiceEmotionWorker:
                 self.skipped_inferences += 1
                 continue
-            # Decay speech count
             self.speech_chunks_count = max(0, self.speech_chunks_count - 1)
             # Prepare audio clip
@@ -504,9 +465,8 @@ class VoiceEmotionWorker:
             if probs is not None:
                 with self.lock:
                     old_label = self.current_label
-                    old_conf = self.last_confidence
-                    # Smooth probabilities (70% old + 30% new)
                     self.current_probs = 0.7 * self.current_probs + 0.3 * probs
                     self.current_label = FUSE4[int(self.current_probs.argmax())]
                     self.last_confidence = float(self.current_probs.max())
@@ -517,12 +477,12 @@ class VoiceEmotionWorker:
                 # Log emotion changes
                 if new_label != old_label:
-                    print(f"[VoiceEmotion] 🎭 Emotion: {old_label} → {new_label} (conf={self.last_confidence:.2f})")
         print("[VoiceEmotion] ⏹️ Inference loop stopped")
     def _start_inference_thread(self):
-        """Start the background inference thread"""
         try:
             self.th = threading.Thread(target=self._infer_loop, daemon=True)
             self.th.start()
@@ -566,7 +526,6 @@ class VoiceEmotionWorker:
         mono = indata[:, 0] if indata.ndim > 1 else indata
-        # Forward to Whisper if linked
         if self.whisper_worker is not None:
             try:
                 self.whisper_worker.add_audio(mono.copy())
@@ -591,22 +550,12 @@ class VoiceEmotionWorker:
                 pass
     def get_probs(self):
-        """
-        Get current emotion probabilities.
-        Returns:
-            (probs: np.array[4], label: str)
-        """
         with self.lock:
             return self.current_probs.copy(), self.current_label
     def get_state(self):
-        """
-        Get full worker state for debugging.
-        Returns:
-            dict with running, speech_active, emotion info, stats
-        """
         with self.lock:
             probs = self.current_probs.copy()
             label = self.current_label

 Model: iic/emotion2vec_plus_large
 Framework: FunASR
+FIXED v2.3:
+- Simplified: Downloads at runtime to /data (persistent)
+- Suppressed verbose ModelScope download logs
+- Clean logging output
 """
 import os
+import sys
 import time
 import threading
 from collections import deque
 import numpy as np
 import webrtcvad
+# Suppress verbose download logs BEFORE importing funasr
+import logging
+logging.getLogger('modelscope').setLevel(logging.WARNING)
+logging.getLogger('funasr').setLevel(logging.WARNING)
 from ..config import AUDIO_SR, AUDIO_BLOCK, CLIP_SECONDS, VAD_AGGRESSIVENESS, FUSE4
 # ============================================================
+# PERSISTENT CACHE - Use /data if available (survives rebuilds!)
 # ============================================================
+CACHE_DIR = '/data/modelscope' if os.path.exists('/data') else '/home/user/.cache/modelscope'
+os.makedirs(CACHE_DIR, exist_ok=True)
+# Set cache environment variables
+os.environ['MODELSCOPE_CACHE'] = CACHE_DIR
+os.environ['MS_CACHE_HOME'] = CACHE_DIR
+os.environ['HF_HOME'] = os.path.dirname(CACHE_DIR)
+# Check if model is cached
+def check_model_cached():
+    """Check if emotion2vec is already downloaded"""
+    model_paths = [
+        os.path.join(CACHE_DIR, 'models', 'iic', 'emotion2vec_plus_large'),
+        os.path.join(CACHE_DIR, 'hub', 'iic', 'emotion2vec_plus_large'),
     ]
+    for path in model_paths:
+        if os.path.exists(path):
+            model_file = os.path.join(path, 'model.pt')
+            if os.path.exists(model_file):
+                return True, path
+    return False, None
+is_cached, cache_path = check_model_cached()
+if is_cached:
+    print(f"[VoiceEmotion] ✅ Model cached at: {cache_path}")
+else:
+    print(f"[VoiceEmotion] 📥 First run - downloading to {CACHE_DIR} (~2 min)")
     if CACHE_DIR == '/data/modelscope':
+        print(f"[VoiceEmotion] 💡 Using persistent storage - will only download once!")
 class VoiceEmotionWorker:
         - VAD-based silence optimization
         - Pause/resume for TTS coordination
         - Thread-safe inference
+        - PERSISTENT caching to /data (survives rebuilds)
     """
     # emotion2vec labels to our 4-class mapping
         'sad': 'Sad',
         'surprised': 'Neutral',
         'unknown': 'Neutral',
+        # Also handle bilingual (Chinese/English) label variants
+        '生气/angry': 'Angry',
+        '厌恶/disgusted': 'Angry',
+        '恐惧/fearful': 'Sad',
+        '开心/happy': 'Happy',
+        '中立/neutral': 'Neutral',
+        '其他/other': 'Neutral',
+        '难过/sad': 'Sad',
+        '吃惊/surprised': 'Neutral',
+        # Uppercase variants
         'Angry': 'Angry',
         'Disgusted': 'Angry',
         'Fearful': 'Sad',
     def __init__(self, whisper_worker=None, device=None):
         print("\n[VoiceEmotion] Initializing emotion2vec+ Large...")
+        # Load emotion2vec model with suppressed logs
         try:
+            # Suppress ModelScope verbose output
+            old_stdout = sys.stdout
+            old_stderr = sys.stderr
+            # Only suppress if not cached
+            if not is_cached:
+                sys.stdout = open(os.devnull, 'w')
+                sys.stderr = open(os.devnull, 'w')
             from funasr import AutoModel
             load_start = time.time()
             self.model = AutoModel(
                 model="iic/emotion2vec_plus_large",
+                disable_update=True,
+                hub="ms"
             )
+            # Restore output
+            if not is_cached:
+                sys.stdout.close()
+                sys.stderr.close()
+            sys.stdout = old_stdout
+            sys.stderr = old_stderr
             load_time = time.time() - load_start
+            if is_cached:
+                print(f"[VoiceEmotion] ✅ Loaded from cache in {load_time:.1f}s")
             else:
+                print(f"[VoiceEmotion] ✅ Downloaded + loaded in {load_time:.1f}s")
+                print(f"[VoiceEmotion] 🎉 Cached to {CACHE_DIR} - future runs will be instant!")
             print("[VoiceEmotion] Model: emotion2vec_plus_large (93% accuracy)")
         except ImportError:
             print("[VoiceEmotion] ❌ ERROR: funasr not installed")
+            print("[VoiceEmotion] Install with: pip install funasr")
             raise
         except Exception as e:
+            # Restore output on error
+            sys.stdout = old_stdout
+            sys.stderr = old_stderr
             print(f"[VoiceEmotion] ❌ ERROR: Failed to load model: {e}")
             import traceback
             traceback.print_exc()
         # Log configuration
         print(f"[VoiceEmotion] Configuration:")
         print(f"[VoiceEmotion]   Sample rate: {AUDIO_SR} Hz")
         print(f"[VoiceEmotion]   Clip length: {CLIP_SECONDS}s")
         print(f"[VoiceEmotion]   Ring buffer: {int(CLIP_SECONDS / AUDIO_BLOCK)} chunks")
         print("[VoiceEmotion] Ready\n")
     def _verify_model(self):
+        """Verify model works with quick test inference"""
         try:
             test_audio = np.random.randn(AUDIO_SR).astype(np.float32) * 0.1
+            # Suppress test inference logs
+            old_stdout = sys.stdout
+            sys.stdout = open(os.devnull, 'w')
             result = self.model.generate(test_audio, granularity="utterance")
+            sys.stdout.close()
+            sys.stdout = old_stdout
             if result and len(result) > 0:
                 scores = result[0].get("scores", [])
                 labels = result[0].get("labels", [])
+                if scores and labels and len(set(scores)) > 1:
+                    print(f"[VoiceEmotion] ✅ Model verified ({len(labels)} emotion classes)")
                 else:
+                    print("[VoiceEmotion] ⚠️ Model loaded but may have issues")
         except Exception as e:
+            sys.stdout = old_stdout
+            print(f"[VoiceEmotion] ⚠️ Verification failed: {e}")
     def pause_listening(self):
         """Pause audio processing (called when TTS starts)"""
     def resume_listening(self):
         """Resume audio processing (called when TTS finishes)"""
         whisper_cleared = 0
         if self.whisper_worker:
             try:
             except Exception as e:
                 self._log_error(f"Error clearing Whisper buffer: {e}")
         emotion_cleared = len(self.ring)
         self.ring.clear()
         self.speech_chunks_count = 0
         print(f"[VoiceEmotion] ▶️ RESUMED (cleared {whisper_cleared + emotion_cleared} chunks)")
     def _log_error(self, message):
+        """Log errors with rate limiting"""
         current_time = time.time()
         if message != self.last_error_message or current_time - self.last_error_time > 5.0:
             print(f"[VoiceEmotion] ⚠️ {message}")
             self.last_error_time = current_time
     def add_audio(self, audio_data):
+        """Add audio data for processing"""
         if not self.running:
+            print("[VoiceEmotion] Auto-starting inference thread")
             self.running = True
             self._start_inference_thread()
         except Exception as e:
             self._log_error(f"Audio processing error: {e}")
+    process_external_audio = add_audio  # Alias
     def _process_audio_chunk(self, mono_data):
         """Process incoming audio chunk with VAD"""
         return chunks
     def _run_inference(self, audio_clip):
+        """Run emotion2vec inference (with suppressed logs)"""
         try:
+            # Suppress inference logs
+            old_stdout = sys.stdout
+            sys.stdout = open(os.devnull, 'w')
             result = self.model.generate(audio_clip, granularity="utterance")
+            sys.stdout.close()
+            sys.stdout = old_stdout
             if not result or len(result) == 0:
                 return None, "Neutral", 0.0
             scores = result[0].get("scores", [])
             labels = result[0].get("labels", [])
             probs_4class = np.zeros(len(FUSE4), dtype=np.float32)
             for label, score in zip(labels, scores):
+                # Try lowercase first, then original
                 label_key = label.lower() if label.lower() in self.EMOTION_MAP else label
                 if label_key in self.EMOTION_MAP:
             else:
                 probs_4class = np.full(len(FUSE4), 0.25, dtype=np.float32)
             top_idx = int(np.argmax(probs_4class))
             top_emotion = FUSE4[top_idx]
             confidence = float(probs_4class[top_idx])
             return probs_4class, top_emotion, confidence
         except Exception as e:
+            sys.stdout = old_stdout
             self._log_error(f"Inference error: {e}")
             return None, "Neutral", 0.0
             loop_count += 1
             current_time = time.time()
+            # Adaptive interval
             min_interval = 0.5 if self._is_speech_active() else 1.0
             if current_time - last_inference_time < min_interval:
                 time.sleep(0.05)
             last_inference_time = current_time
+            # Heartbeat (every ~10 seconds)
             if loop_count % 200 == 0:
                 with self.lock:
                     emotion = self.current_label
                     conf = self.last_confidence
                 total = self.inference_count + self.skipped_inferences
                 efficiency = (self.inference_count / total * 100) if total > 0 else 0
+                print(f"[VoiceEmotion] 💓 Heartbeat: {emotion} ({conf:.2f}), efficiency={efficiency:.1f}%")
+            # Check pause
             with self.pause_lock:
                 if self.paused:
                     time.sleep(0.1)
                     continue
+            # Skip if no speech
             if not self._is_speech_active():
                 self.skipped_inferences += 1
                 if not idle_logged:
                 self.skipped_inferences += 1
                 continue
             self.speech_chunks_count = max(0, self.speech_chunks_count - 1)
             # Prepare audio clip
             if probs is not None:
                 with self.lock:
                     old_label = self.current_label
+                    # Smooth probabilities
                     self.current_probs = 0.7 * self.current_probs + 0.3 * probs
                     self.current_label = FUSE4[int(self.current_probs.argmax())]
                     self.last_confidence = float(self.current_probs.max())
                 # Log emotion changes
                 if new_label != old_label:
+                    print(f"[VoiceEmotion] 🎭 {old_label} → {new_label} (conf={self.last_confidence:.2f})")
         print("[VoiceEmotion] ⏹️ Inference loop stopped")
     def _start_inference_thread(self):
+        """Start background inference thread"""
         try:
             self.th = threading.Thread(target=self._infer_loop, daemon=True)
             self.th.start()
         mono = indata[:, 0] if indata.ndim > 1 else indata
         if self.whisper_worker is not None:
             try:
                 self.whisper_worker.add_audio(mono.copy())
                 pass
     def get_probs(self):
+        """Get current emotion probabilities"""
         with self.lock:
             return self.current_probs.copy(), self.current_label
     def get_state(self):
+        """Get worker state for debugging"""
         with self.lock:
             probs = self.current_probs.copy()
             label = self.current_label