MichonGoddijn231849 committed on
Commit
1af2a91
·
1 Parent(s): defeda7

fix downloading emotion2vec3

Browse files
Files changed (2) hide show
  1. Dockerfile +13 -38
  2. mrrrme/audio/voice_emotion.py +113 -164
Dockerfile CHANGED
@@ -1,5 +1,5 @@
1
  # Hugging Face Spaces - MrrrMe with Coqui XTTS v2 + MODULAR BACKEND
2
- # FIXED v2.2: emotion2vec downloads WITHOUT loading into RAM (prevents OOM)
3
  FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
4
 
5
  # Install system dependencies
@@ -36,11 +36,14 @@ RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 &
36
  WORKDIR /app
37
 
38
  # ============================================================
39
- # MODEL CACHING - Use persistent storage for large models
40
  # ============================================================
41
- # HuggingFace Spaces persistent storage: /data (survives rebuilds!)
42
  ENV MODELSCOPE_CACHE=/data/modelscope
43
  ENV MS_CACHE_HOME=/data/modelscope
 
 
 
 
44
 
45
  # Install PyTorch with CUDA 11.8
46
  RUN python3.11 -m pip install --no-cache-dir \
@@ -56,43 +59,15 @@ COPY requirements_docker.txt ./
56
  RUN python3.11 -m pip install --no-cache-dir -r requirements_docker.txt
57
 
58
  # ============================================================
59
- # PRE-DOWNLOAD MODELS DURING BUILD (for fast startup!)
60
  # ============================================================
61
-
62
- # PRE-DOWNLOAD XTTS V2 MODEL (Memory Safe)
63
- # Uses ModelManager to download without loading to RAM (Fixes Exit 137)
64
- RUN python3.11 -c "from TTS.utils.manage import ModelManager; print('⏳ Downloading XTTS v2 model...'); ModelManager().download_model('tts_models/multilingual/multi-dataset/xtts_v2'); print('✅ XTTS v2 downloaded.')"
65
 
66
  # ============================================================
67
- # FIXED v2.2: Download emotion2vec WITHOUT loading into memory
 
 
68
  # ============================================================
69
- # Problem: AutoModel() loads 1.8GB into RAM during build → OOM crash (exit 137)
70
- # Solution: Use ModelScope SDK to download files only, no model instantiation
71
- #
72
- # Strategy:
73
- # 1. Download model files to /data/modelscope (persistent storage)
74
- # 2. Model will be loaded at RUNTIME (when container has more memory)
75
- # 3. Subsequent rebuilds skip download (files already in /data)
76
-
77
- RUN python3.11 -c "\
78
- import os; \
79
- os.makedirs('/data/modelscope', exist_ok=True); \
80
- os.environ['MODELSCOPE_CACHE'] = '/data/modelscope'; \
81
- os.environ['MS_CACHE_HOME'] = '/data/modelscope'; \
82
- print('⏳ Downloading emotion2vec+ Large files (1.8GB)...'); \
83
- print('💡 Files only - model will load at runtime to avoid OOM'); \
84
- from modelscope.hub.snapshot_download import snapshot_download; \
85
- model_dir = snapshot_download('iic/emotion2vec_plus_large', cache_dir='/data/modelscope'); \
86
- print(f'✅ Downloaded to: {model_dir}'); \
87
- import glob; \
88
- files = glob.glob(model_dir + '/**/*', recursive=True); \
89
- print(f'📦 Downloaded {len(files)} files')"
90
-
91
- # Also create fallback cache for local dev
92
- RUN mkdir -p /home/user/.cache/modelscope && \
93
- cp -r /data/modelscope/* /home/user/.cache/modelscope/ 2>/dev/null || true && \
94
- chown -R 1000:1000 /home/user && \
95
- echo "✅ Model cached in /data/modelscope (PERSISTENT)"
96
 
97
  # Install avatar dependencies
98
  RUN python3.11 -m pip install --no-cache-dir \
@@ -134,8 +109,8 @@ RUN mkdir -p /etc/nginx/certs && \
134
  -days 365 \
135
  -subj "/CN=mrrrme.hf.space"
136
 
137
- # ✅ Create startup script with PERSISTENT CACHE
138
- RUN printf '#!/bin/bash\nset -e\nexport HOME=/home/user\nmkdir -p /tmp\n\n# Agree to TOS\nexport COQUI_TOS_AGREED=1\n\n# ============================================================\n# emotion2vec cache - Use /data if available (persistent!)\n# ============================================================\nif [ -d "/data/modelscope" ] && [ -n "$(ls -A /data/modelscope 2>/dev/null)" ]; then\n echo "✅ Using persistent storage: /data/modelscope"\n export MODELSCOPE_CACHE=/data/modelscope\n export MS_CACHE_HOME=/data/modelscope\n du -sh /data/modelscope 2>/dev/null || echo "Calculating size..."\nelse\n echo "⚠️ Persistent storage not available, using /home/user/.cache/modelscope"\n export MODELSCOPE_CACHE=/home/user/.cache/modelscope\n export MS_CACHE_HOME=/home/user/.cache/modelscope\nfi\n\necho "📍 Model cache: $MODELSCOPE_CACHE"\n\npkill -f "backend_new.py" 2>/dev/null || true\npkill -f "speak_server.py" 2>/dev/null || true\npkill -f "node server.js" 2>/dev/null || true\npkill -f "nginx" 2>/dev/null || true\n\nsleep 2\necho "🚀 Starting MrrrMe (XTTS v2 + emotion2vec + Modular Backend v2.2)..."\necho "📦 emotion2vec will load from cache (NO download)"\n\n# Start NEW modular backend\ncd /app && python3.11 mrrrme/backend_new.py &\n\n# Start avatar TTS\ncd /app/avatar && python3.11 speak_server.py &\n\n# Start Next.js frontend\ncd /app/frontend/.next/standalone && HOSTNAME=0.0.0.0 PORT=3001 node server.js &\n\nsleep 10\nnginx -g "daemon off;" &\necho "✅ All services running!"\nwait\n' > /app/start.sh && chmod +x /app/start.sh
139
 
140
  # Set ownership
141
  RUN chown -R 1000:1000 /app /data
 
1
  # Hugging Face Spaces - MrrrMe with Coqui XTTS v2 + MODULAR BACKEND
2
+ # FIXED v2.3: SIMPLE & RELIABLE - Let emotion2vec download at runtime to /data
3
  FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
4
 
5
  # Install system dependencies
 
36
  WORKDIR /app
37
 
38
  # ============================================================
39
+ # PERSISTENT STORAGE - emotion2vec will download here at runtime
40
  # ============================================================
 
41
  ENV MODELSCOPE_CACHE=/data/modelscope
42
  ENV MS_CACHE_HOME=/data/modelscope
43
+ ENV HF_HOME=/data/huggingface
44
+
45
+ # Create cache directories
46
+ RUN mkdir -p /data/modelscope /data/huggingface /home/user/.cache
47
 
48
  # Install PyTorch with CUDA 11.8
49
  RUN python3.11 -m pip install --no-cache-dir \
 
59
  RUN python3.11 -m pip install --no-cache-dir -r requirements_docker.txt
60
 
61
  # ============================================================
62
+ # PRE-DOWNLOAD XTTS V2 ONLY (small, safe for build)
63
  # ============================================================
64
+ RUN python3.11 -c "from TTS.utils.manage import ModelManager; print('⏳ Downloading XTTS v2...'); ModelManager().download_model('tts_models/multilingual/multi-dataset/xtts_v2'); print('✅ XTTS v2 cached')"
 
 
 
65
 
66
  # ============================================================
67
+ # emotion2vec (1.8GB) will download at RUNTIME to /data
68
+ # First run: ~2 min download (one-time)
69
+ # Subsequent runs: Instant (loads from /data)
70
  # ============================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  # Install avatar dependencies
73
  RUN python3.11 -m pip install --no-cache-dir \
 
109
  -days 365 \
110
  -subj "/CN=mrrrme.hf.space"
111
 
112
+ # ✅ Startup script - Sets cache to /data (persistent!)
113
+ RUN printf '#!/bin/bash\nset -e\nexport HOME=/home/user\n\n# Agree to TOS\nexport COQUI_TOS_AGREED=1\n\n# ============================================================\n# PERSISTENT CACHE - emotion2vec downloads to /data (survives rebuilds)\n# ============================================================\nexport MODELSCOPE_CACHE=/data/modelscope\nexport MS_CACHE_HOME=/data/modelscope\nexport HF_HOME=/data/huggingface\n\necho "📍 Model cache: $MODELSCOPE_CACHE"\n\n# Clean up old processes\npkill -f "backend_new.py" 2>/dev/null || true\npkill -f "speak_server.py" 2>/dev/null || true\npkill -f "node server.js" 2>/dev/null || true\npkill -f "nginx" 2>/dev/null || true\n\nsleep 2\n\necho "🚀 Starting MrrrMe..."\n\n# Start services (emotion2vec downloads on first backend startup)\ncd /app && python3.11 mrrrme/backend_new.py &\ncd /app/avatar && python3.11 speak_server.py &\ncd /app/frontend/.next/standalone && HOSTNAME=0.0.0.0 PORT=3001 node server.js &\n\nsleep 10\nnginx -g "daemon off;" &\n\necho "✅ All services running!"\nwait\n' > /app/start.sh && chmod +x /app/start.sh
114
 
115
  # Set ownership
116
  RUN chown -R 1000:1000 /app /data
mrrrme/audio/voice_emotion.py CHANGED
@@ -5,105 +5,62 @@ Achieves 93.12% accuracy on RAVDESS benchmark (vs 76.8% with HuBERT)
5
  Model: iic/emotion2vec_plus_large
6
  Framework: FunASR
7
 
8
- FIXED v2.1:
9
- - Now uses /data/modelscope for persistent caching (survives rebuilds!)
10
- - Fallback to /home/user/.cache/modelscope for local dev
11
- - Intelligent cache detection with file count verification
12
- - No more 5+ minute re-downloads on code changes!
13
  """
14
 
15
  import os
 
16
  import time
17
  import threading
18
  from collections import deque
19
  import numpy as np
20
  import webrtcvad
21
 
 
 
 
 
 
22
  from ..config import AUDIO_SR, AUDIO_BLOCK, CLIP_SECONDS, VAD_AGGRESSIVENESS, FUSE4
23
 
24
  # ============================================================
25
- # FIXED v2: Persistent cache priority for HuggingFace Spaces
26
  # ============================================================
27
- # Priority order:
28
- # 1. /data/modelscope (HF Spaces persistent storage - SURVIVES REBUILDS!)
29
- # 2. /home/user/.cache/modelscope (Docker build cache - rebuilt on code changes)
30
- # 3. /tmp/modelscope (fallback for local dev)
 
 
 
31
 
32
- def find_model_cache():
33
- """Find where emotion2vec model is cached"""
34
- possible_locations = [
35
- '/data/modelscope', # HF Spaces persistent (BEST - survives rebuilds)
36
- '/home/user/.cache/modelscope', # Docker build cache
37
- '/tmp/modelscope' # Fallback
38
  ]
39
 
40
- for cache_dir in possible_locations:
41
- # Check if this directory has the model files
42
- if os.path.exists(cache_dir):
43
- # Look for model hub structure
44
- hub_paths = [
45
- os.path.join(cache_dir, 'hub'),
46
- os.path.join(cache_dir, 'iic'),
47
- cache_dir
48
- ]
49
-
50
- for hub_path in hub_paths:
51
- if os.path.exists(hub_path):
52
- # Check if it actually has files
53
- try:
54
- file_count = sum(1 for _ in os.walk(hub_path))
55
- if file_count > 5: # Has actual content
56
- print(f"[VoiceEmotion] 🎯 Found model at: {cache_dir}")
57
- return cache_dir
58
- except:
59
- pass
60
-
61
- # No existing cache found - use /data if writable (for first-time download)
62
- if os.path.exists('/data') and os.access('/data', os.W_OK):
63
- print(f"[VoiceEmotion] 📦 /data writable - will download there (PERSISTENT)")
64
- return '/data/modelscope'
65
 
66
- # Fallback to home
67
- print(f"[VoiceEmotion] 📦 Using /home/user/.cache (non-persistent)")
68
- return '/home/user/.cache/modelscope'
69
 
70
- CACHE_DIR = find_model_cache()
71
-
72
- # Create cache directory if needed
73
- os.makedirs(CACHE_DIR, exist_ok=True)
74
 
75
- print(f"[VoiceEmotion] 📁 Cache directory: {CACHE_DIR}")
76
-
77
- # Check if model exists
78
- model_exists = False
79
- for check_path in [
80
- os.path.join(CACHE_DIR, 'hub'),
81
- os.path.join(CACHE_DIR, 'iic'),
82
- CACHE_DIR
83
- ]:
84
- if os.path.exists(check_path):
85
- try:
86
- file_count = sum(1 for _ in os.walk(check_path))
87
- if file_count > 10:
88
- model_exists = True
89
- print(f"[VoiceEmotion] ✅ Model found in cache ({file_count} files)")
90
- break
91
- except:
92
- pass
93
-
94
- if not model_exists:
95
- print(f"[VoiceEmotion] ⚠️ Model NOT cached - will download (~1.8GB, 5+ min)")
96
  if CACHE_DIR == '/data/modelscope':
97
- print(f"[VoiceEmotion] 💡 First download to /data will persist forever!")
98
- else:
99
- print(f"[VoiceEmotion] ⚠️ Using non-persistent cache - will re-download on rebuild")
100
- print(f"[VoiceEmotion] 💡 Enable persistent storage in HuggingFace Spaces settings")
101
-
102
- # Set ALL cache environment variables BEFORE importing funasr
103
- os.environ['MODELSCOPE_CACHE'] = CACHE_DIR
104
- os.environ['MS_CACHE_HOME'] = CACHE_DIR
105
- os.environ['FUNASR_CACHE'] = CACHE_DIR
106
- os.environ['HF_HOME'] = os.path.dirname(CACHE_DIR) # Parent dir for HF
107
 
108
 
109
  class VoiceEmotionWorker:
@@ -115,8 +72,7 @@ class VoiceEmotionWorker:
115
  - VAD-based silence optimization
116
  - Pause/resume for TTS coordination
117
  - Thread-safe inference
118
- - Compatible with Server Mode (WebSocket audio)
119
- - PERSISTENT model caching (no re-downloads!)
120
  """
121
 
122
  # emotion2vec labels to our 4-class mapping
@@ -130,7 +86,16 @@ class VoiceEmotionWorker:
130
  'sad': 'Sad',
131
  'surprised': 'Neutral',
132
  'unknown': 'Neutral',
133
- # Also handle uppercase variants (some versions return these)
 
 
 
 
 
 
 
 
 
134
  'Angry': 'Angry',
135
  'Disgusted': 'Angry',
136
  'Fearful': 'Sad',
@@ -144,32 +109,42 @@ class VoiceEmotionWorker:
144
 
145
  def __init__(self, whisper_worker=None, device=None):
146
  print("\n[VoiceEmotion] Initializing emotion2vec+ Large...")
147
- print(f"[VoiceEmotion] Cache directory: {CACHE_DIR}")
148
 
149
- # Load emotion2vec model
150
  try:
 
 
 
 
 
 
 
 
 
151
  from funasr import AutoModel
152
 
153
  load_start = time.time()
154
 
155
- # ============================================================
156
- # FIXED v2: Model loads from /data if available (persistent!)
157
- # ============================================================
158
- print(f"[VoiceEmotion] 📦 Loading model from cache...")
159
  self.model = AutoModel(
160
  model="iic/emotion2vec_plus_large",
161
- disable_update=True, # Don't check for updates (we want cached version)
162
- hub="ms" # Explicitly use ModelScope hub
163
  )
164
 
 
 
 
 
 
 
 
165
  load_time = time.time() - load_start
166
- print(f"[VoiceEmotion] ✅ Model loaded in {load_time:.1f}s")
167
 
168
- # Show cache effectiveness
169
- if CACHE_DIR == '/data/modelscope':
170
- print("[VoiceEmotion] 🎉 Using PERSISTENT cache - model will survive rebuilds!")
171
  else:
172
- print("[VoiceEmotion] ⚠️ Using temporary cache - model will reload on rebuild")
 
173
 
174
  print("[VoiceEmotion] Model: emotion2vec_plus_large (93% accuracy)")
175
 
@@ -178,9 +153,12 @@ class VoiceEmotionWorker:
178
 
179
  except ImportError:
180
  print("[VoiceEmotion] ❌ ERROR: funasr not installed")
181
- print("[VoiceEmotion] Install with: pip install funasr modelscope")
182
  raise
183
  except Exception as e:
 
 
 
184
  print(f"[VoiceEmotion] ❌ ERROR: Failed to load model: {e}")
185
  import traceback
186
  traceback.print_exc()
@@ -230,7 +208,6 @@ class VoiceEmotionWorker:
230
  # Log configuration
231
  print(f"[VoiceEmotion] Configuration:")
232
  print(f"[VoiceEmotion] Sample rate: {AUDIO_SR} Hz")
233
- print(f"[VoiceEmotion] Audio block: {AUDIO_BLOCK}s")
234
  print(f"[VoiceEmotion] Clip length: {CLIP_SECONDS}s")
235
  print(f"[VoiceEmotion] Ring buffer: {int(CLIP_SECONDS / AUDIO_BLOCK)} chunks")
236
 
@@ -240,34 +217,31 @@ class VoiceEmotionWorker:
240
  print("[VoiceEmotion] Ready\n")
241
 
242
  def _verify_model(self):
243
- """Verify model works by running a test inference"""
244
  try:
245
- # Create 1 second of white noise for test (not silence)
246
  test_audio = np.random.randn(AUDIO_SR).astype(np.float32) * 0.1
 
 
 
 
 
247
  result = self.model.generate(test_audio, granularity="utterance")
248
 
 
 
 
249
  if result and len(result) > 0:
250
  scores = result[0].get("scores", [])
251
  labels = result[0].get("labels", [])
252
 
253
- if scores and labels:
254
- # Check if scores are NOT uniform (model working correctly)
255
- if len(set(scores)) > 1:
256
- print(f"[VoiceEmotion] ✅ Model verified - outputs {len(labels)} emotion classes")
257
- print(f"[VoiceEmotion] Labels: {labels[:4]}...")
258
- print(f"[VoiceEmotion] Sample scores: {[f'{s:.3f}' for s in scores[:4]]}")
259
- else:
260
- print("[VoiceEmotion] ⚠️ WARNING: Model returning uniform scores!")
261
- print("[VoiceEmotion] This indicates decoder weights may not be loaded correctly")
262
- print("[VoiceEmotion] Try: rm -rf /data/modelscope && restart")
263
  else:
264
- print("[VoiceEmotion] ⚠️ Model loaded but returned empty scores/labels")
265
- else:
266
- print("[VoiceEmotion] ⚠️ Model loaded but test inference returned empty result")
267
-
268
  except Exception as e:
269
- print(f"[VoiceEmotion] ⚠️ Model verification failed: {e}")
270
- # Don't raise - model might still work for real audio
271
 
272
  def pause_listening(self):
273
  """Pause audio processing (called when TTS starts)"""
@@ -279,7 +253,6 @@ class VoiceEmotionWorker:
279
 
280
  def resume_listening(self):
281
  """Resume audio processing (called when TTS finishes)"""
282
- # Clear Whisper buffer
283
  whisper_cleared = 0
284
  if self.whisper_worker:
285
  try:
@@ -289,7 +262,6 @@ class VoiceEmotionWorker:
289
  except Exception as e:
290
  self._log_error(f"Error clearing Whisper buffer: {e}")
291
 
292
- # Clear emotion buffer
293
  emotion_cleared = len(self.ring)
294
  self.ring.clear()
295
  self.speech_chunks_count = 0
@@ -301,7 +273,7 @@ class VoiceEmotionWorker:
301
  print(f"[VoiceEmotion] ▶️ RESUMED (cleared {whisper_cleared + emotion_cleared} chunks)")
302
 
303
  def _log_error(self, message):
304
- """Log errors with rate limiting to avoid spam"""
305
  current_time = time.time()
306
  if message != self.last_error_message or current_time - self.last_error_time > 5.0:
307
  print(f"[VoiceEmotion] ⚠️ {message}")
@@ -309,15 +281,9 @@ class VoiceEmotionWorker:
309
  self.last_error_time = current_time
310
 
311
  def add_audio(self, audio_data):
312
- """
313
- Add audio data for processing.
314
-
315
- Args:
316
- audio_data: float32 numpy array, mono, 16kHz
317
- """
318
- # Auto-start inference thread on first audio (Server Mode)
319
  if not self.running:
320
- print("[VoiceEmotion] Auto-starting inference thread (Server Mode)")
321
  self.running = True
322
  self._start_inference_thread()
323
 
@@ -326,8 +292,7 @@ class VoiceEmotionWorker:
326
  except Exception as e:
327
  self._log_error(f"Audio processing error: {e}")
328
 
329
- # Alias for backward compatibility
330
- process_external_audio = add_audio
331
 
332
  def _process_audio_chunk(self, mono_data):
333
  """Process incoming audio chunk with VAD"""
@@ -378,23 +343,20 @@ class VoiceEmotionWorker:
378
  return chunks
379
 
380
  def _run_inference(self, audio_clip):
381
- """
382
- Run emotion2vec inference on audio clip.
383
-
384
- Args:
385
- audio_clip: float32 numpy array
386
-
387
- Returns:
388
- (probs_4class, top_emotion, confidence)
389
- """
390
  try:
391
- # emotion2vec expects 16kHz audio
 
 
 
392
  result = self.model.generate(audio_clip, granularity="utterance")
393
 
 
 
 
394
  if not result or len(result) == 0:
395
  return None, "Neutral", 0.0
396
 
397
- # Extract scores from result
398
  scores = result[0].get("scores", [])
399
  labels = result[0].get("labels", [])
400
 
@@ -405,7 +367,7 @@ class VoiceEmotionWorker:
405
  probs_4class = np.zeros(len(FUSE4), dtype=np.float32)
406
 
407
  for label, score in zip(labels, scores):
408
- # Handle both lowercase and original case
409
  label_key = label.lower() if label.lower() in self.EMOTION_MAP else label
410
 
411
  if label_key in self.EMOTION_MAP:
@@ -420,7 +382,6 @@ class VoiceEmotionWorker:
420
  else:
421
  probs_4class = np.full(len(FUSE4), 0.25, dtype=np.float32)
422
 
423
- # Get top emotion
424
  top_idx = int(np.argmax(probs_4class))
425
  top_emotion = FUSE4[top_idx]
426
  confidence = float(probs_4class[top_idx])
@@ -428,6 +389,7 @@ class VoiceEmotionWorker:
428
  return probs_4class, top_emotion, confidence
429
 
430
  except Exception as e:
 
431
  self._log_error(f"Inference error: {e}")
432
  return None, "Neutral", 0.0
433
 
@@ -443,7 +405,7 @@ class VoiceEmotionWorker:
443
  loop_count += 1
444
  current_time = time.time()
445
 
446
- # Adaptive interval: faster when speaking, slower when idle
447
  min_interval = 0.5 if self._is_speech_active() else 1.0
448
  if current_time - last_inference_time < min_interval:
449
  time.sleep(0.05)
@@ -451,22 +413,22 @@ class VoiceEmotionWorker:
451
 
452
  last_inference_time = current_time
453
 
454
- # Heartbeat logging (every ~10 seconds)
455
  if loop_count % 200 == 0:
456
  with self.lock:
457
  emotion = self.current_label
458
  conf = self.last_confidence
459
  total = self.inference_count + self.skipped_inferences
460
  efficiency = (self.inference_count / total * 100) if total > 0 else 0
461
- print(f"[VoiceEmotion] 💓 Heartbeat: {emotion} ({conf:.2f}), active={self._is_speech_active()}, efficiency={efficiency:.1f}%")
462
 
463
- # Check pause state
464
  with self.pause_lock:
465
  if self.paused:
466
  time.sleep(0.1)
467
  continue
468
 
469
- # Skip if no speech activity
470
  if not self._is_speech_active():
471
  self.skipped_inferences += 1
472
  if not idle_logged:
@@ -485,7 +447,6 @@ class VoiceEmotionWorker:
485
  self.skipped_inferences += 1
486
  continue
487
 
488
- # Decay speech count
489
  self.speech_chunks_count = max(0, self.speech_chunks_count - 1)
490
 
491
  # Prepare audio clip
@@ -504,9 +465,8 @@ class VoiceEmotionWorker:
504
  if probs is not None:
505
  with self.lock:
506
  old_label = self.current_label
507
- old_conf = self.last_confidence
508
 
509
- # Smooth probabilities (70% old + 30% new)
510
  self.current_probs = 0.7 * self.current_probs + 0.3 * probs
511
  self.current_label = FUSE4[int(self.current_probs.argmax())]
512
  self.last_confidence = float(self.current_probs.max())
@@ -517,12 +477,12 @@ class VoiceEmotionWorker:
517
 
518
  # Log emotion changes
519
  if new_label != old_label:
520
- print(f"[VoiceEmotion] 🎭 Emotion: {old_label} → {new_label} (conf={self.last_confidence:.2f})")
521
 
522
  print("[VoiceEmotion] ⏹️ Inference loop stopped")
523
 
524
  def _start_inference_thread(self):
525
- """Start the background inference thread"""
526
  try:
527
  self.th = threading.Thread(target=self._infer_loop, daemon=True)
528
  self.th.start()
@@ -566,7 +526,6 @@ class VoiceEmotionWorker:
566
 
567
  mono = indata[:, 0] if indata.ndim > 1 else indata
568
 
569
- # Forward to Whisper if linked
570
  if self.whisper_worker is not None:
571
  try:
572
  self.whisper_worker.add_audio(mono.copy())
@@ -591,22 +550,12 @@ class VoiceEmotionWorker:
591
  pass
592
 
593
  def get_probs(self):
594
- """
595
- Get current emotion probabilities.
596
-
597
- Returns:
598
- (probs: np.array[4], label: str)
599
- """
600
  with self.lock:
601
  return self.current_probs.copy(), self.current_label
602
 
603
  def get_state(self):
604
- """
605
- Get full worker state for debugging.
606
-
607
- Returns:
608
- dict with running, speech_active, emotion info, stats
609
- """
610
  with self.lock:
611
  probs = self.current_probs.copy()
612
  label = self.current_label
 
5
  Model: iic/emotion2vec_plus_large
6
  Framework: FunASR
7
 
8
+ FIXED v2.3:
9
+ - Simplified: Downloads at runtime to /data (persistent)
10
+ - Suppressed verbose ModelScope download logs
11
+ - Clean logging output
 
12
  """
13
 
14
  import os
15
+ import sys
16
  import time
17
  import threading
18
  from collections import deque
19
  import numpy as np
20
  import webrtcvad
21
 
22
+ # Suppress verbose download logs BEFORE importing funasr
23
+ import logging
24
+ logging.getLogger('modelscope').setLevel(logging.WARNING)
25
+ logging.getLogger('funasr').setLevel(logging.WARNING)
26
+
27
  from ..config import AUDIO_SR, AUDIO_BLOCK, CLIP_SECONDS, VAD_AGGRESSIVENESS, FUSE4
28
 
29
  # ============================================================
30
+ # PERSISTENT CACHE - Use /data if available (survives rebuilds!)
31
  # ============================================================
32
+ CACHE_DIR = '/data/modelscope' if os.path.exists('/data') else '/home/user/.cache/modelscope'
33
+ os.makedirs(CACHE_DIR, exist_ok=True)
34
+
35
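+ # Set cache environment variables (NOTE: HF_HOME below becomes the parent of CACHE_DIR, e.g. /data, which differs from the /data/huggingface value set in the Dockerfile and start.sh)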
36
+ os.environ['MODELSCOPE_CACHE'] = CACHE_DIR
37
+ os.environ['MS_CACHE_HOME'] = CACHE_DIR
38
+ os.environ['HF_HOME'] = os.path.dirname(CACHE_DIR)
39
 
40
+ # Check if model is cached
41
+ def check_model_cached():
42
+ """Check if emotion2vec is already downloaded"""
43
+ model_paths = [
44
+ os.path.join(CACHE_DIR, 'models', 'iic', 'emotion2vec_plus_large'),
45
+ os.path.join(CACHE_DIR, 'hub', 'iic', 'emotion2vec_plus_large'),
46
  ]
47
 
48
+ for path in model_paths:
49
+ if os.path.exists(path):
50
+ model_file = os.path.join(path, 'model.pt')
51
+ if os.path.exists(model_file):
52
+ return True, path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
+ return False, None
 
 
55
 
56
+ is_cached, cache_path = check_model_cached()
 
 
 
57
 
58
+ if is_cached:
59
+ print(f"[VoiceEmotion] ✅ Model cached at: {cache_path}")
60
+ else:
61
+ print(f"[VoiceEmotion] 📥 First run - downloading to {CACHE_DIR} (~2 min)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  if CACHE_DIR == '/data/modelscope':
63
+ print(f"[VoiceEmotion] 💡 Using persistent storage - will only download once!")
 
 
 
 
 
 
 
 
 
64
 
65
 
66
  class VoiceEmotionWorker:
 
72
  - VAD-based silence optimization
73
  - Pause/resume for TTS coordination
74
  - Thread-safe inference
75
+ - PERSISTENT caching to /data (survives rebuilds)
 
76
  """
77
 
78
  # emotion2vec labels to our 4-class mapping
 
86
  'sad': 'Sad',
87
  'surprised': 'Neutral',
88
  'unknown': 'Neutral',
89
+ # Also handle different language variants
90
+ '生气/angry': 'Angry',
91
+ '厌恶/disgusted': 'Angry',
92
+ '恐惧/fearful': 'Sad',
93
+ '开心/happy': 'Happy',
94
+ '中立/neutral': 'Neutral',
95
+ '其他/other': 'Neutral',
96
+ '难过/sad': 'Sad',
97
+ '吃惊/surprised': 'Neutral',
98
+ # Uppercase variants
99
  'Angry': 'Angry',
100
  'Disgusted': 'Angry',
101
  'Fearful': 'Sad',
 
109
 
110
  def __init__(self, whisper_worker=None, device=None):
111
  print("\n[VoiceEmotion] Initializing emotion2vec+ Large...")
 
112
 
113
+ # Load emotion2vec model with suppressed logs
114
  try:
115
+ # Suppress ModelScope verbose output
116
+ old_stdout = sys.stdout
117
+ old_stderr = sys.stderr
118
+
119
+ # Only suppress if not cached
120
+ if not is_cached:
121
+ sys.stdout = open(os.devnull, 'w')
122
+ sys.stderr = open(os.devnull, 'w')
123
+
124
  from funasr import AutoModel
125
 
126
  load_start = time.time()
127
 
 
 
 
 
128
  self.model = AutoModel(
129
  model="iic/emotion2vec_plus_large",
130
+ disable_update=True,
131
+ hub="ms"
132
  )
133
 
134
+ # Restore output
135
+ if not is_cached:
136
+ sys.stdout.close()
137
+ sys.stderr.close()
138
+ sys.stdout = old_stdout
139
+ sys.stderr = old_stderr
140
+
141
  load_time = time.time() - load_start
 
142
 
143
+ if is_cached:
144
+ print(f"[VoiceEmotion] Loaded from cache in {load_time:.1f}s")
 
145
  else:
146
+ print(f"[VoiceEmotion] Downloaded + loaded in {load_time:.1f}s")
147
+ print(f"[VoiceEmotion] 🎉 Cached to {CACHE_DIR} - future runs will be instant!")
148
 
149
  print("[VoiceEmotion] Model: emotion2vec_plus_large (93% accuracy)")
150
 
 
153
 
154
  except ImportError:
155
  print("[VoiceEmotion] ❌ ERROR: funasr not installed")
156
+ print("[VoiceEmotion] Install with: pip install funasr")
157
  raise
158
  except Exception as e:
159
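+ # Restore output on error (NOTE: the ImportError branch above does not restore stdout/stderr, and the devnull handles opened earlier are not closed on this path)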
160
+ sys.stdout = old_stdout
161
+ sys.stderr = old_stderr
162
  print(f"[VoiceEmotion] ❌ ERROR: Failed to load model: {e}")
163
  import traceback
164
  traceback.print_exc()
 
208
  # Log configuration
209
  print(f"[VoiceEmotion] Configuration:")
210
  print(f"[VoiceEmotion] Sample rate: {AUDIO_SR} Hz")
 
211
  print(f"[VoiceEmotion] Clip length: {CLIP_SECONDS}s")
212
  print(f"[VoiceEmotion] Ring buffer: {int(CLIP_SECONDS / AUDIO_BLOCK)} chunks")
213
 
 
217
  print("[VoiceEmotion] Ready\n")
218
 
219
  def _verify_model(self):
220
+ """Verify model works with quick test inference"""
221
  try:
 
222
  test_audio = np.random.randn(AUDIO_SR).astype(np.float32) * 0.1
223
+
224
+ # Suppress test inference logs
225
+ old_stdout = sys.stdout
226
+ sys.stdout = open(os.devnull, 'w')
227
+
228
  result = self.model.generate(test_audio, granularity="utterance")
229
 
230
+ sys.stdout.close()
231
+ sys.stdout = old_stdout
232
+
233
  if result and len(result) > 0:
234
  scores = result[0].get("scores", [])
235
  labels = result[0].get("labels", [])
236
 
237
+ if scores and labels and len(set(scores)) > 1:
238
+ print(f"[VoiceEmotion] Model verified ({len(labels)} emotion classes)")
 
 
 
 
 
 
 
 
239
  else:
240
+ print("[VoiceEmotion] ⚠️ Model loaded but may have issues")
241
+
 
 
242
  except Exception as e:
243
+ sys.stdout = old_stdout
244
+ print(f"[VoiceEmotion] ⚠️ Verification failed: {e}")
245
 
246
  def pause_listening(self):
247
  """Pause audio processing (called when TTS starts)"""
 
253
 
254
  def resume_listening(self):
255
  """Resume audio processing (called when TTS finishes)"""
 
256
  whisper_cleared = 0
257
  if self.whisper_worker:
258
  try:
 
262
  except Exception as e:
263
  self._log_error(f"Error clearing Whisper buffer: {e}")
264
 
 
265
  emotion_cleared = len(self.ring)
266
  self.ring.clear()
267
  self.speech_chunks_count = 0
 
273
  print(f"[VoiceEmotion] ▶️ RESUMED (cleared {whisper_cleared + emotion_cleared} chunks)")
274
 
275
  def _log_error(self, message):
276
+ """Log errors with rate limiting"""
277
  current_time = time.time()
278
  if message != self.last_error_message or current_time - self.last_error_time > 5.0:
279
  print(f"[VoiceEmotion] ⚠️ {message}")
 
281
  self.last_error_time = current_time
282
 
283
  def add_audio(self, audio_data):
284
+ """Add audio data for processing"""
 
 
 
 
 
 
285
  if not self.running:
286
+ print("[VoiceEmotion] Auto-starting inference thread")
287
  self.running = True
288
  self._start_inference_thread()
289
 
 
292
  except Exception as e:
293
  self._log_error(f"Audio processing error: {e}")
294
 
295
+ process_external_audio = add_audio # Alias
 
296
 
297
  def _process_audio_chunk(self, mono_data):
298
  """Process incoming audio chunk with VAD"""
 
343
  return chunks
344
 
345
  def _run_inference(self, audio_clip):
346
+ """Run emotion2vec inference (with suppressed logs)"""
 
 
 
 
 
 
 
 
347
  try:
348
+ # Suppress inference logs
349
+ old_stdout = sys.stdout
350
+ sys.stdout = open(os.devnull, 'w')
351
+
352
  result = self.model.generate(audio_clip, granularity="utterance")
353
 
354
+ sys.stdout.close()
355
+ sys.stdout = old_stdout
356
+
357
  if not result or len(result) == 0:
358
  return None, "Neutral", 0.0
359
 
 
360
  scores = result[0].get("scores", [])
361
  labels = result[0].get("labels", [])
362
 
 
367
  probs_4class = np.zeros(len(FUSE4), dtype=np.float32)
368
 
369
  for label, score in zip(labels, scores):
370
+ # Try lowercase first, then original
371
  label_key = label.lower() if label.lower() in self.EMOTION_MAP else label
372
 
373
  if label_key in self.EMOTION_MAP:
 
382
  else:
383
  probs_4class = np.full(len(FUSE4), 0.25, dtype=np.float32)
384
 
 
385
  top_idx = int(np.argmax(probs_4class))
386
  top_emotion = FUSE4[top_idx]
387
  confidence = float(probs_4class[top_idx])
 
389
  return probs_4class, top_emotion, confidence
390
 
391
  except Exception as e:
392
+ sys.stdout = old_stdout
393
  self._log_error(f"Inference error: {e}")
394
  return None, "Neutral", 0.0
395
 
 
405
  loop_count += 1
406
  current_time = time.time()
407
 
408
+ # Adaptive interval
409
  min_interval = 0.5 if self._is_speech_active() else 1.0
410
  if current_time - last_inference_time < min_interval:
411
  time.sleep(0.05)
 
413
 
414
  last_inference_time = current_time
415
 
416
+ # Heartbeat (every ~10 seconds)
417
  if loop_count % 200 == 0:
418
  with self.lock:
419
  emotion = self.current_label
420
  conf = self.last_confidence
421
  total = self.inference_count + self.skipped_inferences
422
  efficiency = (self.inference_count / total * 100) if total > 0 else 0
423
+ print(f"[VoiceEmotion] 💓 Heartbeat: {emotion} ({conf:.2f}), efficiency={efficiency:.1f}%")
424
 
425
+ # Check pause
426
  with self.pause_lock:
427
  if self.paused:
428
  time.sleep(0.1)
429
  continue
430
 
431
+ # Skip if no speech
432
  if not self._is_speech_active():
433
  self.skipped_inferences += 1
434
  if not idle_logged:
 
447
  self.skipped_inferences += 1
448
  continue
449
 
 
450
  self.speech_chunks_count = max(0, self.speech_chunks_count - 1)
451
 
452
  # Prepare audio clip
 
465
  if probs is not None:
466
  with self.lock:
467
  old_label = self.current_label
 
468
 
469
+ # Smooth probabilities
470
  self.current_probs = 0.7 * self.current_probs + 0.3 * probs
471
  self.current_label = FUSE4[int(self.current_probs.argmax())]
472
  self.last_confidence = float(self.current_probs.max())
 
477
 
478
  # Log emotion changes
479
  if new_label != old_label:
480
+ print(f"[VoiceEmotion] 🎭 {old_label} → {new_label} (conf={self.last_confidence:.2f})")
481
 
482
  print("[VoiceEmotion] ⏹️ Inference loop stopped")
483
 
484
  def _start_inference_thread(self):
485
+ """Start background inference thread"""
486
  try:
487
  self.th = threading.Thread(target=self._infer_loop, daemon=True)
488
  self.th.start()
 
526
 
527
  mono = indata[:, 0] if indata.ndim > 1 else indata
528
 
 
529
  if self.whisper_worker is not None:
530
  try:
531
  self.whisper_worker.add_audio(mono.copy())
 
550
  pass
551
 
552
  def get_probs(self):
553
+ """Get current emotion probabilities"""
 
 
 
 
 
554
  with self.lock:
555
  return self.current_probs.copy(), self.current_label
556
 
557
  def get_state(self):
558
+ """Get worker state for debugging"""
 
 
 
 
 
559
  with self.lock:
560
  probs = self.current_probs.copy()
561
  label = self.current_label