michon commited on
Commit
cc5d3fd
·
1 Parent(s): 151d024

Update 2026-19-03 11:13

Browse files
avatar-frontend/app/app/page.tsx CHANGED
@@ -23,6 +23,23 @@ interface AssistantAudioPayload {
23
  audio_url: string;
24
  }
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  interface SpeechRecognitionAlternativeLike {
27
  transcript: string;
28
  }
@@ -533,6 +550,13 @@ export default function AppPage() {
533
 
534
  const audioContextRef = useRef<AudioContext | null>(null);
535
  const audioProcessorRef = useRef<ScriptProcessorNode | null>(null);
 
 
 
 
 
 
 
536
 
537
  const selectedVoiceRef = useRef(selectedVoice);
538
  const selectedPersonalityRef = useRef(selectedPersonality);
@@ -603,6 +627,29 @@ export default function AppPage() {
603
  }
604
  audioProcessorRef.current = null;
605
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
606
  if (wsRef.current) {
607
  try {
608
  wsRef.current.close(1000, "Session ended");
@@ -696,6 +743,104 @@ export default function AppPage() {
696
  }
697
  };
698
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
699
  function connectWebSocket() {
700
  const token = localStorage.getItem("mrrrme_token");
701
  if (!token) {
@@ -745,6 +890,24 @@ export default function AppPage() {
745
  },
746
  ]);
747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
748
  if (data.text_only || !data.audio_url) {
749
  setStatus("Response ready");
750
  setTimeout(() => {
@@ -770,6 +933,23 @@ export default function AppPage() {
770
  }
771
  }
772
  await playAssistantResponse(data);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
773
  }
774
  };
775
  ws.onerror = () => setStatus("Connection error");
@@ -948,6 +1128,9 @@ export default function AppPage() {
948
  async function playAssistantResponse(data: AssistantAudioPayload) {
949
  if (!audioRef.current) return;
950
 
 
 
 
951
  if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
952
  wsRef.current.send(JSON.stringify({ type: "tts_start" }));
953
  }
@@ -981,18 +1164,7 @@ export default function AppPage() {
981
  setStatus("Audio error");
982
  }
983
 
984
- setIsResponsePlaying(false);
985
- setStatus(isPausedRef.current ? "Paused" : "Listening...");
986
- shouldAutoRestartRef.current = true;
987
- if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
988
- wsRef.current.send(JSON.stringify({ type: "tts_end" }));
989
- }
990
- if (recognitionRef.current && !isPausedRef.current) {
991
- try {
992
- recognitionRef.current.start();
993
- } catch {
994
- }
995
- }
996
  }
997
 
998
  return (
 
23
  audio_url: string;
24
  }
25
 
26
+ interface AssistantStreamPayload {
27
+ stream_id: string;
28
+ audio_stream?: boolean;
29
+ text_only?: boolean;
30
+ audio_url?: string;
31
+ }
32
+
33
+ interface TtsStreamStartPayload {
34
+ stream_id: string;
35
+ sample_rate: number;
36
+ }
37
+
38
+ interface TtsStreamChunkPayload {
39
+ stream_id: string;
40
+ audio: string;
41
+ }
42
+
43
  interface SpeechRecognitionAlternativeLike {
44
  transcript: string;
45
  }
 
550
 
551
  const audioContextRef = useRef<AudioContext | null>(null);
552
  const audioProcessorRef = useRef<ScriptProcessorNode | null>(null);
553
+ const ttsPlaybackContextRef = useRef<AudioContext | null>(null);
554
+ const ttsPlaybackTimeRef = useRef(0);
555
+ const ttsPendingSourcesRef = useRef(0);
556
+ const ttsPlaybackEndedRef = useRef(false);
557
+ const ttsStreamIdRef = useRef<string | null>(null);
558
+ const ttsStreamSampleRateRef = useRef<number>(24000);
559
+ const ttsSourceNodesRef = useRef<AudioBufferSourceNode[]>([]);
560
 
561
  const selectedVoiceRef = useRef(selectedVoice);
562
  const selectedPersonalityRef = useRef(selectedPersonality);
 
627
  }
628
  audioProcessorRef.current = null;
629
  }
630
+ if (ttsPlaybackContextRef.current) {
631
+ try {
632
+ ttsPlaybackContextRef.current.close();
633
+ } catch {
634
+ }
635
+ ttsPlaybackContextRef.current = null;
636
+ }
637
+ ttsSourceNodesRef.current.forEach((source) => {
638
+ try {
639
+ source.stop();
640
+ } catch {
641
+ }
642
+ try {
643
+ source.disconnect();
644
+ } catch {
645
+ }
646
+ });
647
+ ttsSourceNodesRef.current = [];
648
+ ttsPlaybackTimeRef.current = 0;
649
+ ttsPendingSourcesRef.current = 0;
650
+ ttsPlaybackEndedRef.current = false;
651
+ ttsStreamIdRef.current = null;
652
+ ttsStreamSampleRateRef.current = 24000;
653
  if (wsRef.current) {
654
  try {
655
  wsRef.current.close(1000, "Session ended");
 
743
  }
744
  };
745
 
746
+ const finishAssistantPlayback = () => {
747
+ setIsResponsePlaying(false);
748
+ setStatus(isPausedRef.current ? "Paused" : "Listening...");
749
+ shouldAutoRestartRef.current = true;
750
+ if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
751
+ wsRef.current.send(JSON.stringify({ type: "tts_end" }));
752
+ }
753
+ if (recognitionRef.current && !isPausedRef.current) {
754
+ try {
755
+ recognitionRef.current.start();
756
+ } catch {
757
+ }
758
+ }
759
+ };
760
+
761
+ const resetStreamingPlayback = () => {
762
+ if (audioRef.current) {
763
+ audioRef.current.pause();
764
+ audioRef.current.removeAttribute("src");
765
+ audioRef.current.load();
766
+ }
767
+ ttsSourceNodesRef.current.forEach((source) => {
768
+ try {
769
+ source.stop();
770
+ } catch {
771
+ }
772
+ try {
773
+ source.disconnect();
774
+ } catch {
775
+ }
776
+ });
777
+ ttsSourceNodesRef.current = [];
778
+ ttsPlaybackTimeRef.current = 0;
779
+ ttsPendingSourcesRef.current = 0;
780
+ ttsPlaybackEndedRef.current = false;
781
+ ttsStreamSampleRateRef.current = 24000;
782
+ };
783
+
784
+ const ensureTtsPlaybackContext = async () => {
785
+ if (!ttsPlaybackContextRef.current) {
786
+ ttsPlaybackContextRef.current = new AudioContext();
787
+ }
788
+ if (ttsPlaybackContextRef.current.state === "suspended") {
789
+ await ttsPlaybackContextRef.current.resume();
790
+ }
791
+ return ttsPlaybackContextRef.current;
792
+ };
793
+
794
+ const finalizeStreamIfComplete = () => {
795
+ if (!ttsPlaybackEndedRef.current || ttsPendingSourcesRef.current > 0) return;
796
+ ttsStreamIdRef.current = null;
797
+ finishAssistantPlayback();
798
+ };
799
+
800
+ const startStreamingPlayback = async (data: TtsStreamStartPayload) => {
801
+ if (ttsStreamIdRef.current !== data.stream_id) return;
802
+ const playbackContext = await ensureTtsPlaybackContext();
803
+ ttsStreamSampleRateRef.current = data.sample_rate;
804
+ ttsPlaybackTimeRef.current = Math.max(ttsPlaybackTimeRef.current, playbackContext.currentTime + 0.08);
805
+ setStatus("Speaking...");
806
+ };
807
+
808
+ const enqueueStreamingAudioChunk = async (data: TtsStreamChunkPayload) => {
809
+ if (ttsStreamIdRef.current !== data.stream_id) return;
810
+ const playbackContext = await ensureTtsPlaybackContext();
811
+ const binary = atob(data.audio);
812
+ const bytes = Uint8Array.from(binary, (char) => char.charCodeAt(0));
813
+ const pcm = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
814
+ const floatSamples = new Float32Array(pcm.length);
815
+ for (let i = 0; i < pcm.length; i++) {
816
+ floatSamples[i] = pcm[i] / 32768;
817
+ }
818
+
819
+ const audioBuffer = playbackContext.createBuffer(1, floatSamples.length, ttsStreamSampleRateRef.current);
820
+ audioBuffer.copyToChannel(floatSamples, 0);
821
+
822
+ const source = playbackContext.createBufferSource();
823
+ source.buffer = audioBuffer;
824
+ source.connect(playbackContext.destination);
825
+
826
+ const startAt = Math.max(ttsPlaybackTimeRef.current, playbackContext.currentTime + 0.05);
827
+ ttsPlaybackTimeRef.current = startAt + audioBuffer.duration;
828
+ ttsPendingSourcesRef.current += 1;
829
+ ttsSourceNodesRef.current.push(source);
830
+
831
+ source.onended = () => {
832
+ ttsPendingSourcesRef.current = Math.max(0, ttsPendingSourcesRef.current - 1);
833
+ ttsSourceNodesRef.current = ttsSourceNodesRef.current.filter((node) => node !== source);
834
+ try {
835
+ source.disconnect();
836
+ } catch {
837
+ }
838
+ finalizeStreamIfComplete();
839
+ };
840
+
841
+ source.start(startAt);
842
+ };
843
+
844
  function connectWebSocket() {
845
  const token = localStorage.getItem("mrrrme_token");
846
  if (!token) {
 
890
  },
891
  ]);
892
 
893
+ if (data.audio_stream && data.stream_id) {
894
+ resetStreamingPlayback();
895
+ ttsStreamIdRef.current = (data as AssistantStreamPayload).stream_id;
896
+ setStatus("Preparing audio...");
897
+ setIsResponsePlaying(true);
898
+ shouldAutoRestartRef.current = false;
899
+ if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
900
+ wsRef.current.send(JSON.stringify({ type: "tts_start" }));
901
+ }
902
+ if (recognitionRef.current) {
903
+ try {
904
+ recognitionRef.current.stop();
905
+ } catch {
906
+ }
907
+ }
908
+ return;
909
+ }
910
+
911
  if (data.text_only || !data.audio_url) {
912
  setStatus("Response ready");
913
  setTimeout(() => {
 
933
  }
934
  }
935
  await playAssistantResponse(data);
936
+ } else if (data.type === "tts_stream_start") {
937
+ await startStreamingPlayback(data as TtsStreamStartPayload);
938
+ } else if (data.type === "tts_stream_chunk") {
939
+ await enqueueStreamingAudioChunk(data as TtsStreamChunkPayload);
940
+ } else if (data.type === "tts_stream_end") {
941
+ if (ttsStreamIdRef.current === data.stream_id) {
942
+ ttsPlaybackEndedRef.current = true;
943
+ finalizeStreamIfComplete();
944
+ }
945
+ } else if (data.type === "tts_stream_error") {
946
+ if (!ttsStreamIdRef.current || ttsStreamIdRef.current === data.stream_id) {
947
+ console.error("[Audio] Stream error:", data.message);
948
+ setStatus("Audio error");
949
+ ttsStreamIdRef.current = null;
950
+ resetStreamingPlayback();
951
+ finishAssistantPlayback();
952
+ }
953
  }
954
  };
955
  ws.onerror = () => setStatus("Connection error");
 
1128
  async function playAssistantResponse(data: AssistantAudioPayload) {
1129
  if (!audioRef.current) return;
1130
 
1131
+ resetStreamingPlayback();
1132
+ ttsStreamIdRef.current = null;
1133
+
1134
  if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
1135
  wsRef.current.send(JSON.stringify({ type: "tts_start" }));
1136
  }
 
1164
  setStatus("Audio error");
1165
  }
1166
 
1167
+ finishAssistantPlayback();
 
 
 
 
 
 
 
 
 
 
 
1168
  }
1169
 
1170
  return (
avatar/speak_server.py CHANGED
@@ -1,21 +1,19 @@
1
- """Avatar Backend - Qwen 3 TTS with STREAMING (Ultra-Fast 97ms latency!)"""
2
- import os
3
- import uuid
4
- import time
5
- import wave
6
- import subprocess
7
- import json as json_lib
8
- import asyncio
9
- from fastapi import FastAPI, Form, WebSocket
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from fastapi.staticfiles import StaticFiles
12
- from fastapi.responses import JSONResponse, StreamingResponse
13
- from pydub import AudioSegment
14
- from typing import List, Optional
15
- from dotenv import load_dotenv
16
- import torch
17
- import numpy as np
18
- import io
19
 
20
  load_dotenv()
21
 
@@ -133,20 +131,121 @@ async def websocket_endpoint(websocket: WebSocket):
133
  active_connections.remove(websocket)
134
 
135
 
136
- async def broadcast_to_avatars(data: dict):
137
- for connection in active_connections[:]:
138
- try:
139
- await connection.send_json(data)
140
- except:
141
- if connection in active_connections:
142
- active_connections.remove(connection)
143
-
144
-
145
- # ============ FAST TTS ENDPOINT ============
146
-
147
- @app.post("/speak")
148
- async def speak(text: str = Form(...), voice: str = Form("female"), language: str = Form("en")):
149
- """Generate TTS quickly with optimized settings"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
  # Quick model check
152
  if qwen_tts_model is None or not MODEL_READY:
@@ -156,78 +255,68 @@ async def speak(text: str = Form(...), voice: str = Form("female"), language: st
156
  content={"error": "TTS model still loading, please wait..."}
157
  )
158
 
159
- t_start = time.time()
160
- uid = uuid.uuid4().hex[:8]
161
- wav_path = os.path.join(OUT_DIR, f"{uid}.wav")
162
- mp3_path = os.path.join(OUT_DIR, f"{uid}.mp3")
163
-
164
- language_map = {
165
- "en": "English", "nl": "Auto", "zh": "Chinese", "ja": "Japanese",
166
- "ko": "Korean", "de": "German", "fr": "French", "ru": "Russian",
167
- "pt": "Portuguese", "es": "Spanish", "it": "Italian"
168
- }
169
-
170
- qwen_language = language_map.get(language, "English")
171
- speaker_name = VOICE_MAP.get(voice, "Serena")
172
-
173
- print(f"[TTS] 🎤 {speaker_name}: '{text[:40]}...'")
174
-
175
- try:
176
- # ✅ OPTIMIZED: Use faster generation with minimal settings
177
- with torch.no_grad(): # Disable gradients for speed
178
- wavs, sample_rate = qwen_tts_model.generate_custom_voice(
179
- text=text,
180
- language=qwen_language,
181
- speaker=speaker_name,
182
- # Performance optimizations:
183
- max_new_tokens=1024, # Limit length for speed
184
- # Note: Streaming mode would be even faster but requires different API
185
- )
186
-
187
- # Quick extraction
188
- audio_array = wavs[0] if isinstance(wavs, list) else wavs
189
- if isinstance(audio_array, torch.Tensor):
190
- audio_array = audio_array.cpu().numpy()
191
- audio_array = audio_array.squeeze()
192
-
193
- # Convert to int16
194
- if audio_array.max() <= 1.0:
195
- audio_array = (audio_array * 32767).astype(np.int16)
196
- else:
197
- audio_array = audio_array.astype(np.int16)
198
-
199
- # Save WAV (skip MP3 conversion for speed)
200
- import scipy.io.wavfile
201
- scipy.io.wavfile.write(wav_path, rate=sample_rate, data=audio_array)
202
-
203
- duration_sec = len(audio_array) / sample_rate
204
-
205
- t_gen = time.time() - t_start
206
- print(f"[TTS] ⚡ Generated in {t_gen:.2f}s ({duration_sec:.1f}s audio)")
207
-
208
- # Fast viseme generation (skip Rhubarb for speed)
209
- visemes = generate_visemes_fast(text, duration_sec)
210
-
211
- response_data = {
212
- "audio_url": f"/static/{os.path.basename(wav_path)}",
213
- "visemes": visemes,
214
- "duration": duration_sec,
215
- "text": text,
216
- "method": "fast_phoneme",
217
- "generation_time": round(t_gen, 2)
218
- }
219
-
220
- await broadcast_to_avatars(response_data)
221
-
222
- print(f"[TTS] ✅ Total: {time.time() - t_start:.2f}s\n")
223
-
224
- return response_data
225
-
226
- except Exception as e:
227
- print(f"[TTS] ❌ Error: {e}")
228
- import traceback
229
- traceback.print_exc()
230
- return JSONResponse(status_code=500, content={"error": str(e)})
231
 
232
 
233
  @app.get("/health")
@@ -261,4 +350,4 @@ if __name__ == "__main__":
261
  print("Optimization: torch.no_grad() + fast visemes")
262
  print("Expected latency: 1-3 seconds (vs 5-10s before)")
263
  print("="*60 + "\n")
264
- uvicorn.run(app, host="0.0.0.0", port=8765)
 
1
+ """Avatar Backend - Qwen 3 TTS with chunked PCM streaming."""
2
+ import base64
3
+ import os
4
+ import uuid
5
+ import time
6
+ import subprocess
7
+ import json as json_lib
8
+ import asyncio
9
+ from fastapi import FastAPI, Form, WebSocket
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from fastapi.staticfiles import StaticFiles
12
+ from fastapi.responses import JSONResponse, StreamingResponse
13
+ from typing import List, Optional
14
+ from dotenv import load_dotenv
15
+ import torch
16
+ import numpy as np
 
 
17
 
18
  load_dotenv()
19
 
 
131
  active_connections.remove(websocket)
132
 
133
 
134
+ async def broadcast_to_avatars(data: dict):
135
+ for connection in active_connections[:]:
136
+ try:
137
+ await connection.send_json(data)
138
+ except:
139
+ if connection in active_connections:
140
+ active_connections.remove(connection)
141
+
142
+
143
+ def synthesize_speech(
144
+ text: str,
145
+ voice: str,
146
+ language: str,
147
+ request_id: Optional[str] = None,
148
+ ) -> dict:
149
+ """Run TTS once and return reusable response metadata."""
150
+ t_start = time.time()
151
+ uid = request_id or uuid.uuid4().hex[:8]
152
+ wav_path = os.path.join(OUT_DIR, f"{uid}.wav")
153
+
154
+ language_map = {
155
+ "en": "English", "nl": "Auto", "zh": "Chinese", "ja": "Japanese",
156
+ "ko": "Korean", "de": "German", "fr": "French", "ru": "Russian",
157
+ "pt": "Portuguese", "es": "Spanish", "it": "Italian"
158
+ }
159
+
160
+ qwen_language = language_map.get(language, "English")
161
+ speaker_name = VOICE_MAP.get(voice, "Serena")
162
+
163
+ print(f"[TTS] 🎤 {speaker_name}: '{text[:40]}...'")
164
+
165
+ with torch.no_grad():
166
+ wavs, sample_rate = qwen_tts_model.generate_custom_voice(
167
+ text=text,
168
+ language=qwen_language,
169
+ speaker=speaker_name,
170
+ max_new_tokens=1024,
171
+ )
172
+
173
+ audio_array = wavs[0] if isinstance(wavs, list) else wavs
174
+ if isinstance(audio_array, torch.Tensor):
175
+ audio_array = audio_array.cpu().numpy()
176
+ audio_array = audio_array.squeeze()
177
+
178
+ if audio_array.max() <= 1.0:
179
+ audio_array = (audio_array * 32767).astype(np.int16)
180
+ else:
181
+ audio_array = audio_array.astype(np.int16)
182
+
183
+ import scipy.io.wavfile
184
+ scipy.io.wavfile.write(wav_path, rate=sample_rate, data=audio_array)
185
+
186
+ duration_sec = len(audio_array) / sample_rate
187
+ generation_time = time.time() - t_start
188
+ print(f"[TTS] ⚡ Generated in {generation_time:.2f}s ({duration_sec:.1f}s audio)")
189
+
190
+ visemes = generate_visemes_fast(text, duration_sec)
191
+
192
+ return {
193
+ "stream_id": uid,
194
+ "audio_array": audio_array,
195
+ "sample_rate": sample_rate,
196
+ "wav_path": wav_path,
197
+ "visemes": visemes,
198
+ "duration": duration_sec,
199
+ "text": text,
200
+ "method": "fast_phoneme",
201
+ "generation_time": round(generation_time, 2),
202
+ }
203
+
204
+
205
+ def iter_ndjson_stream(payload: dict, chunk_duration_ms: int = 120):
206
+ """Yield newline-delimited JSON stream frames with PCM16 audio chunks."""
207
+ audio_bytes = payload["audio_array"].tobytes()
208
+ sample_rate = int(payload["sample_rate"])
209
+ samples_per_chunk = max(1, int(sample_rate * (chunk_duration_ms / 1000.0)))
210
+ chunk_size_bytes = samples_per_chunk * 2 # mono PCM16
211
+
212
+ start_event = {
213
+ "type": "start",
214
+ "stream_id": payload["stream_id"],
215
+ "sample_rate": sample_rate,
216
+ "channels": 1,
217
+ "duration": payload["duration"],
218
+ "generation_time": payload["generation_time"],
219
+ }
220
+ yield f"{json_lib.dumps(start_event)}\n"
221
+
222
+ for offset in range(0, len(audio_bytes), chunk_size_bytes):
223
+ chunk = audio_bytes[offset: offset + chunk_size_bytes]
224
+ chunk_event = {
225
+ "type": "chunk",
226
+ "stream_id": payload["stream_id"],
227
+ "audio": base64.b64encode(chunk).decode("ascii"),
228
+ }
229
+ yield f"{json_lib.dumps(chunk_event)}\n"
230
+
231
+ end_event = {
232
+ "type": "end",
233
+ "stream_id": payload["stream_id"],
234
+ "duration": payload["duration"],
235
+ }
236
+ yield f"{json_lib.dumps(end_event)}\n"
237
+
238
+
239
+ # ============ FAST TTS ENDPOINTS ============
240
+
241
+ @app.post("/speak")
242
+ async def speak(
243
+ text: str = Form(...),
244
+ voice: str = Form("female"),
245
+ language: str = Form("en"),
246
+ request_id: Optional[str] = Form(None),
247
+ ):
248
+ """Generate TTS quickly with optimized settings"""
249
 
250
  # Quick model check
251
  if qwen_tts_model is None or not MODEL_READY:
 
255
  content={"error": "TTS model still loading, please wait..."}
256
  )
257
 
258
+ try:
259
+ payload = synthesize_speech(text=text, voice=voice, language=language, request_id=request_id)
260
+ response_data = {
261
+ "stream_id": payload["stream_id"],
262
+ "audio_url": f"/static/{os.path.basename(payload['wav_path'])}",
263
+ "visemes": payload["visemes"],
264
+ "duration": payload["duration"],
265
+ "text": payload["text"],
266
+ "method": payload["method"],
267
+ "generation_time": payload["generation_time"],
268
+ }
269
+
270
+ await broadcast_to_avatars(response_data)
271
+
272
+ print(f"[TTS] Total: {payload['generation_time']:.2f}s\n")
273
+
274
+ return response_data
275
+
276
+ except Exception as e:
277
+ print(f"[TTS] Error: {e}")
278
+ import traceback
279
+ traceback.print_exc()
280
+ return JSONResponse(status_code=500, content={"error": str(e)})
281
+
282
+
283
+ @app.post("/speak_stream")
284
+ async def speak_stream(
285
+ text: str = Form(...),
286
+ voice: str = Form("female"),
287
+ language: str = Form("en"),
288
+ request_id: Optional[str] = Form(None),
289
+ ):
290
+ """Stream synthesized PCM chunks as newline-delimited JSON."""
291
+ if qwen_tts_model is None or not MODEL_READY:
292
+ return JSONResponse(
293
+ status_code=503,
294
+ content={"error": "TTS model still loading, please wait..."},
295
+ )
296
+
297
+ try:
298
+ payload = synthesize_speech(text=text, voice=voice, language=language, request_id=request_id)
299
+ response_data = {
300
+ "stream_id": payload["stream_id"],
301
+ "audio_url": f"/static/{os.path.basename(payload['wav_path'])}",
302
+ "visemes": payload["visemes"],
303
+ "duration": payload["duration"],
304
+ "text": payload["text"],
305
+ "method": payload["method"],
306
+ "generation_time": payload["generation_time"],
307
+ }
308
+ await broadcast_to_avatars(response_data)
309
+
310
+ return StreamingResponse(
311
+ iter_ndjson_stream(payload),
312
+ media_type="application/x-ndjson",
313
+ headers={"Cache-Control": "no-cache"},
314
+ )
315
+ except Exception as e:
316
+ print(f"[TTS] Stream error: {e}")
317
+ import traceback
318
+ traceback.print_exc()
319
+ return JSONResponse(status_code=500, content={"error": str(e)})
 
 
 
 
 
 
 
 
 
 
320
 
321
 
322
  @app.get("/health")
 
350
  print("Optimization: torch.no_grad() + fast visemes")
351
  print("Expected latency: 1-3 seconds (vs 5-10s before)")
352
  print("="*60 + "\n")
353
+ uvicorn.run(app, host="0.0.0.0", port=8765)
mrrrme/backend/processing/speech.py CHANGED
@@ -20,6 +20,7 @@ import requests
20
  import numpy as np
21
  import secrets
22
  import json
 
23
  from datetime import datetime
24
  from typing import Optional, Dict
25
  from ..models.loader import get_models
@@ -71,6 +72,86 @@ def filter_transcription(transcription: str) -> tuple:
71
  return True, None
72
 
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  def save_emotion_prediction(
75
  message_id: str,
76
  session_id: str,
@@ -533,40 +614,34 @@ async def process_speech_end(
533
 
534
  save_message(session_id, "assistant", response_text, fused_emotion, user_id)
535
 
536
- # ========== SEND TO AVATAR ==========
 
 
 
 
 
 
 
 
 
 
 
 
 
537
  try:
538
- avatar_response = requests.post(
539
- f"{AVATAR_API}/speak",
540
- data={
541
- "text": response_text,
542
- "voice": user_preferences.get("voice", "female"),
543
- "language": user_preferences.get("language", "en")
544
- },
545
- timeout=45
546
  )
547
- avatar_response.raise_for_status()
548
- avatar_data = avatar_response.json()
549
-
550
- await websocket.send_json({
551
- "type": "llm_response",
552
- "text": response_text,
553
- "emotion": fused_emotion,
554
- "intensity": intensity,
555
- "is_masking": is_masking,
556
- "masking_type": masking_type,
557
- "audio_url": avatar_data.get("audio_url"),
558
- "visemes": avatar_data.get("visemes"),
559
- "frames_captured": frame_count # NEW: Include frame count
560
- })
561
  except Exception as avatar_err:
562
  safe_print(f"[TTS] Error: {avatar_err}")
563
  await websocket.send_json({
564
- "type": "llm_response",
565
- "text": response_text,
566
- "emotion": fused_emotion,
567
- "intensity": intensity,
568
- "is_masking": is_masking,
569
- "text_only": True
570
  })
571
 
572
  safe_print(f"[Pipeline] Complete for {username}")
@@ -575,4 +650,4 @@ async def process_speech_end(
575
  except Exception as e:
576
  safe_print(f"[Pipeline] Error: {e}")
577
  import traceback
578
- traceback.print_exc()
 
20
  import numpy as np
21
  import secrets
22
  import json
23
+ import asyncio
24
  from datetime import datetime
25
  from typing import Optional, Dict
26
  from ..models.loader import get_models
 
72
  return True, None
73
 
74
 
75
+ async def stream_tts_audio_to_websocket(
76
+ websocket,
77
+ text: str,
78
+ voice: str,
79
+ language: str,
80
+ stream_id: str,
81
+ ) -> None:
82
+ """Proxy chunked PCM audio from the TTS service to the browser websocket."""
83
+ loop = asyncio.get_running_loop()
84
+ event_queue: asyncio.Queue[tuple[str, Optional[dict], Optional[str]]] = asyncio.Queue()
85
+
86
+ def read_stream() -> None:
87
+ try:
88
+ with requests.post(
89
+ f"{AVATAR_API}/speak_stream",
90
+ data={
91
+ "text": text,
92
+ "voice": voice,
93
+ "language": language,
94
+ "request_id": stream_id,
95
+ },
96
+ stream=True,
97
+ timeout=(10, 120),
98
+ ) as response:
99
+ response.raise_for_status()
100
+ for raw_line in response.iter_lines(decode_unicode=True):
101
+ if not raw_line:
102
+ continue
103
+ event = json.loads(raw_line)
104
+ asyncio.run_coroutine_threadsafe(
105
+ event_queue.put(("event", event, None)),
106
+ loop,
107
+ ).result()
108
+ except Exception as exc:
109
+ asyncio.run_coroutine_threadsafe(
110
+ event_queue.put(("error", None, str(exc))),
111
+ loop,
112
+ ).result()
113
+ finally:
114
+ asyncio.run_coroutine_threadsafe(
115
+ event_queue.put(("done", None, None)),
116
+ loop,
117
+ ).result()
118
+
119
+ reader_task = asyncio.create_task(asyncio.to_thread(read_stream))
120
+
121
+ try:
122
+ while True:
123
+ event_type, event, error_message = await event_queue.get()
124
+ if event_type == "event" and event is not None:
125
+ upstream_type = event.get("type")
126
+ if upstream_type == "start":
127
+ await websocket.send_json({
128
+ "type": "tts_stream_start",
129
+ "stream_id": stream_id,
130
+ "sample_rate": event.get("sample_rate"),
131
+ "channels": event.get("channels", 1),
132
+ "duration": event.get("duration"),
133
+ "generation_time": event.get("generation_time"),
134
+ })
135
+ elif upstream_type == "chunk":
136
+ await websocket.send_json({
137
+ "type": "tts_stream_chunk",
138
+ "stream_id": stream_id,
139
+ "audio": event.get("audio"),
140
+ })
141
+ elif upstream_type == "end":
142
+ await websocket.send_json({
143
+ "type": "tts_stream_end",
144
+ "stream_id": stream_id,
145
+ "duration": event.get("duration"),
146
+ })
147
+ elif event_type == "error":
148
+ raise RuntimeError(error_message or "Unknown TTS stream error")
149
+ elif event_type == "done":
150
+ break
151
+ finally:
152
+ await reader_task
153
+
154
+
155
  def save_emotion_prediction(
156
  message_id: str,
157
  session_id: str,
 
614
 
615
  save_message(session_id, "assistant", response_text, fused_emotion, user_id)
616
 
617
+ # ========== SEND TEXT IMMEDIATELY, THEN STREAM TTS ==========
618
+ stream_id = secrets.token_urlsafe(8)
619
+ await websocket.send_json({
620
+ "type": "llm_response",
621
+ "text": response_text,
622
+ "emotion": fused_emotion,
623
+ "intensity": intensity,
624
+ "is_masking": is_masking,
625
+ "masking_type": masking_type,
626
+ "frames_captured": frame_count,
627
+ "audio_stream": True,
628
+ "stream_id": stream_id,
629
+ })
630
+
631
  try:
632
+ await stream_tts_audio_to_websocket(
633
+ websocket=websocket,
634
+ text=response_text,
635
+ voice=user_preferences.get("voice", "female"),
636
+ language=user_preferences.get("language", "en"),
637
+ stream_id=stream_id,
 
 
638
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
639
  except Exception as avatar_err:
640
  safe_print(f"[TTS] Error: {avatar_err}")
641
  await websocket.send_json({
642
+ "type": "tts_stream_error",
643
+ "stream_id": stream_id,
644
+ "message": str(avatar_err),
 
 
 
645
  })
646
 
647
  safe_print(f"[Pipeline] Complete for {username}")
 
650
  except Exception as e:
651
  safe_print(f"[Pipeline] Error: {e}")
652
  import traceback
653
+ traceback.print_exc()
mrrrme/backend/websocket.py CHANGED
@@ -16,19 +16,18 @@ from fastapi import WebSocket, WebSocketDisconnect
16
  from starlette.websockets import WebSocketState
17
  import asyncio
18
  import base64
 
19
  import numpy as np
20
  import cv2
21
  import io
22
  from PIL import Image
23
- import requests
24
 
25
  from . import models as models_module
26
  from .session.manager import validate_token, save_message, load_user_history
27
  from .session.summary import generate_session_summary
28
  from .auth.database import get_db_connection
29
- from .utils.helpers import get_avatar_api_url
30
  from .config import GREETINGS
31
- from .processing.speech import process_speech_end
32
  from .processing.face_emotion_aggregator import FaceEmotionAggregator
33
 
34
  # Import multi-frame capture
@@ -39,8 +38,6 @@ except ImportError:
39
  HAS_FRAME_BUFFER = False
40
  print("[WebSocket] FrameBuffer not available - using single frame capture")
41
 
42
- AVATAR_API = get_avatar_api_url()
43
-
44
  async def websocket_endpoint(websocket: WebSocket):
45
  """Main WebSocket endpoint handler with multi-frame capture"""
46
 
@@ -200,36 +197,34 @@ async def websocket_endpoint(websocket: WebSocket):
200
  try:
201
  lang = user_preferences.get("language", "en")
202
  greeting_text = GREETINGS[lang]["returning" if user_summary else "new"].format(username=username)
203
-
204
- audio_url, visemes = None, None
205
- try:
206
- resp = requests.post(f"{AVATAR_API}/speak", data={
207
- "text": greeting_text,
208
- "voice": user_preferences.get("voice", "female"),
209
- "language": lang
210
- }, timeout=10)
211
- if resp.status_code == 200:
212
- avatar_data = resp.json()
213
- audio_url = avatar_data.get("audio_url")
214
- visemes = avatar_data.get("visemes")
215
- except:
216
- pass
217
-
218
  response_data = {
219
  "type": "llm_response",
220
  "text": greeting_text,
221
  "emotion": "Neutral",
222
  "intensity": 0.5,
223
- "is_greeting": True
 
 
224
  }
225
- if audio_url and visemes:
226
- response_data["audio_url"] = audio_url
227
- response_data["visemes"] = visemes
228
- else:
229
- response_data["text_only"] = True
230
-
231
  await websocket.send_json(response_data)
232
  save_message(session_id, "assistant", greeting_text, "Neutral")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  except Exception as err:
234
  print(f"[Greeting] Error: {err}", flush=True)
235
 
@@ -392,4 +387,4 @@ async def websocket_endpoint(websocket: WebSocket):
392
  print(f"[Summary] Error: {e}", flush=True)
393
 
394
  print(f"[WebSocket] Closed for {username or 'Unknown'}", flush=True)
395
- print("="*80 + "\n", flush=True)
 
16
  from starlette.websockets import WebSocketState
17
  import asyncio
18
  import base64
19
+ import secrets
20
  import numpy as np
21
  import cv2
22
  import io
23
  from PIL import Image
 
24
 
25
  from . import models as models_module
26
  from .session.manager import validate_token, save_message, load_user_history
27
  from .session.summary import generate_session_summary
28
  from .auth.database import get_db_connection
 
29
  from .config import GREETINGS
30
+ from .processing.speech import process_speech_end, stream_tts_audio_to_websocket
31
  from .processing.face_emotion_aggregator import FaceEmotionAggregator
32
 
33
  # Import multi-frame capture
 
38
  HAS_FRAME_BUFFER = False
39
  print("[WebSocket] FrameBuffer not available - using single frame capture")
40
 
 
 
41
  async def websocket_endpoint(websocket: WebSocket):
42
  """Main WebSocket endpoint handler with multi-frame capture"""
43
 
 
197
  try:
198
  lang = user_preferences.get("language", "en")
199
  greeting_text = GREETINGS[lang]["returning" if user_summary else "new"].format(username=username)
200
+ stream_id = secrets.token_urlsafe(8)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  response_data = {
202
  "type": "llm_response",
203
  "text": greeting_text,
204
  "emotion": "Neutral",
205
  "intensity": 0.5,
206
+ "is_greeting": True,
207
+ "audio_stream": True,
208
+ "stream_id": stream_id,
209
  }
 
 
 
 
 
 
210
  await websocket.send_json(response_data)
211
  save_message(session_id, "assistant", greeting_text, "Neutral")
212
+
213
+ try:
214
+ await stream_tts_audio_to_websocket(
215
+ websocket=websocket,
216
+ text=greeting_text,
217
+ voice=user_preferences.get("voice", "female"),
218
+ language=lang,
219
+ stream_id=stream_id,
220
+ )
221
+ except Exception as tts_err:
222
+ print(f"[Greeting] TTS stream error: {tts_err}", flush=True)
223
+ await websocket.send_json({
224
+ "type": "tts_stream_error",
225
+ "stream_id": stream_id,
226
+ "message": str(tts_err),
227
+ })
228
  except Exception as err:
229
  print(f"[Greeting] Error: {err}", flush=True)
230
 
 
387
  print(f"[Summary] Error: {e}", flush=True)
388
 
389
  print(f"[WebSocket] Closed for {username or 'Unknown'}", flush=True)
390
+ print("="*80 + "\n", flush=True)