Spaces:
Paused
Paused
Update 2026-19-03 11:13
Browse files- avatar-frontend/app/app/page.tsx +184 -12
- avatar/speak_server.py +192 -103
- mrrrme/backend/processing/speech.py +105 -30
- mrrrme/backend/websocket.py +23 -28
avatar-frontend/app/app/page.tsx
CHANGED
|
@@ -23,6 +23,23 @@ interface AssistantAudioPayload {
|
|
| 23 |
audio_url: string;
|
| 24 |
}
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
interface SpeechRecognitionAlternativeLike {
|
| 27 |
transcript: string;
|
| 28 |
}
|
|
@@ -533,6 +550,13 @@ export default function AppPage() {
|
|
| 533 |
|
| 534 |
const audioContextRef = useRef<AudioContext | null>(null);
|
| 535 |
const audioProcessorRef = useRef<ScriptProcessorNode | null>(null);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 536 |
|
| 537 |
const selectedVoiceRef = useRef(selectedVoice);
|
| 538 |
const selectedPersonalityRef = useRef(selectedPersonality);
|
|
@@ -603,6 +627,29 @@ export default function AppPage() {
|
|
| 603 |
}
|
| 604 |
audioProcessorRef.current = null;
|
| 605 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 606 |
if (wsRef.current) {
|
| 607 |
try {
|
| 608 |
wsRef.current.close(1000, "Session ended");
|
|
@@ -696,6 +743,104 @@ export default function AppPage() {
|
|
| 696 |
}
|
| 697 |
};
|
| 698 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 699 |
function connectWebSocket() {
|
| 700 |
const token = localStorage.getItem("mrrrme_token");
|
| 701 |
if (!token) {
|
|
@@ -745,6 +890,24 @@ export default function AppPage() {
|
|
| 745 |
},
|
| 746 |
]);
|
| 747 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 748 |
if (data.text_only || !data.audio_url) {
|
| 749 |
setStatus("Response ready");
|
| 750 |
setTimeout(() => {
|
|
@@ -770,6 +933,23 @@ export default function AppPage() {
|
|
| 770 |
}
|
| 771 |
}
|
| 772 |
await playAssistantResponse(data);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 773 |
}
|
| 774 |
};
|
| 775 |
ws.onerror = () => setStatus("Connection error");
|
|
@@ -948,6 +1128,9 @@ export default function AppPage() {
|
|
| 948 |
async function playAssistantResponse(data: AssistantAudioPayload) {
|
| 949 |
if (!audioRef.current) return;
|
| 950 |
|
|
|
|
|
|
|
|
|
|
| 951 |
if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
|
| 952 |
wsRef.current.send(JSON.stringify({ type: "tts_start" }));
|
| 953 |
}
|
|
@@ -981,18 +1164,7 @@ export default function AppPage() {
|
|
| 981 |
setStatus("Audio error");
|
| 982 |
}
|
| 983 |
|
| 984 |
-
|
| 985 |
-
setStatus(isPausedRef.current ? "Paused" : "Listening...");
|
| 986 |
-
shouldAutoRestartRef.current = true;
|
| 987 |
-
if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
|
| 988 |
-
wsRef.current.send(JSON.stringify({ type: "tts_end" }));
|
| 989 |
-
}
|
| 990 |
-
if (recognitionRef.current && !isPausedRef.current) {
|
| 991 |
-
try {
|
| 992 |
-
recognitionRef.current.start();
|
| 993 |
-
} catch {
|
| 994 |
-
}
|
| 995 |
-
}
|
| 996 |
}
|
| 997 |
|
| 998 |
return (
|
|
|
|
| 23 |
audio_url: string;
|
| 24 |
}
|
| 25 |
|
| 26 |
+
interface AssistantStreamPayload {
|
| 27 |
+
stream_id: string;
|
| 28 |
+
audio_stream?: boolean;
|
| 29 |
+
text_only?: boolean;
|
| 30 |
+
audio_url?: string;
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
interface TtsStreamStartPayload {
|
| 34 |
+
stream_id: string;
|
| 35 |
+
sample_rate: number;
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
interface TtsStreamChunkPayload {
|
| 39 |
+
stream_id: string;
|
| 40 |
+
audio: string;
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
interface SpeechRecognitionAlternativeLike {
|
| 44 |
transcript: string;
|
| 45 |
}
|
|
|
|
| 550 |
|
| 551 |
const audioContextRef = useRef<AudioContext | null>(null);
|
| 552 |
const audioProcessorRef = useRef<ScriptProcessorNode | null>(null);
|
| 553 |
+
const ttsPlaybackContextRef = useRef<AudioContext | null>(null);
|
| 554 |
+
const ttsPlaybackTimeRef = useRef(0);
|
| 555 |
+
const ttsPendingSourcesRef = useRef(0);
|
| 556 |
+
const ttsPlaybackEndedRef = useRef(false);
|
| 557 |
+
const ttsStreamIdRef = useRef<string | null>(null);
|
| 558 |
+
const ttsStreamSampleRateRef = useRef<number>(24000);
|
| 559 |
+
const ttsSourceNodesRef = useRef<AudioBufferSourceNode[]>([]);
|
| 560 |
|
| 561 |
const selectedVoiceRef = useRef(selectedVoice);
|
| 562 |
const selectedPersonalityRef = useRef(selectedPersonality);
|
|
|
|
| 627 |
}
|
| 628 |
audioProcessorRef.current = null;
|
| 629 |
}
|
| 630 |
+
if (ttsPlaybackContextRef.current) {
|
| 631 |
+
try {
|
| 632 |
+
ttsPlaybackContextRef.current.close();
|
| 633 |
+
} catch {
|
| 634 |
+
}
|
| 635 |
+
ttsPlaybackContextRef.current = null;
|
| 636 |
+
}
|
| 637 |
+
ttsSourceNodesRef.current.forEach((source) => {
|
| 638 |
+
try {
|
| 639 |
+
source.stop();
|
| 640 |
+
} catch {
|
| 641 |
+
}
|
| 642 |
+
try {
|
| 643 |
+
source.disconnect();
|
| 644 |
+
} catch {
|
| 645 |
+
}
|
| 646 |
+
});
|
| 647 |
+
ttsSourceNodesRef.current = [];
|
| 648 |
+
ttsPlaybackTimeRef.current = 0;
|
| 649 |
+
ttsPendingSourcesRef.current = 0;
|
| 650 |
+
ttsPlaybackEndedRef.current = false;
|
| 651 |
+
ttsStreamIdRef.current = null;
|
| 652 |
+
ttsStreamSampleRateRef.current = 24000;
|
| 653 |
if (wsRef.current) {
|
| 654 |
try {
|
| 655 |
wsRef.current.close(1000, "Session ended");
|
|
|
|
| 743 |
}
|
| 744 |
};
|
| 745 |
|
| 746 |
+
const finishAssistantPlayback = () => {
|
| 747 |
+
setIsResponsePlaying(false);
|
| 748 |
+
setStatus(isPausedRef.current ? "Paused" : "Listening...");
|
| 749 |
+
shouldAutoRestartRef.current = true;
|
| 750 |
+
if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
|
| 751 |
+
wsRef.current.send(JSON.stringify({ type: "tts_end" }));
|
| 752 |
+
}
|
| 753 |
+
if (recognitionRef.current && !isPausedRef.current) {
|
| 754 |
+
try {
|
| 755 |
+
recognitionRef.current.start();
|
| 756 |
+
} catch {
|
| 757 |
+
}
|
| 758 |
+
}
|
| 759 |
+
};
|
| 760 |
+
|
| 761 |
+
const resetStreamingPlayback = () => {
|
| 762 |
+
if (audioRef.current) {
|
| 763 |
+
audioRef.current.pause();
|
| 764 |
+
audioRef.current.removeAttribute("src");
|
| 765 |
+
audioRef.current.load();
|
| 766 |
+
}
|
| 767 |
+
ttsSourceNodesRef.current.forEach((source) => {
|
| 768 |
+
try {
|
| 769 |
+
source.stop();
|
| 770 |
+
} catch {
|
| 771 |
+
}
|
| 772 |
+
try {
|
| 773 |
+
source.disconnect();
|
| 774 |
+
} catch {
|
| 775 |
+
}
|
| 776 |
+
});
|
| 777 |
+
ttsSourceNodesRef.current = [];
|
| 778 |
+
ttsPlaybackTimeRef.current = 0;
|
| 779 |
+
ttsPendingSourcesRef.current = 0;
|
| 780 |
+
ttsPlaybackEndedRef.current = false;
|
| 781 |
+
ttsStreamSampleRateRef.current = 24000;
|
| 782 |
+
};
|
| 783 |
+
|
| 784 |
+
const ensureTtsPlaybackContext = async () => {
|
| 785 |
+
if (!ttsPlaybackContextRef.current) {
|
| 786 |
+
ttsPlaybackContextRef.current = new AudioContext();
|
| 787 |
+
}
|
| 788 |
+
if (ttsPlaybackContextRef.current.state === "suspended") {
|
| 789 |
+
await ttsPlaybackContextRef.current.resume();
|
| 790 |
+
}
|
| 791 |
+
return ttsPlaybackContextRef.current;
|
| 792 |
+
};
|
| 793 |
+
|
| 794 |
+
const finalizeStreamIfComplete = () => {
|
| 795 |
+
if (!ttsPlaybackEndedRef.current || ttsPendingSourcesRef.current > 0) return;
|
| 796 |
+
ttsStreamIdRef.current = null;
|
| 797 |
+
finishAssistantPlayback();
|
| 798 |
+
};
|
| 799 |
+
|
| 800 |
+
const startStreamingPlayback = async (data: TtsStreamStartPayload) => {
|
| 801 |
+
if (ttsStreamIdRef.current !== data.stream_id) return;
|
| 802 |
+
const playbackContext = await ensureTtsPlaybackContext();
|
| 803 |
+
ttsStreamSampleRateRef.current = data.sample_rate;
|
| 804 |
+
ttsPlaybackTimeRef.current = Math.max(ttsPlaybackTimeRef.current, playbackContext.currentTime + 0.08);
|
| 805 |
+
setStatus("Speaking...");
|
| 806 |
+
};
|
| 807 |
+
|
| 808 |
+
const enqueueStreamingAudioChunk = async (data: TtsStreamChunkPayload) => {
|
| 809 |
+
if (ttsStreamIdRef.current !== data.stream_id) return;
|
| 810 |
+
const playbackContext = await ensureTtsPlaybackContext();
|
| 811 |
+
const binary = atob(data.audio);
|
| 812 |
+
const bytes = Uint8Array.from(binary, (char) => char.charCodeAt(0));
|
| 813 |
+
const pcm = new Int16Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / 2));
|
| 814 |
+
const floatSamples = new Float32Array(pcm.length);
|
| 815 |
+
for (let i = 0; i < pcm.length; i++) {
|
| 816 |
+
floatSamples[i] = pcm[i] / 32768;
|
| 817 |
+
}
|
| 818 |
+
|
| 819 |
+
const audioBuffer = playbackContext.createBuffer(1, floatSamples.length, ttsStreamSampleRateRef.current);
|
| 820 |
+
audioBuffer.copyToChannel(floatSamples, 0);
|
| 821 |
+
|
| 822 |
+
const source = playbackContext.createBufferSource();
|
| 823 |
+
source.buffer = audioBuffer;
|
| 824 |
+
source.connect(playbackContext.destination);
|
| 825 |
+
|
| 826 |
+
const startAt = Math.max(ttsPlaybackTimeRef.current, playbackContext.currentTime + 0.05);
|
| 827 |
+
ttsPlaybackTimeRef.current = startAt + audioBuffer.duration;
|
| 828 |
+
ttsPendingSourcesRef.current += 1;
|
| 829 |
+
ttsSourceNodesRef.current.push(source);
|
| 830 |
+
|
| 831 |
+
source.onended = () => {
|
| 832 |
+
ttsPendingSourcesRef.current = Math.max(0, ttsPendingSourcesRef.current - 1);
|
| 833 |
+
ttsSourceNodesRef.current = ttsSourceNodesRef.current.filter((node) => node !== source);
|
| 834 |
+
try {
|
| 835 |
+
source.disconnect();
|
| 836 |
+
} catch {
|
| 837 |
+
}
|
| 838 |
+
finalizeStreamIfComplete();
|
| 839 |
+
};
|
| 840 |
+
|
| 841 |
+
source.start(startAt);
|
| 842 |
+
};
|
| 843 |
+
|
| 844 |
function connectWebSocket() {
|
| 845 |
const token = localStorage.getItem("mrrrme_token");
|
| 846 |
if (!token) {
|
|
|
|
| 890 |
},
|
| 891 |
]);
|
| 892 |
|
| 893 |
+
if (data.audio_stream && data.stream_id) {
|
| 894 |
+
resetStreamingPlayback();
|
| 895 |
+
ttsStreamIdRef.current = (data as AssistantStreamPayload).stream_id;
|
| 896 |
+
setStatus("Preparing audio...");
|
| 897 |
+
setIsResponsePlaying(true);
|
| 898 |
+
shouldAutoRestartRef.current = false;
|
| 899 |
+
if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
|
| 900 |
+
wsRef.current.send(JSON.stringify({ type: "tts_start" }));
|
| 901 |
+
}
|
| 902 |
+
if (recognitionRef.current) {
|
| 903 |
+
try {
|
| 904 |
+
recognitionRef.current.stop();
|
| 905 |
+
} catch {
|
| 906 |
+
}
|
| 907 |
+
}
|
| 908 |
+
return;
|
| 909 |
+
}
|
| 910 |
+
|
| 911 |
if (data.text_only || !data.audio_url) {
|
| 912 |
setStatus("Response ready");
|
| 913 |
setTimeout(() => {
|
|
|
|
| 933 |
}
|
| 934 |
}
|
| 935 |
await playAssistantResponse(data);
|
| 936 |
+
} else if (data.type === "tts_stream_start") {
|
| 937 |
+
await startStreamingPlayback(data as TtsStreamStartPayload);
|
| 938 |
+
} else if (data.type === "tts_stream_chunk") {
|
| 939 |
+
await enqueueStreamingAudioChunk(data as TtsStreamChunkPayload);
|
| 940 |
+
} else if (data.type === "tts_stream_end") {
|
| 941 |
+
if (ttsStreamIdRef.current === data.stream_id) {
|
| 942 |
+
ttsPlaybackEndedRef.current = true;
|
| 943 |
+
finalizeStreamIfComplete();
|
| 944 |
+
}
|
| 945 |
+
} else if (data.type === "tts_stream_error") {
|
| 946 |
+
if (!ttsStreamIdRef.current || ttsStreamIdRef.current === data.stream_id) {
|
| 947 |
+
console.error("[Audio] Stream error:", data.message);
|
| 948 |
+
setStatus("Audio error");
|
| 949 |
+
ttsStreamIdRef.current = null;
|
| 950 |
+
resetStreamingPlayback();
|
| 951 |
+
finishAssistantPlayback();
|
| 952 |
+
}
|
| 953 |
}
|
| 954 |
};
|
| 955 |
ws.onerror = () => setStatus("Connection error");
|
|
|
|
| 1128 |
async function playAssistantResponse(data: AssistantAudioPayload) {
|
| 1129 |
if (!audioRef.current) return;
|
| 1130 |
|
| 1131 |
+
resetStreamingPlayback();
|
| 1132 |
+
ttsStreamIdRef.current = null;
|
| 1133 |
+
|
| 1134 |
if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
|
| 1135 |
wsRef.current.send(JSON.stringify({ type: "tts_start" }));
|
| 1136 |
}
|
|
|
|
| 1164 |
setStatus("Audio error");
|
| 1165 |
}
|
| 1166 |
|
| 1167 |
+
finishAssistantPlayback();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1168 |
}
|
| 1169 |
|
| 1170 |
return (
|
avatar/speak_server.py
CHANGED
|
@@ -1,21 +1,19 @@
|
|
| 1 |
-
"""Avatar Backend - Qwen 3 TTS with
|
| 2 |
-
import
|
| 3 |
-
import
|
| 4 |
-
import
|
| 5 |
-
import
|
| 6 |
-
import subprocess
|
| 7 |
-
import json as json_lib
|
| 8 |
-
import asyncio
|
| 9 |
-
from fastapi import FastAPI, Form, WebSocket
|
| 10 |
from fastapi.middleware.cors import CORSMiddleware
|
| 11 |
from fastapi.staticfiles import StaticFiles
|
| 12 |
-
from fastapi.responses import JSONResponse, StreamingResponse
|
| 13 |
-
from
|
| 14 |
-
from
|
| 15 |
-
|
| 16 |
-
import
|
| 17 |
-
import numpy as np
|
| 18 |
-
import io
|
| 19 |
|
| 20 |
load_dotenv()
|
| 21 |
|
|
@@ -133,20 +131,121 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
| 133 |
active_connections.remove(websocket)
|
| 134 |
|
| 135 |
|
| 136 |
-
async def broadcast_to_avatars(data: dict):
|
| 137 |
-
for connection in active_connections[:]:
|
| 138 |
-
try:
|
| 139 |
-
await connection.send_json(data)
|
| 140 |
-
except:
|
| 141 |
-
if connection in active_connections:
|
| 142 |
-
active_connections.remove(connection)
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
# Quick model check
|
| 152 |
if qwen_tts_model is None or not MODEL_READY:
|
|
@@ -156,78 +255,68 @@ async def speak(text: str = Form(...), voice: str = Form("female"), language: st
|
|
| 156 |
content={"error": "TTS model still loading, please wait..."}
|
| 157 |
)
|
| 158 |
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
"
|
| 214 |
-
"
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
print(f"[TTS] ✅ Total: {time.time() - t_start:.2f}s\n")
|
| 223 |
-
|
| 224 |
-
return response_data
|
| 225 |
-
|
| 226 |
-
except Exception as e:
|
| 227 |
-
print(f"[TTS] ❌ Error: {e}")
|
| 228 |
-
import traceback
|
| 229 |
-
traceback.print_exc()
|
| 230 |
-
return JSONResponse(status_code=500, content={"error": str(e)})
|
| 231 |
|
| 232 |
|
| 233 |
@app.get("/health")
|
|
@@ -261,4 +350,4 @@ if __name__ == "__main__":
|
|
| 261 |
print("Optimization: torch.no_grad() + fast visemes")
|
| 262 |
print("Expected latency: 1-3 seconds (vs 5-10s before)")
|
| 263 |
print("="*60 + "\n")
|
| 264 |
-
uvicorn.run(app, host="0.0.0.0", port=8765)
|
|
|
|
| 1 |
+
"""Avatar Backend - Qwen 3 TTS with chunked PCM streaming."""
|
| 2 |
+
import base64
|
| 3 |
+
import os
|
| 4 |
+
import uuid
|
| 5 |
+
import time
|
| 6 |
+
import subprocess
|
| 7 |
+
import json as json_lib
|
| 8 |
+
import asyncio
|
| 9 |
+
from fastapi import FastAPI, Form, WebSocket
|
| 10 |
from fastapi.middleware.cors import CORSMiddleware
|
| 11 |
from fastapi.staticfiles import StaticFiles
|
| 12 |
+
from fastapi.responses import JSONResponse, StreamingResponse
|
| 13 |
+
from typing import List, Optional
|
| 14 |
+
from dotenv import load_dotenv
|
| 15 |
+
import torch
|
| 16 |
+
import numpy as np
|
|
|
|
|
|
|
| 17 |
|
| 18 |
load_dotenv()
|
| 19 |
|
|
|
|
| 131 |
active_connections.remove(websocket)
|
| 132 |
|
| 133 |
|
| 134 |
+
async def broadcast_to_avatars(data: dict):
|
| 135 |
+
for connection in active_connections[:]:
|
| 136 |
+
try:
|
| 137 |
+
await connection.send_json(data)
|
| 138 |
+
except:
|
| 139 |
+
if connection in active_connections:
|
| 140 |
+
active_connections.remove(connection)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def synthesize_speech(
|
| 144 |
+
text: str,
|
| 145 |
+
voice: str,
|
| 146 |
+
language: str,
|
| 147 |
+
request_id: Optional[str] = None,
|
| 148 |
+
) -> dict:
|
| 149 |
+
"""Run TTS once and return reusable response metadata."""
|
| 150 |
+
t_start = time.time()
|
| 151 |
+
uid = request_id or uuid.uuid4().hex[:8]
|
| 152 |
+
wav_path = os.path.join(OUT_DIR, f"{uid}.wav")
|
| 153 |
+
|
| 154 |
+
language_map = {
|
| 155 |
+
"en": "English", "nl": "Auto", "zh": "Chinese", "ja": "Japanese",
|
| 156 |
+
"ko": "Korean", "de": "German", "fr": "French", "ru": "Russian",
|
| 157 |
+
"pt": "Portuguese", "es": "Spanish", "it": "Italian"
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
qwen_language = language_map.get(language, "English")
|
| 161 |
+
speaker_name = VOICE_MAP.get(voice, "Serena")
|
| 162 |
+
|
| 163 |
+
print(f"[TTS] 🎤 {speaker_name}: '{text[:40]}...'")
|
| 164 |
+
|
| 165 |
+
with torch.no_grad():
|
| 166 |
+
wavs, sample_rate = qwen_tts_model.generate_custom_voice(
|
| 167 |
+
text=text,
|
| 168 |
+
language=qwen_language,
|
| 169 |
+
speaker=speaker_name,
|
| 170 |
+
max_new_tokens=1024,
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
audio_array = wavs[0] if isinstance(wavs, list) else wavs
|
| 174 |
+
if isinstance(audio_array, torch.Tensor):
|
| 175 |
+
audio_array = audio_array.cpu().numpy()
|
| 176 |
+
audio_array = audio_array.squeeze()
|
| 177 |
+
|
| 178 |
+
if audio_array.max() <= 1.0:
|
| 179 |
+
audio_array = (audio_array * 32767).astype(np.int16)
|
| 180 |
+
else:
|
| 181 |
+
audio_array = audio_array.astype(np.int16)
|
| 182 |
+
|
| 183 |
+
import scipy.io.wavfile
|
| 184 |
+
scipy.io.wavfile.write(wav_path, rate=sample_rate, data=audio_array)
|
| 185 |
+
|
| 186 |
+
duration_sec = len(audio_array) / sample_rate
|
| 187 |
+
generation_time = time.time() - t_start
|
| 188 |
+
print(f"[TTS] ⚡ Generated in {generation_time:.2f}s ({duration_sec:.1f}s audio)")
|
| 189 |
+
|
| 190 |
+
visemes = generate_visemes_fast(text, duration_sec)
|
| 191 |
+
|
| 192 |
+
return {
|
| 193 |
+
"stream_id": uid,
|
| 194 |
+
"audio_array": audio_array,
|
| 195 |
+
"sample_rate": sample_rate,
|
| 196 |
+
"wav_path": wav_path,
|
| 197 |
+
"visemes": visemes,
|
| 198 |
+
"duration": duration_sec,
|
| 199 |
+
"text": text,
|
| 200 |
+
"method": "fast_phoneme",
|
| 201 |
+
"generation_time": round(generation_time, 2),
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def iter_ndjson_stream(payload: dict, chunk_duration_ms: int = 120):
|
| 206 |
+
"""Yield newline-delimited JSON stream frames with PCM16 audio chunks."""
|
| 207 |
+
audio_bytes = payload["audio_array"].tobytes()
|
| 208 |
+
sample_rate = int(payload["sample_rate"])
|
| 209 |
+
samples_per_chunk = max(1, int(sample_rate * (chunk_duration_ms / 1000.0)))
|
| 210 |
+
chunk_size_bytes = samples_per_chunk * 2 # mono PCM16
|
| 211 |
+
|
| 212 |
+
start_event = {
|
| 213 |
+
"type": "start",
|
| 214 |
+
"stream_id": payload["stream_id"],
|
| 215 |
+
"sample_rate": sample_rate,
|
| 216 |
+
"channels": 1,
|
| 217 |
+
"duration": payload["duration"],
|
| 218 |
+
"generation_time": payload["generation_time"],
|
| 219 |
+
}
|
| 220 |
+
yield f"{json_lib.dumps(start_event)}\n"
|
| 221 |
+
|
| 222 |
+
for offset in range(0, len(audio_bytes), chunk_size_bytes):
|
| 223 |
+
chunk = audio_bytes[offset: offset + chunk_size_bytes]
|
| 224 |
+
chunk_event = {
|
| 225 |
+
"type": "chunk",
|
| 226 |
+
"stream_id": payload["stream_id"],
|
| 227 |
+
"audio": base64.b64encode(chunk).decode("ascii"),
|
| 228 |
+
}
|
| 229 |
+
yield f"{json_lib.dumps(chunk_event)}\n"
|
| 230 |
+
|
| 231 |
+
end_event = {
|
| 232 |
+
"type": "end",
|
| 233 |
+
"stream_id": payload["stream_id"],
|
| 234 |
+
"duration": payload["duration"],
|
| 235 |
+
}
|
| 236 |
+
yield f"{json_lib.dumps(end_event)}\n"
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
# ============ FAST TTS ENDPOINTS ============
|
| 240 |
+
|
| 241 |
+
@app.post("/speak")
|
| 242 |
+
async def speak(
|
| 243 |
+
text: str = Form(...),
|
| 244 |
+
voice: str = Form("female"),
|
| 245 |
+
language: str = Form("en"),
|
| 246 |
+
request_id: Optional[str] = Form(None),
|
| 247 |
+
):
|
| 248 |
+
"""Generate TTS quickly with optimized settings"""
|
| 249 |
|
| 250 |
# Quick model check
|
| 251 |
if qwen_tts_model is None or not MODEL_READY:
|
|
|
|
| 255 |
content={"error": "TTS model still loading, please wait..."}
|
| 256 |
)
|
| 257 |
|
| 258 |
+
try:
|
| 259 |
+
payload = synthesize_speech(text=text, voice=voice, language=language, request_id=request_id)
|
| 260 |
+
response_data = {
|
| 261 |
+
"stream_id": payload["stream_id"],
|
| 262 |
+
"audio_url": f"/static/{os.path.basename(payload['wav_path'])}",
|
| 263 |
+
"visemes": payload["visemes"],
|
| 264 |
+
"duration": payload["duration"],
|
| 265 |
+
"text": payload["text"],
|
| 266 |
+
"method": payload["method"],
|
| 267 |
+
"generation_time": payload["generation_time"],
|
| 268 |
+
}
|
| 269 |
+
|
| 270 |
+
await broadcast_to_avatars(response_data)
|
| 271 |
+
|
| 272 |
+
print(f"[TTS] ✅ Total: {payload['generation_time']:.2f}s\n")
|
| 273 |
+
|
| 274 |
+
return response_data
|
| 275 |
+
|
| 276 |
+
except Exception as e:
|
| 277 |
+
print(f"[TTS] ❌ Error: {e}")
|
| 278 |
+
import traceback
|
| 279 |
+
traceback.print_exc()
|
| 280 |
+
return JSONResponse(status_code=500, content={"error": str(e)})
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
@app.post("/speak_stream")
|
| 284 |
+
async def speak_stream(
|
| 285 |
+
text: str = Form(...),
|
| 286 |
+
voice: str = Form("female"),
|
| 287 |
+
language: str = Form("en"),
|
| 288 |
+
request_id: Optional[str] = Form(None),
|
| 289 |
+
):
|
| 290 |
+
"""Stream synthesized PCM chunks as newline-delimited JSON."""
|
| 291 |
+
if qwen_tts_model is None or not MODEL_READY:
|
| 292 |
+
return JSONResponse(
|
| 293 |
+
status_code=503,
|
| 294 |
+
content={"error": "TTS model still loading, please wait..."},
|
| 295 |
+
)
|
| 296 |
+
|
| 297 |
+
try:
|
| 298 |
+
payload = synthesize_speech(text=text, voice=voice, language=language, request_id=request_id)
|
| 299 |
+
response_data = {
|
| 300 |
+
"stream_id": payload["stream_id"],
|
| 301 |
+
"audio_url": f"/static/{os.path.basename(payload['wav_path'])}",
|
| 302 |
+
"visemes": payload["visemes"],
|
| 303 |
+
"duration": payload["duration"],
|
| 304 |
+
"text": payload["text"],
|
| 305 |
+
"method": payload["method"],
|
| 306 |
+
"generation_time": payload["generation_time"],
|
| 307 |
+
}
|
| 308 |
+
await broadcast_to_avatars(response_data)
|
| 309 |
+
|
| 310 |
+
return StreamingResponse(
|
| 311 |
+
iter_ndjson_stream(payload),
|
| 312 |
+
media_type="application/x-ndjson",
|
| 313 |
+
headers={"Cache-Control": "no-cache"},
|
| 314 |
+
)
|
| 315 |
+
except Exception as e:
|
| 316 |
+
print(f"[TTS] ❌ Stream error: {e}")
|
| 317 |
+
import traceback
|
| 318 |
+
traceback.print_exc()
|
| 319 |
+
return JSONResponse(status_code=500, content={"error": str(e)})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
|
| 321 |
|
| 322 |
@app.get("/health")
|
|
|
|
| 350 |
print("Optimization: torch.no_grad() + fast visemes")
|
| 351 |
print("Expected latency: 1-3 seconds (vs 5-10s before)")
|
| 352 |
print("="*60 + "\n")
|
| 353 |
+
uvicorn.run(app, host="0.0.0.0", port=8765)
|
mrrrme/backend/processing/speech.py
CHANGED
|
@@ -20,6 +20,7 @@ import requests
|
|
| 20 |
import numpy as np
|
| 21 |
import secrets
|
| 22 |
import json
|
|
|
|
| 23 |
from datetime import datetime
|
| 24 |
from typing import Optional, Dict
|
| 25 |
from ..models.loader import get_models
|
|
@@ -71,6 +72,86 @@ def filter_transcription(transcription: str) -> tuple:
|
|
| 71 |
return True, None
|
| 72 |
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
def save_emotion_prediction(
|
| 75 |
message_id: str,
|
| 76 |
session_id: str,
|
|
@@ -533,40 +614,34 @@ async def process_speech_end(
|
|
| 533 |
|
| 534 |
save_message(session_id, "assistant", response_text, fused_emotion, user_id)
|
| 535 |
|
| 536 |
-
# ========== SEND
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 537 |
try:
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
},
|
| 545 |
-
timeout=45
|
| 546 |
)
|
| 547 |
-
avatar_response.raise_for_status()
|
| 548 |
-
avatar_data = avatar_response.json()
|
| 549 |
-
|
| 550 |
-
await websocket.send_json({
|
| 551 |
-
"type": "llm_response",
|
| 552 |
-
"text": response_text,
|
| 553 |
-
"emotion": fused_emotion,
|
| 554 |
-
"intensity": intensity,
|
| 555 |
-
"is_masking": is_masking,
|
| 556 |
-
"masking_type": masking_type,
|
| 557 |
-
"audio_url": avatar_data.get("audio_url"),
|
| 558 |
-
"visemes": avatar_data.get("visemes"),
|
| 559 |
-
"frames_captured": frame_count # NEW: Include frame count
|
| 560 |
-
})
|
| 561 |
except Exception as avatar_err:
|
| 562 |
safe_print(f"[TTS] Error: {avatar_err}")
|
| 563 |
await websocket.send_json({
|
| 564 |
-
"type": "
|
| 565 |
-
"
|
| 566 |
-
"
|
| 567 |
-
"intensity": intensity,
|
| 568 |
-
"is_masking": is_masking,
|
| 569 |
-
"text_only": True
|
| 570 |
})
|
| 571 |
|
| 572 |
safe_print(f"[Pipeline] Complete for {username}")
|
|
@@ -575,4 +650,4 @@ async def process_speech_end(
|
|
| 575 |
except Exception as e:
|
| 576 |
safe_print(f"[Pipeline] Error: {e}")
|
| 577 |
import traceback
|
| 578 |
-
traceback.print_exc()
|
|
|
|
| 20 |
import numpy as np
|
| 21 |
import secrets
|
| 22 |
import json
|
| 23 |
+
import asyncio
|
| 24 |
from datetime import datetime
|
| 25 |
from typing import Optional, Dict
|
| 26 |
from ..models.loader import get_models
|
|
|
|
| 72 |
return True, None
|
| 73 |
|
| 74 |
|
| 75 |
+
async def stream_tts_audio_to_websocket(
|
| 76 |
+
websocket,
|
| 77 |
+
text: str,
|
| 78 |
+
voice: str,
|
| 79 |
+
language: str,
|
| 80 |
+
stream_id: str,
|
| 81 |
+
) -> None:
|
| 82 |
+
"""Proxy chunked PCM audio from the TTS service to the browser websocket."""
|
| 83 |
+
loop = asyncio.get_running_loop()
|
| 84 |
+
event_queue: asyncio.Queue[tuple[str, Optional[dict], Optional[str]]] = asyncio.Queue()
|
| 85 |
+
|
| 86 |
+
def read_stream() -> None:
|
| 87 |
+
try:
|
| 88 |
+
with requests.post(
|
| 89 |
+
f"{AVATAR_API}/speak_stream",
|
| 90 |
+
data={
|
| 91 |
+
"text": text,
|
| 92 |
+
"voice": voice,
|
| 93 |
+
"language": language,
|
| 94 |
+
"request_id": stream_id,
|
| 95 |
+
},
|
| 96 |
+
stream=True,
|
| 97 |
+
timeout=(10, 120),
|
| 98 |
+
) as response:
|
| 99 |
+
response.raise_for_status()
|
| 100 |
+
for raw_line in response.iter_lines(decode_unicode=True):
|
| 101 |
+
if not raw_line:
|
| 102 |
+
continue
|
| 103 |
+
event = json.loads(raw_line)
|
| 104 |
+
asyncio.run_coroutine_threadsafe(
|
| 105 |
+
event_queue.put(("event", event, None)),
|
| 106 |
+
loop,
|
| 107 |
+
).result()
|
| 108 |
+
except Exception as exc:
|
| 109 |
+
asyncio.run_coroutine_threadsafe(
|
| 110 |
+
event_queue.put(("error", None, str(exc))),
|
| 111 |
+
loop,
|
| 112 |
+
).result()
|
| 113 |
+
finally:
|
| 114 |
+
asyncio.run_coroutine_threadsafe(
|
| 115 |
+
event_queue.put(("done", None, None)),
|
| 116 |
+
loop,
|
| 117 |
+
).result()
|
| 118 |
+
|
| 119 |
+
reader_task = asyncio.create_task(asyncio.to_thread(read_stream))
|
| 120 |
+
|
| 121 |
+
try:
|
| 122 |
+
while True:
|
| 123 |
+
event_type, event, error_message = await event_queue.get()
|
| 124 |
+
if event_type == "event" and event is not None:
|
| 125 |
+
upstream_type = event.get("type")
|
| 126 |
+
if upstream_type == "start":
|
| 127 |
+
await websocket.send_json({
|
| 128 |
+
"type": "tts_stream_start",
|
| 129 |
+
"stream_id": stream_id,
|
| 130 |
+
"sample_rate": event.get("sample_rate"),
|
| 131 |
+
"channels": event.get("channels", 1),
|
| 132 |
+
"duration": event.get("duration"),
|
| 133 |
+
"generation_time": event.get("generation_time"),
|
| 134 |
+
})
|
| 135 |
+
elif upstream_type == "chunk":
|
| 136 |
+
await websocket.send_json({
|
| 137 |
+
"type": "tts_stream_chunk",
|
| 138 |
+
"stream_id": stream_id,
|
| 139 |
+
"audio": event.get("audio"),
|
| 140 |
+
})
|
| 141 |
+
elif upstream_type == "end":
|
| 142 |
+
await websocket.send_json({
|
| 143 |
+
"type": "tts_stream_end",
|
| 144 |
+
"stream_id": stream_id,
|
| 145 |
+
"duration": event.get("duration"),
|
| 146 |
+
})
|
| 147 |
+
elif event_type == "error":
|
| 148 |
+
raise RuntimeError(error_message or "Unknown TTS stream error")
|
| 149 |
+
elif event_type == "done":
|
| 150 |
+
break
|
| 151 |
+
finally:
|
| 152 |
+
await reader_task
|
| 153 |
+
|
| 154 |
+
|
| 155 |
def save_emotion_prediction(
|
| 156 |
message_id: str,
|
| 157 |
session_id: str,
|
|
|
|
| 614 |
|
| 615 |
save_message(session_id, "assistant", response_text, fused_emotion, user_id)
|
| 616 |
|
| 617 |
+
# ========== SEND TEXT IMMEDIATELY, THEN STREAM TTS ==========
|
| 618 |
+
stream_id = secrets.token_urlsafe(8)
|
| 619 |
+
await websocket.send_json({
|
| 620 |
+
"type": "llm_response",
|
| 621 |
+
"text": response_text,
|
| 622 |
+
"emotion": fused_emotion,
|
| 623 |
+
"intensity": intensity,
|
| 624 |
+
"is_masking": is_masking,
|
| 625 |
+
"masking_type": masking_type,
|
| 626 |
+
"frames_captured": frame_count,
|
| 627 |
+
"audio_stream": True,
|
| 628 |
+
"stream_id": stream_id,
|
| 629 |
+
})
|
| 630 |
+
|
| 631 |
try:
|
| 632 |
+
await stream_tts_audio_to_websocket(
|
| 633 |
+
websocket=websocket,
|
| 634 |
+
text=response_text,
|
| 635 |
+
voice=user_preferences.get("voice", "female"),
|
| 636 |
+
language=user_preferences.get("language", "en"),
|
| 637 |
+
stream_id=stream_id,
|
|
|
|
|
|
|
| 638 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
except Exception as avatar_err:
|
| 640 |
safe_print(f"[TTS] Error: {avatar_err}")
|
| 641 |
await websocket.send_json({
|
| 642 |
+
"type": "tts_stream_error",
|
| 643 |
+
"stream_id": stream_id,
|
| 644 |
+
"message": str(avatar_err),
|
|
|
|
|
|
|
|
|
|
| 645 |
})
|
| 646 |
|
| 647 |
safe_print(f"[Pipeline] Complete for {username}")
|
|
|
|
| 650 |
except Exception as e:
|
| 651 |
safe_print(f"[Pipeline] Error: {e}")
|
| 652 |
import traceback
|
| 653 |
+
traceback.print_exc()
|
mrrrme/backend/websocket.py
CHANGED
|
@@ -16,19 +16,18 @@ from fastapi import WebSocket, WebSocketDisconnect
|
|
| 16 |
from starlette.websockets import WebSocketState
|
| 17 |
import asyncio
|
| 18 |
import base64
|
|
|
|
| 19 |
import numpy as np
|
| 20 |
import cv2
|
| 21 |
import io
|
| 22 |
from PIL import Image
|
| 23 |
-
import requests
|
| 24 |
|
| 25 |
from . import models as models_module
|
| 26 |
from .session.manager import validate_token, save_message, load_user_history
|
| 27 |
from .session.summary import generate_session_summary
|
| 28 |
from .auth.database import get_db_connection
|
| 29 |
-
from .utils.helpers import get_avatar_api_url
|
| 30 |
from .config import GREETINGS
|
| 31 |
-
from .processing.speech import process_speech_end
|
| 32 |
from .processing.face_emotion_aggregator import FaceEmotionAggregator
|
| 33 |
|
| 34 |
# Import multi-frame capture
|
|
@@ -39,8 +38,6 @@ except ImportError:
|
|
| 39 |
HAS_FRAME_BUFFER = False
|
| 40 |
print("[WebSocket] FrameBuffer not available - using single frame capture")
|
| 41 |
|
| 42 |
-
AVATAR_API = get_avatar_api_url()
|
| 43 |
-
|
| 44 |
async def websocket_endpoint(websocket: WebSocket):
|
| 45 |
"""Main WebSocket endpoint handler with multi-frame capture"""
|
| 46 |
|
|
@@ -200,36 +197,34 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
| 200 |
try:
|
| 201 |
lang = user_preferences.get("language", "en")
|
| 202 |
greeting_text = GREETINGS[lang]["returning" if user_summary else "new"].format(username=username)
|
| 203 |
-
|
| 204 |
-
audio_url, visemes = None, None
|
| 205 |
-
try:
|
| 206 |
-
resp = requests.post(f"{AVATAR_API}/speak", data={
|
| 207 |
-
"text": greeting_text,
|
| 208 |
-
"voice": user_preferences.get("voice", "female"),
|
| 209 |
-
"language": lang
|
| 210 |
-
}, timeout=10)
|
| 211 |
-
if resp.status_code == 200:
|
| 212 |
-
avatar_data = resp.json()
|
| 213 |
-
audio_url = avatar_data.get("audio_url")
|
| 214 |
-
visemes = avatar_data.get("visemes")
|
| 215 |
-
except:
|
| 216 |
-
pass
|
| 217 |
-
|
| 218 |
response_data = {
|
| 219 |
"type": "llm_response",
|
| 220 |
"text": greeting_text,
|
| 221 |
"emotion": "Neutral",
|
| 222 |
"intensity": 0.5,
|
| 223 |
-
"is_greeting": True
|
|
|
|
|
|
|
| 224 |
}
|
| 225 |
-
if audio_url and visemes:
|
| 226 |
-
response_data["audio_url"] = audio_url
|
| 227 |
-
response_data["visemes"] = visemes
|
| 228 |
-
else:
|
| 229 |
-
response_data["text_only"] = True
|
| 230 |
-
|
| 231 |
await websocket.send_json(response_data)
|
| 232 |
save_message(session_id, "assistant", greeting_text, "Neutral")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
except Exception as err:
|
| 234 |
print(f"[Greeting] Error: {err}", flush=True)
|
| 235 |
|
|
@@ -392,4 +387,4 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
| 392 |
print(f"[Summary] Error: {e}", flush=True)
|
| 393 |
|
| 394 |
print(f"[WebSocket] Closed for {username or 'Unknown'}", flush=True)
|
| 395 |
-
print("="*80 + "\n", flush=True)
|
|
|
|
| 16 |
from starlette.websockets import WebSocketState
|
| 17 |
import asyncio
|
| 18 |
import base64
|
| 19 |
+
import secrets
|
| 20 |
import numpy as np
|
| 21 |
import cv2
|
| 22 |
import io
|
| 23 |
from PIL import Image
|
|
|
|
| 24 |
|
| 25 |
from . import models as models_module
|
| 26 |
from .session.manager import validate_token, save_message, load_user_history
|
| 27 |
from .session.summary import generate_session_summary
|
| 28 |
from .auth.database import get_db_connection
|
|
|
|
| 29 |
from .config import GREETINGS
|
| 30 |
+
from .processing.speech import process_speech_end, stream_tts_audio_to_websocket
|
| 31 |
from .processing.face_emotion_aggregator import FaceEmotionAggregator
|
| 32 |
|
| 33 |
# Import multi-frame capture
|
|
|
|
| 38 |
HAS_FRAME_BUFFER = False
|
| 39 |
print("[WebSocket] FrameBuffer not available - using single frame capture")
|
| 40 |
|
|
|
|
|
|
|
| 41 |
async def websocket_endpoint(websocket: WebSocket):
|
| 42 |
"""Main WebSocket endpoint handler with multi-frame capture"""
|
| 43 |
|
|
|
|
| 197 |
try:
|
| 198 |
lang = user_preferences.get("language", "en")
|
| 199 |
greeting_text = GREETINGS[lang]["returning" if user_summary else "new"].format(username=username)
|
| 200 |
+
stream_id = secrets.token_urlsafe(8)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
response_data = {
|
| 202 |
"type": "llm_response",
|
| 203 |
"text": greeting_text,
|
| 204 |
"emotion": "Neutral",
|
| 205 |
"intensity": 0.5,
|
| 206 |
+
"is_greeting": True,
|
| 207 |
+
"audio_stream": True,
|
| 208 |
+
"stream_id": stream_id,
|
| 209 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
await websocket.send_json(response_data)
|
| 211 |
save_message(session_id, "assistant", greeting_text, "Neutral")
|
| 212 |
+
|
| 213 |
+
try:
|
| 214 |
+
await stream_tts_audio_to_websocket(
|
| 215 |
+
websocket=websocket,
|
| 216 |
+
text=greeting_text,
|
| 217 |
+
voice=user_preferences.get("voice", "female"),
|
| 218 |
+
language=lang,
|
| 219 |
+
stream_id=stream_id,
|
| 220 |
+
)
|
| 221 |
+
except Exception as tts_err:
|
| 222 |
+
print(f"[Greeting] TTS stream error: {tts_err}", flush=True)
|
| 223 |
+
await websocket.send_json({
|
| 224 |
+
"type": "tts_stream_error",
|
| 225 |
+
"stream_id": stream_id,
|
| 226 |
+
"message": str(tts_err),
|
| 227 |
+
})
|
| 228 |
except Exception as err:
|
| 229 |
print(f"[Greeting] Error: {err}", flush=True)
|
| 230 |
|
|
|
|
| 387 |
print(f"[Summary] Error: {e}", flush=True)
|
| 388 |
|
| 389 |
print(f"[WebSocket] Closed for {username or 'Unknown'}", flush=True)
|
| 390 |
+
print("="*80 + "\n", flush=True)
|