marcosremar2 committed on
Commit
cd971ed
·
verified ·
1 Parent(s): 1ec7997

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +272 -2
app.py CHANGED
@@ -9,6 +9,8 @@ Endpoints (same contract as the full pipeline):
9
  GET /health — Health check
10
  GET /capabilities — Model info
11
  WS /ws/stream — WebSocket stream
 
 
12
  """
13
 
14
  import asyncio
@@ -27,13 +29,26 @@ from pathlib import Path
27
  import numpy as np
28
  import soundfile as sf
29
  import torch
30
- from fastapi import FastAPI, UploadFile, File, Form, WebSocket, WebSocketDisconnect
31
  from fastapi.middleware.cors import CORSMiddleware
32
  from fastapi.responses import StreamingResponse, JSONResponse
33
  from faster_whisper import WhisperModel
34
  from gtts import gTTS
35
  from transformers import AutoModelForCausalLM, AutoTokenizer
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  logging.basicConfig(level=logging.INFO)
38
  logger = logging.getLogger("parle-light")
39
 
@@ -46,6 +61,178 @@ last_activity = time.time()
46
 
47
  IDLE_SHUTDOWN_SECONDS = int(os.environ.get("IDLE_SHUTDOWN_SECONDS", "300")) # 5 min
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  SYSTEM_PROMPT = """Voce e um tutor de idiomas amigavel e paciente chamado Parle.
50
  Responda de forma concisa (1-3 frases) e adapte ao nivel do aluno.
51
  Se o aluno falar em portugues, responda em portugues.
@@ -93,6 +280,16 @@ async def lifespan(app: FastAPI):
93
 
94
  # Start idle watchdog
95
  asyncio.create_task(idle_watchdog())
 
 
 
 
 
 
 
 
 
 
96
  logger.info("All models ready!")
97
 
98
  yield
@@ -224,12 +421,15 @@ async def health():
224
  async def capabilities():
225
  global last_activity
226
  last_activity = time.time()
 
 
 
227
  return {
228
  "pipeline": "light",
229
  "stt": {"model": "faster-whisper-small", "languages": ["auto"]},
230
  "llm": {"model": "qwen2.5-0.5b-instruct", "max_tokens": 256},
231
  "tts": {"model": "gtts", "languages": ["pt", "en", "es", "fr", "de", "it"]},
232
- "protocols": ["sse", "websocket"],
233
  }
234
 
235
 
@@ -354,3 +554,73 @@ async def ws_stream(ws: WebSocket):
354
  await ws.close()
355
  except Exception:
356
  pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  GET /health — Health check
10
  GET /capabilities — Model info
11
  WS /ws/stream — WebSocket stream
12
+ POST /api/offer — WebRTC SDP offer/answer (requires aiortc)
13
+ GET /api/ice-servers — ICE server config for WebRTC clients
14
  """
15
 
16
  import asyncio
 
29
  import numpy as np
30
  import soundfile as sf
31
  import torch
32
+ from fastapi import FastAPI, Request, UploadFile, File, Form, WebSocket, WebSocketDisconnect
33
  from fastapi.middleware.cors import CORSMiddleware
34
  from fastapi.responses import StreamingResponse, JSONResponse
35
  from faster_whisper import WhisperModel
36
  from gtts import gTTS
37
  from transformers import AutoModelForCausalLM, AutoTokenizer
38
 
39
+ # WebRTC via aiortc — optional, graceful fallback if not installed
40
+ try:
41
+ from aiortc import RTCPeerConnection as _RTCPeerConnection
42
+ from aiortc import RTCSessionDescription as _RTCSessionDescription
43
+ from aiortc import RTCConfiguration as _RTCConfiguration
44
+ from aiortc import RTCIceServer as _RTCIceServer
45
+ from aiortc import MediaStreamTrack as _MediaStreamTrack
46
+ from aiortc.contrib.media import MediaRelay as _MediaRelay
47
+ _AIORTC_AVAILABLE = True
48
+ except ImportError:
49
+ _AIORTC_AVAILABLE = False
50
+ _MediaStreamTrack = object # fallback base class for AudioProcessTrack
51
+
52
  logging.basicConfig(level=logging.INFO)
53
  logger = logging.getLogger("parle-light")
54
 
 
61
 
62
  IDLE_SHUTDOWN_SECONDS = int(os.environ.get("IDLE_SHUTDOWN_SECONDS", "300")) # 5 min
63
 
64
# Module-level WebRTC state, populated during application startup when
# aiortc is importable; left inert otherwise.
_webrtc_relay = None
_webrtc_pcs: set = set()
_webrtc_available = False


def _build_rtc_config():
    """Assemble the RTCConfiguration used for new peer connections.

    Always includes a public Google STUN server; appends a TURN server when
    the TURN_URL / TURN_USERNAME / TURN_CREDENTIAL env vars are set.
    Returns None when aiortc is not installed.
    """
    if not _AIORTC_AVAILABLE:
        return None
    ice_servers = [_RTCIceServer(urls=["stun:stun.l.google.com:19302"])]
    turn_url = os.environ.get("TURN_URL", "")
    turn_user = os.environ.get("TURN_USERNAME", "")
    turn_cred = os.environ.get("TURN_CREDENTIAL", "")
    if turn_url:
        # TURN_URL may hold a comma-separated list of TURN URIs.
        turn_urls = [part.strip() for part in turn_url.split(",")]
        ice_servers.append(
            _RTCIceServer(urls=turn_urls, username=turn_user, credential=turn_cred)
        )
    return _RTCConfiguration(iceServers=ice_servers)
82
+
83
+
84
# ── WebRTC AudioProcessTrack ─────────────────────────────────────────────────

class AudioProcessTrack(_MediaStreamTrack):
    """Receives audio from WebRTC client, runs STT→LLM→TTS pipeline, streams back.

    Incoming frames from the subscribed client track are accumulated in a raw
    PCM buffer; a simple RMS energy gate decides when the user stopped
    speaking, at which point the whole utterance is pushed through the
    transcribe → generate_response → synthesize_speech helpers (defined
    elsewhere in this module). Synthesized audio is queued as AudioFrames and
    served back to the peer via recv(); silence frames are emitted while the
    queue is empty so the outbound track never stalls.
    """
    kind = "audio"

    def __init__(self, track, pc):
        super().__init__()
        self.track = track                    # source track (relay subscription)
        self.pc = pc                          # owning RTCPeerConnection (for DataChannel lookup)
        self._queue = asyncio.Queue()         # synthesized AudioFrames awaiting playback
        self._task = None                     # background _collect_loop task handle
        self._audio_buffer = bytearray()      # raw PCM accumulated for the current utterance
        self._silence_frames = 0              # consecutive low-energy frames seen
        self._speaking = False                # True while the energy gate considers the user talking
        self._silence_pts = 0                 # pts counter for generated silence frames only

    async def recv(self):
        import fractions
        try:
            # Prefer real synthesized audio when available.
            frame = await asyncio.wait_for(self._queue.get(), timeout=0.02)
            return frame
        except asyncio.TimeoutError:
            # Queue empty: emit a 960-sample (60 ms @ 16 kHz) frame of silence.
            # NOTE(review): the 0.02 s timeout is shorter than the 60 ms frame
            # duration, and silence uses its own pts counter while pipeline
            # frames restart pts at 0 per utterance — so the outbound pts
            # stream is not monotonic. Presumably aiortc re-times frames;
            # confirm against the consumer.
            from av import AudioFrame
            frame = AudioFrame(format='s16', layout='mono', samples=960)
            frame.planes[0].update(b'\x00' * 1920)
            frame.sample_rate = 16000
            frame.pts = self._silence_pts
            frame.time_base = fractions.Fraction(1, 16000)
            self._silence_pts += 960
            return frame

    async def start_processing(self):
        # Launch the background consumer of the inbound client track.
        self._task = asyncio.ensure_future(self._collect_loop())

    async def _collect_loop(self):
        """Consume inbound frames, run a crude VAD, trigger the pipeline on silence."""
        try:
            while True:
                frame = await self.track.recv()
                # Assumes inbound frames are already 16 kHz mono s16 —
                # raw plane bytes are buffered verbatim. TODO confirm the
                # client/relay resamples to this format.
                raw = bytes(frame.planes[0])
                self._audio_buffer.extend(raw)

                samples = np.frombuffer(raw, dtype=np.int16)
                energy = np.sqrt(np.mean(samples.astype(np.float32) ** 2))

                if energy > 500:
                    # RMS above threshold: user is (still) speaking.
                    self._speaking = True
                    self._silence_frames = 0
                elif self._speaking:
                    self._silence_frames += 1
                    # 50 consecutive quiet frames ⇒ end of utterance.
                    # (Wall-clock duration depends on the inbound frame size.)
                    if self._silence_frames > 50:
                        self._speaking = False
                        await self._process_buffer()
        except Exception as e:
            # Track ended or recv failed; flush any partial utterance.
            logger.info(f"[WebRTC] Collect loop ended: {e}")
            if self._speaking and len(self._audio_buffer) > 1000:
                await self._process_buffer()

    async def _process_buffer(self):
        """Run STT→LLM→TTS over the buffered utterance and enqueue reply audio."""
        import fractions

        # Ignore buffers too small to contain meaningful speech.
        if len(self._audio_buffer) < 1000:
            self._audio_buffer = bytearray()
            return

        raw_pcm = bytes(self._audio_buffer)
        self._audio_buffer = bytearray()

        # Build WAV from raw PCM (16kHz mono 16-bit) — canonical 44-byte RIFF header.
        sample_rate = 16000
        wav_header = struct.pack('<4sI4s4sIHHIIHH4sI',
                                 b'RIFF', 36 + len(raw_pcm), b'WAVE',
                                 b'fmt ', 16, 1, 1, sample_rate, sample_rate * 2, 2, 16,
                                 b'data', len(raw_pcm))
        wav_data = wav_header + raw_pcm

        # Find DataChannel for status updates (opened by the client as 'control';
        # stored on the pc by the /api/offer handler).
        dc = None
        for channel in getattr(self.pc, '_data_channels', []):
            if channel.label == 'control':
                dc = channel
                break

        async def send_dc(msg):
            # Best-effort status push; silently skipped when no open channel.
            if dc and dc.readyState == 'open':
                dc.send(json.dumps(msg))

        try:
            t_start = time.time()

            # 1. STT
            await send_dc({"status": "processing", "stage": "stt"})
            t_stt = time.time()
            transcript = await asyncio.to_thread(transcribe, wav_data)
            stt_ms = int((time.time() - t_stt) * 1000)

            if not transcript:
                await send_dc({"status": "error", "message": "No speech detected"})
                return

            await send_dc({"status": "processing", "stage": "llm", "transcript": transcript, "stt_ms": stt_ms})

            # 2. LLM
            t_llm = time.time()
            response_text = await asyncio.to_thread(generate_response, transcript)
            llm_ms = int((time.time() - t_llm) * 1000)
            await send_dc({"status": "processing", "stage": "tts", "response": response_text})

            # 3. TTS → WAV bytes → AudioFrames
            from av import AudioFrame
            t_tts = time.time()
            lang = detect_lang(response_text)
            wav_bytes = await asyncio.to_thread(synthesize_speech, response_text, lang)
            tts_ms = int((time.time() - t_tts) * 1000)

            # Skip 44-byte WAV header, split PCM into 960-sample AudioFrames (60ms at 16kHz).
            # NOTE(review): assumes synthesize_speech returns a canonical-header
            # 16 kHz mono 16-bit WAV — confirm, since extra RIFF chunks would
            # corrupt the first frames.
            pcm = wav_bytes[44:]
            frame_samples = 960
            frame_bytes_sz = frame_samples * 2
            pts_offset = 0
            ttfa_ms = tts_ms  # gTTS has no streaming, first audio = after full TTS

            for i in range(0, len(pcm), frame_bytes_sz):
                sub = pcm[i:i + frame_bytes_sz]
                if len(sub) < frame_bytes_sz:
                    # Zero-pad the trailing partial frame.
                    sub = sub + b'\x00' * (frame_bytes_sz - len(sub))
                frame = AudioFrame(format='s16', layout='mono', samples=frame_samples)
                frame.planes[0].update(sub)
                frame.sample_rate = 16000
                frame.pts = pts_offset
                frame.time_base = fractions.Fraction(1, 16000)
                await self._queue.put(frame)
                pts_offset += frame_samples

            await send_dc({
                "status": "complete",
                "transcript": transcript,
                "response": response_text,
                "timing": {
                    "stt_ms": stt_ms,
                    "llm_ms": llm_ms,
                    "tts_ms": tts_ms,
                    "ttfa_ms": ttfa_ms,
                    "total_ms": int((time.time() - t_start) * 1000),
                },
            })
        except Exception as e:
            logger.error(f"[WebRTC] Pipeline error: {e}")
            import traceback
            traceback.print_exc()
            await send_dc({"status": "error", "message": str(e)})
235
+
236
  SYSTEM_PROMPT = """Voce e um tutor de idiomas amigavel e paciente chamado Parle.
237
  Responda de forma concisa (1-3 frases) e adapte ao nivel do aluno.
238
  Se o aluno falar em portugues, responda em portugues.
 
280
 
281
  # Start idle watchdog
282
  asyncio.create_task(idle_watchdog())
283
+
284
+ # Initialize WebRTC relay (aiortc)
285
+ global _webrtc_relay, _webrtc_available
286
+ if _AIORTC_AVAILABLE:
287
+ _webrtc_relay = _MediaRelay()
288
+ _webrtc_available = True
289
+ logger.info("WebRTC (aiortc) ready — POST /api/offer active")
290
+ else:
291
+ logger.info("WebRTC disabled (aiortc not installed)")
292
+
293
  logger.info("All models ready!")
294
 
295
  yield
 
421
async def capabilities():
    """Describe the models and transport protocols this backend exposes.

    Hitting this endpoint also counts as activity for the idle-shutdown
    watchdog, so probes keep the Space alive.
    """
    global last_activity
    last_activity = time.time()
    supported = ["sse", "websocket"] + (["webrtc"] if _webrtc_available else [])
    return {
        "pipeline": "light",
        "stt": {"model": "faster-whisper-small", "languages": ["auto"]},
        "llm": {"model": "qwen2.5-0.5b-instruct", "max_tokens": 256},
        "tts": {"model": "gtts", "languages": ["pt", "en", "es", "fr", "de", "it"]},
        "protocols": supported,
    }
434
 
435
 
 
554
  await ws.close()
555
  except Exception:
556
  pass
557
+
558
+
559
# ── WebRTC Endpoints ─────────────────────────────────────────────────────────

@app.post("/api/offer")
async def api_webrtc_offer(request: Request):
    """WebRTC SDP offer/answer exchange.

    Client sends SDP offer -> server creates RTCPeerConnection, returns answer.
    Audio pipeline: VAD → STT → LLM → TTS, result sent back via WebRTC audio track.
    Status updates sent via DataChannel 'control'.

    Returns 503 when aiortc is not installed on this backend.
    Raises KeyError (→ 500) if the request JSON lacks "sdp"/"type".
    """
    if not _webrtc_available:
        return JSONResponse(
            {"error": "WebRTC not available on this backend (aiortc not installed)"},
            status_code=503,
        )

    # Any offer counts as activity for the idle-shutdown watchdog.
    global last_activity
    last_activity = time.time()

    params = await request.json()
    offer = _RTCSessionDescription(sdp=params["sdp"], type=params["type"])

    pc = _RTCPeerConnection(configuration=_build_rtc_config())
    # Keep a strong reference so the connection isn't garbage-collected
    # while active; removed again on failed/closed below.
    _webrtc_pcs.add(pc)
    # NOTE(review): piggybacks a private attribute on the aiortc object so
    # AudioProcessTrack._process_buffer can find the 'control' channel —
    # works, but a separate mapping would be less fragile across aiortc versions.
    pc._data_channels = []

    @pc.on("datachannel")
    def on_datachannel(channel):
        pc._data_channels.append(channel)
        logger.info(f"[WebRTC] DataChannel opened: {channel.label}")

    @pc.on("track")
    def on_track(track):
        logger.info(f"[WebRTC] Received track: {track.kind}")
        if track.kind == "audio":
            # Subscribe via the shared relay and answer with a processed track.
            processor = AudioProcessTrack(_webrtc_relay.subscribe(track), pc)
            pc.addTrack(processor)
            # Fire-and-forget: starts the background collect loop.
            asyncio.ensure_future(processor.start_processing())

    @pc.on("connectionstatechange")
    async def on_connectionstatechange():
        logger.info(f"[WebRTC] Connection state: {pc.connectionState}")
        if pc.connectionState in ("failed", "closed"):
            await pc.close()
            _webrtc_pcs.discard(pc)

    # Standard answerer handshake: apply remote offer, create and set answer.
    await pc.setRemoteDescription(offer)
    answer = await pc.createAnswer()
    await pc.setLocalDescription(answer)

    # localDescription may differ from `answer` after ICE gathering, so
    # return the connection's own copy.
    return JSONResponse({
        "sdp": pc.localDescription.sdp,
        "type": pc.localDescription.type,
    })
613
+
614
+
615
@app.get("/api/ice-servers")
def api_ice_servers():
    """Expose the ICE configuration (STUN, plus TURN from env vars) to clients.

    Mirrors the server-side config built by _build_rtc_config so both peers
    negotiate through the same STUN/TURN infrastructure.
    """
    ice = [{"urls": ["stun:stun.l.google.com:19302"]}]
    turn_url = os.environ.get("TURN_URL", "")
    if turn_url:
        turn_entry = {
            "urls": [part.strip() for part in turn_url.split(",")],
            "username": os.environ.get("TURN_USERNAME", ""),
            "credential": os.environ.get("TURN_CREDENTIAL", ""),
        }
        ice.append(turn_entry)
    return {"iceServers": ice}