Spaces:

lucsanscartier
/

lingo-agent

Paused

App Files Files Community

lucsanscartier committed on Apr 26

Commit

6a6ca0b

verified ·

1 Parent(s): 13de188

Add local Kokoro TTS engine

Browse files

Files changed (1) hide show

tts_engine.py +112 -0

tts_engine.py ADDED Viewed

	@@ -0,0 +1,112 @@

+"""
+tts_engine.py — local + provider TTS for LINGO.
+Primary path: local Kokoro in the Space container. This avoids relying on
+Hugging Face Inference Providers for text-to-speech because the current
+hf-inference provider does not serve Kokoro TTS.
+Fallback path: remote HF/provider endpoint if configured.
+"""
+from __future__ import annotations
+import io
+import logging
+from typing import Any
+import httpx
+import numpy as np
+import soundfile as sf
+from config import Settings
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+# Process-wide cache for the Kokoro pipeline object; stays None until
+# _get_kokoro_pipeline() constructs it on first use.
+_KOKORO_PIPELINE: Any | None = None
+def _get_kokoro_pipeline() -> Any:
+    global _KOKORO_PIPELINE
+    if _KOKORO_PIPELINE is None:
+        from kokoro import KPipeline
+        _KOKORO_PIPELINE = KPipeline(lang_code="a")
+        logger.info("Loaded local Kokoro TTS pipeline.")
+    return _KOKORO_PIPELINE
+def synthesize_local_kokoro_wav(text: str, voice: str = "af_heart") -> bytes:
+    """Return WAV bytes generated by local Kokoro."""
+    pipeline = _get_kokoro_pipeline()
+    generator = pipeline(text, voice=voice)
+    chunks: list[np.ndarray] = []
+    sample_rate = 24_000
+    for _graphemes, _phonemes, audio in generator:
+        arr = np.asarray(audio, dtype=np.float32)
+        if arr.ndim > 1:
+            arr = arr.mean(axis=1)
+        chunks.append(arr)
+    if not chunks:
+        raise RuntimeError("Local Kokoro returned no audio chunks")
+    audio_all = np.concatenate(chunks)
+    buf = io.BytesIO()
+    sf.write(buf, audio_all, sample_rate, format="WAV")
+    return buf.getvalue()
+async def synthesize_provider_wav(
+    text: str,
+    http_client: httpx.AsyncClient,
+    settings: Settings,
+) -> bytes:
+    """Attempt remote provider TTS using both known payload shapes."""
+    headers = {
+        "Authorization": f"Bearer {settings.hf_token}",
+        "Content-Type": "application/json",
+    }
+    payloads = [{"text_inputs": text}, {"inputs": text}]
+    last_exc: Exception | None = None
+    for payload in payloads:
+        try:
+            resp = await http_client.post(
+                settings.hf_tts_url,
+                json=payload,
+                headers=headers,
+                timeout=75.0,
+            )
+            if resp.status_code in {400, 404, 415, 422}:
+                last_exc = RuntimeError(
+                    f"TTS rejected payload {list(payload.keys())}: {resp.text[:200]}"
+                )
+                continue
+            resp.raise_for_status()
+            content_type = resp.headers.get("content-type", "")
+            if "application/json" in content_type:
+                raise RuntimeError(f"TTS returned JSON instead of audio: {resp.text[:300]}")
+            return resp.content
+        except (httpx.HTTPStatusError, httpx.RequestError, RuntimeError) as exc:
+            last_exc = exc
+    raise RuntimeError("Provider TTS failure") from last_exc
+async def synthesize_tts_wav(
+    text: str,
+    http_client: httpx.AsyncClient | None,
+    settings: Settings,
+) -> bytes:
+    """Synthesize TTS with local Kokoro first, provider fallback second."""
+    try:
+        return synthesize_local_kokoro_wav(text)
+    except Exception as local_exc:  # noqa: BLE001
+        logger.warning("Local Kokoro TTS failed: %s", local_exc)
+        if http_client is None:
+            raise RuntimeError("Local TTS failure and no provider client supplied") from local_exc
+        try:
+            return await synthesize_provider_wav(text, http_client, settings)
+        except Exception as provider_exc:  # noqa: BLE001
+            raise RuntimeError(f"TTS failure: local={local_exc}; provider={provider_exc}") from provider_exc