lucsanscartier committed on
Commit
6a6ca0b
·
verified ·
1 Parent(s): 13de188

Add local Kokoro TTS engine

Browse files
Files changed (1) hide show
  1. tts_engine.py +112 -0
tts_engine.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ tts_engine.py — local + provider TTS for LINGO.
3
+
4
+ Primary path: local Kokoro in the Space container. This avoids relying on
5
+ Hugging Face Inference Providers for text-to-speech because the current
6
+ hf-inference provider does not serve Kokoro TTS.
7
+
8
+ Fallback path: remote HF/provider endpoint if configured.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import io
14
+ import logging
15
+ from typing import Any
16
+
17
+ import httpx
18
+ import numpy as np
19
+ import soundfile as sf
20
+
21
+ from config import Settings
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ _KOKORO_PIPELINE: Any | None = None
26
+
27
+
28
def _get_kokoro_pipeline() -> Any:
    """Return the shared Kokoro pipeline, constructing it on first use.

    The pipeline is cached in the module-level ``_KOKORO_PIPELINE`` so that
    later calls reuse the same object instead of building a new one.

    Returns:
        The cached ``kokoro.KPipeline`` instance.
    """
    global _KOKORO_PIPELINE

    # Fast path: a previous call already built the pipeline.
    if _KOKORO_PIPELINE is not None:
        return _KOKORO_PIPELINE

    # Imported here rather than at module level so that importing this module
    # does not itself require the kokoro package.
    from kokoro import KPipeline

    # "a" is the language code Kokoro documents for American English.
    _KOKORO_PIPELINE = KPipeline(lang_code="a")
    logger.info("Loaded local Kokoro TTS pipeline.")
    return _KOKORO_PIPELINE
36
+
37
+
38
def synthesize_local_kokoro_wav(text: str, voice: str = "af_heart") -> bytes:
    """Return WAV bytes generated by local Kokoro.

    The pipeline yields the utterance as a sequence of segments; the audio of
    every usable segment is concatenated and encoded as a single WAV file.

    Args:
        text: Text to speak.
        voice: Kokoro voice identifier, passed through to the pipeline.

    Returns:
        The whole utterance encoded as WAV bytes.

    Raises:
        RuntimeError: If the pipeline yields no usable audio (for example
            because ``text`` produced no segments).
    """
    pipeline = _get_kokoro_pipeline()

    # Sample rate written into the WAV header; 24 kHz is the rate used in
    # Kokoro's own usage examples.
    sample_rate = 24_000

    chunks: list[np.ndarray] = []
    for _graphemes, _phonemes, audio in pipeline(text, voice=voice):
        # A segment's audio is optional in Kokoro's result type. Converting
        # None would produce a 0-d NaN array that np.concatenate rejects with
        # an unrelated ValueError, so such segments are skipped instead.
        if audio is None:
            continue

        arr = np.asarray(audio, dtype=np.float32)
        if arr.ndim > 1:
            # Collapse to one channel by averaging over axis 1
            # (assumes a (samples, channels) layout -- TODO confirm).
            arr = arr.mean(axis=1)

        # Empty segments contribute nothing; dropping them lets the
        # "no audio" check below catch an entirely silent result.
        if arr.size == 0:
            continue
        chunks.append(arr)

    if not chunks:
        raise RuntimeError("Local Kokoro returned no audio chunks")

    audio_all = np.concatenate(chunks)
    buf = io.BytesIO()
    sf.write(buf, audio_all, sample_rate, format="WAV")
    return buf.getvalue()
58
+
59
+
60
async def synthesize_provider_wav(
    text: str,
    http_client: httpx.AsyncClient,
    settings: Settings,
) -> bytes:
    """Attempt remote provider TTS using both known payload shapes.

    The endpoint is tried first with ``{"text_inputs": text}`` and then with
    ``{"inputs": text}``; the first response that carries audio wins.

    Args:
        text: Text to speak.
        http_client: Async HTTP client used to POST to the provider.
        settings: Supplies ``hf_tts_url`` (endpoint) and ``hf_token`` (bearer
            token, sent only when set).

    Returns:
        The raw response body of the first successful attempt.

    Raises:
        RuntimeError: If no endpoint is configured or every attempt fails.
            The message includes the last underlying error, which is also
            attached as ``__cause__``.
    """
    if not settings.hf_tts_url:
        # Fail fast with a clear message instead of letting httpx reject a
        # missing URL once per payload shape.
        raise RuntimeError("Provider TTS failure: no TTS endpoint URL configured")

    headers = {"Content-Type": "application/json"}
    if settings.hf_token:
        # Only send credentials that exist; formatting a missing token would
        # put a literal "Bearer None" on the wire.
        headers["Authorization"] = f"Bearer {settings.hf_token}"

    payloads = [{"text_inputs": text}, {"inputs": text}]
    last_exc: Exception | None = None

    for payload in payloads:
        try:
            resp = await http_client.post(
                settings.hf_tts_url,
                json=payload,
                headers=headers,
                timeout=75.0,
            )
            # These statuses are treated as "this payload shape was not
            # accepted": remember why and move on to the next shape.
            if resp.status_code in {400, 404, 415, 422}:
                last_exc = RuntimeError(
                    f"TTS rejected payload {list(payload.keys())}: {resp.text[:200]}"
                )
                continue
            resp.raise_for_status()

            # A JSON body on a success status is an error/status document,
            # not audio. Header values are matched case-insensitively.
            content_type = resp.headers.get("content-type", "")
            if "application/json" in content_type.lower():
                raise RuntimeError(f"TTS returned JSON instead of audio: {resp.text[:300]}")

            # Never hand an empty body back to the caller as if it were audio.
            if not resp.content:
                raise RuntimeError("TTS returned an empty audio body")
            return resp.content
        except (httpx.HTTPStatusError, httpx.RequestError, RuntimeError) as exc:
            last_exc = exc

    # repr() is used because some httpx errors stringify to an empty message.
    raise RuntimeError(f"Provider TTS failure: {last_exc!r}") from last_exc
95
+
96
+
97
async def synthesize_tts_wav(
    text: str,
    http_client: httpx.AsyncClient | None,
    settings: Settings,
) -> bytes:
    """Synthesize TTS with local Kokoro first, provider fallback second.

    Args:
        text: Text to speak.
        http_client: Async HTTP client for the provider fallback, or ``None``
            to disable the fallback.
        settings: Settings forwarded to the provider fallback.

    Returns:
        Audio bytes from whichever path succeeded.

    Raises:
        RuntimeError: If local synthesis fails and either no client was
            supplied or the provider fallback fails as well.
    """
    # Function-scope import: asyncio is not among this module's imports.
    import asyncio

    try:
        # Local synthesis is synchronous and CPU-bound. Running it in a worker
        # thread keeps the event loop free to serve other tasks meanwhile.
        # NOTE(review): the first call may therefore build the Kokoro pipeline
        # from a worker thread.
        return await asyncio.to_thread(synthesize_local_kokoro_wav, text)
    except Exception as local_exc:  # noqa: BLE001
        # Any local failure (missing package, model error, no audio) triggers
        # the provider fallback.
        logger.warning("Local Kokoro TTS failed: %s", local_exc)
        if http_client is None:
            raise RuntimeError("Local TTS failure and no provider client supplied") from local_exc
        try:
            return await synthesize_provider_wav(text, http_client, settings)
        except Exception as provider_exc:  # noqa: BLE001
            # Report both failures so neither cause is lost.
            raise RuntimeError(f"TTS failure: local={local_exc}; provider={provider_exc}") from provider_exc