Spaces:

lucsanscartier
/

lingo-agent

Paused

App Files Files Community

lucsanscartier committed 17 days ago

Commit

a2eaed6

verified ·

1 Parent(s): 6a6ca0b

Add Hugging Face chat fallback engine

Browse files

Files changed (1) hide show

llm_engine.py +101 -0

llm_engine.py ADDED Viewed

	@@ -0,0 +1,101 @@

+"""
+llm_engine.py — Hugging Face router chat with provider/model fallbacks.
+The router only serves models that a provider currently supports. The app tries
+the configured HF_CHAT_MODEL first, then falls back to stable models listed in
+the HF chat-completion provider docs.
+"""
+from __future__ import annotations
+import logging
+from typing import Any
+import httpx
+from config import Settings
+logger = logging.getLogger(__name__)
+DEFAULT_CHAT_FALLBACKS = [  # tried in this order by candidate_models(), after any configured model
+    "meta-llama/Llama-3.1-8B-Instruct:cerebras",  # NOTE(review): the ":cerebras" suffix presumably names the provider — confirm
+    "Qwen/Qwen3-4B-Thinking-2507:cerebras",
+    "Qwen/Qwen2.5-7B-Instruct-1M:cerebras",
+    "deepseek-ai/DeepSeek-R1:cerebras",
+    "openai/gpt-oss-120b:cerebras",
+]
def candidate_models(settings: Settings) -> list[str]:
    """Return the chat models to try, in priority order and without duplicates.

    The configured ``settings.hf_chat_model`` comes first when it is truthy,
    followed by every entry of ``DEFAULT_CHAT_FALLBACKS`` not already listed.
    """
    configured = settings.hf_chat_model
    preferred = [configured] if configured else []
    # dict.fromkeys keeps first-seen order, so it doubles as an ordered de-dup.
    return list(dict.fromkeys([*preferred, *DEFAULT_CHAT_FALLBACKS]))
+def _headers(settings: Settings) -> dict[str, str]:
+    return {
+        "Authorization": f"Bearer {settings.hf_token}",
+        "Content-Type": "application/json",
+    }
+def _short_error(resp: httpx.Response) -> str:
+    try:
+        return str(resp.json())[:500]
+    except Exception:
+        return resp.text[:500]
async def chat_completion(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
    max_tokens: int = 256,
    temperature: float = 0.5,
) -> dict[str, Any]:
    """Try each candidate chat model in order until one returns content.

    Args:
        messages: Chat messages, forwarded unchanged as the request's
            ``messages`` field.
        http_client: Async client used to POST to ``settings.hf_chat_url``.
        settings: Supplies the endpoint URL, the token and the preferred model.
        max_tokens: Forwarded as the request's ``max_tokens``.
        temperature: Forwarded as the request's ``temperature``.

    Returns:
        On success, ``{"ok": True, "model", "content", "raw_model"}``, where
        ``model`` is the candidate that was requested and ``raw_model`` is the
        ``model`` field of the response body (``None`` when absent). When
        every candidate fails, ``{"ok": False, "errors": [...]}`` with one
        entry per attempted model.
    """
    errors: list[dict[str, Any]] = []
    for model in candidate_models(settings):
        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        try:
            resp = await http_client.post(
                settings.hf_chat_url,
                json=payload,
                headers=_headers(settings),
                timeout=60.0,
            )
            if not resp.is_success:
                errors.append({"model": model, "status_code": resp.status_code, "error": _short_error(resp)})
                logger.warning("Chat model %s returned HTTP %s; trying next", model, resp.status_code)
                continue
            data = resp.json()
            content = data["choices"][0]["message"]["content"]
            raw_model = data.get("model")
        except Exception as exc:  # noqa: BLE001 - any failure means "try the next model"
            # str(exc) is often empty for timeouts, so record the type as well.
            errors.append(
                {
                    "model": model,
                    "exception": str(exc)[:500],
                    "exception_type": type(exc).__name__,
                }
            )
            logger.warning("Chat model %s failed with %s: %s", model, type(exc).__name__, exc)
            continue
        if content is None:
            # A null content would otherwise be returned as the text "None".
            errors.append({"model": model, "error": "response contained no message content"})
            logger.warning("Chat model %s returned no message content; trying next", model)
            continue
        return {
            "ok": True,
            "model": model,
            "content": str(content).strip(),
            "raw_model": raw_model,
        }
    return {"ok": False, "errors": errors}
async def chat_text(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
) -> str:
    """Return only the reply text produced by ``chat_completion``.

    Raises:
        RuntimeError: If no candidate model produced a reply; the message
            embeds the per-model errors that were collected.
    """
    outcome = await chat_completion(messages, http_client, settings)
    if not outcome.get("ok"):
        raise RuntimeError(f"All chat models failed: {outcome.get('errors')}")
    return str(outcome["content"])