lucsanscartier committed on
Commit
a2eaed6
·
verified ·
1 Parent(s): 6a6ca0b

Add Hugging Face chat fallback engine

Browse files
Files changed (1) hide show
  1. llm_engine.py +101 -0
llm_engine.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ llm_engine.py — Hugging Face router chat with provider/model fallbacks.
3
+
4
+ The router only serves models that a provider currently supports. The app tries
5
+ configured HF_CHAT_MODEL first, then stable fallbacks from the HF chat-completion
6
+ provider docs.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ from typing import Any
13
+
14
+ import httpx
15
+
16
+ from config import Settings
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
# Stable fallback models from the HF chat-completion provider docs; tried in
# order after the user-configured model.
DEFAULT_CHAT_FALLBACKS = [
    "meta-llama/Llama-3.1-8B-Instruct:cerebras",
    "Qwen/Qwen3-4B-Thinking-2507:cerebras",
    "Qwen/Qwen2.5-7B-Instruct-1M:cerebras",
    "deepseek-ai/DeepSeek-R1:cerebras",
    "openai/gpt-oss-120b:cerebras",
]


def candidate_models(settings: Settings) -> list[str]:
    """Return the ordered, de-duplicated list of chat models to try.

    The configured ``settings.hf_chat_model`` (when set) is tried first,
    followed by the stable fallbacks, skipping any duplicate entries.
    """
    preferred = [settings.hf_chat_model] if settings.hf_chat_model else []
    seen: set[str] = set()
    ordered: list[str] = []
    for model in preferred + DEFAULT_CHAT_FALLBACKS:
        if model not in seen:
            seen.add(model)
            ordered.append(model)
    return ordered
37
+
38
+
39
+ def _headers(settings: Settings) -> dict[str, str]:
40
+ return {
41
+ "Authorization": f"Bearer {settings.hf_token}",
42
+ "Content-Type": "application/json",
43
+ }
44
+
45
+
46
+ def _short_error(resp: httpx.Response) -> str:
47
+ try:
48
+ return str(resp.json())[:500]
49
+ except Exception:
50
+ return resp.text[:500]
51
+
52
+
53
async def chat_completion(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
    max_tokens: int = 256,
    temperature: float = 0.5,
    timeout: float = 60.0,
) -> dict[str, Any]:
    """Try candidate chat models in order until one succeeds.

    Args:
        messages: OpenAI-style chat messages (``role``/``content`` dicts).
        http_client: Shared async HTTP client used for the POST requests.
        settings: Supplies ``hf_chat_url``, ``hf_token`` and the preferred
            model (via ``candidate_models``).
        max_tokens: Completion token budget forwarded to the router.
        temperature: Sampling temperature forwarded to the router.
        timeout: Per-request timeout in seconds (previously hard-coded to 60).

    Returns:
        On success: ``{"ok": True, "model": <tried id>, "content": <stripped
        text>, "raw_model": <model reported by the router>}``.
        On failure: ``{"ok": False, "errors": [...]}`` with one entry per
        model attempted.
    """
    # The headers depend only on settings, so build them once outside the loop.
    headers = _headers(settings)
    errors: list[dict[str, Any]] = []
    for model in candidate_models(settings):
        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        try:
            resp = await http_client.post(
                settings.hf_chat_url,
                json=payload,
                headers=headers,
                timeout=timeout,
            )
            if not resp.is_success:
                # HTTP-level failure: record a short summary, try the next model.
                errors.append(
                    {
                        "model": model,
                        "status_code": resp.status_code,
                        "error": _short_error(resp),
                    }
                )
                continue
            data = resp.json()
            # A malformed success payload raises here and is recorded below.
            content = data["choices"][0]["message"]["content"]
            return {
                "ok": True,
                "model": model,
                "content": str(content).strip(),
                "raw_model": data.get("model"),
            }
        except Exception as exc:  # noqa: BLE001 -- record and fall through to next model
            errors.append({"model": model, "exception": str(exc)[:500]})

    return {"ok": False, "errors": errors}
91
+
92
+
93
async def chat_text(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
    *,
    max_tokens: int = 256,
    temperature: float = 0.5,
) -> str:
    """Return the first successful completion's text.

    Thin wrapper over ``chat_completion``. The keyword-only ``max_tokens``
    and ``temperature`` are forwarded; their defaults match the previous
    hard-coded behavior, so existing callers are unaffected.

    Raises:
        RuntimeError: If every candidate model fails; the message includes
            the per-model error summaries.
    """
    result = await chat_completion(
        messages,
        http_client,
        settings,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    if result.get("ok"):
        return str(result["content"])
    raise RuntimeError(f"All chat models failed: {result.get('errors')}")