"""
llm_engine.py — Hugging Face router chat with Ollama + local rules fallback.

Primary path: Hugging Face router chat completions.
Fallback 1: optional external Ollama endpoint.
Fallback 2: deterministic local receptionist rules.
"""

from __future__ import annotations

import logging
from typing import Any

import httpx

from config import Settings
import ollama_engine

logger = logging.getLogger(__name__)

DEFAULT_CHAT_FALLBACKS = [
    "meta-llama/Llama-3.1-8B-Instruct:cerebras",
    "openai/gpt-oss-120b:cerebras",
    "Qwen/Qwen3-4B-Thinking-2507:cerebras",
    "Qwen/Qwen2.5-7B-Instruct-1M:cerebras",
    "deepseek-ai/DeepSeek-R1:cerebras",
]

def candidate_models(settings: Settings) -> list[str]:
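    """Return the ordered list of HF chat models to try, configured model first."""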
    models: list[str] = []
    if settings.hf_chat_model:
        models.append(settings.hf_chat_model)
    for model in DEFAULT_CHAT_FALLBACKS:
        if model not in models:
            models.append(model)
    return models

def _headers(settings: Settings) -> dict[str, str]:
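    """Build bearer-auth JSON headers for the HF router request."""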
    return {
        "Authorization": f"Bearer {settings.hf_token}",
        "Content-Type": "application/json",
    }

def _short_error(resp: httpx.Response) -> str:
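    """Return a truncated (500-char) error body for diagnostics."""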
    try:
        return str(resp.json())[:500]
    except Exception:
        return resp.text[:500]

def _last_user_text(messages: list[dict[str, Any]]) -> str:
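    """Return the most recent user message content, or an empty string."""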
    for message in reversed(messages):
        if message.get("role") == "user":
            return str(message.get("content", "")).strip()
    return ""

def local_receptionist_reply(messages: list[dict[str, Any]], business_name: str = "the business") -> str:
"""Small deterministic fallback for beta phone-call continuity."""
user_text = _last_user_text(messages)
lower = user_text.lower()
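    # Simple keyword routing: LINGO_OK sentinel, human escalation, appointments,
    # message-taking, and business-info questions, in that order.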
if "lingo_ok" in lower or "say lingo_ok" in lower:
return "LINGO_OK"
if any(word in lower for word in ["human", "operator", "real person", "representative", "someone"]):
return "Of course — let me connect you with a team member right away. Please hold for just a moment.\n[ESCALATE]"
if any(word in lower for word in ["appointment", "book", "booking", "schedule", "reservation"]):
return (
"I can help take an appointment request. Please tell me your full name, preferred date and time, "
"and the reason for the appointment. Someone from the team will confirm it shortly."
)
if any(word in lower for word in ["message", "call me", "call back", "voicemail"]):
return (
"I can take a message. Please share your name, the best callback number, "
"and what you would like the team to know."
)
if any(word in lower for word in ["hours", "open", "close", "address", "price", "cost"]):
return (
f"I do not have that specific detail for {business_name} on hand yet. "
"I can take a message and have someone follow up with you."
)
if not user_text:
return "Hi, this is LINGO, your AI assistant. How can I help you today?"
return (
f"Thanks for calling {business_name}. I can answer basic questions, take a message, "
"or collect an appointment request. How can I help?"
)
async def chat_completion(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
    max_tokens: int = 256,
    temperature: float = 0.5,
    allow_local_fallback: bool = True,
) -> dict[str, Any]:
"""Try HF chat models, then Ollama, then local beta rules if needed."""
errors: list[dict[str, Any]] = []
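    # Primary path: try each candidate HF router model in order until one succeeds.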
    for model in candidate_models(settings):
        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        try:
            resp = await http_client.post(
                settings.hf_chat_url,
                json=payload,
                headers=_headers(settings),
                timeout=60.0,
            )
            if not resp.is_success:
                errors.append({"model": model, "status_code": resp.status_code, "error": _short_error(resp)})
                continue
            data = resp.json()
            content = data["choices"][0]["message"]["content"]
            return {
                "ok": True,
                "source": "hf_provider",
                "provider_chat_ok": True,
                "ollama_ok": False,
                "model": model,
                "content": str(content).strip(),
                "raw_model": data.get("model"),
            }
        except Exception as exc:  # noqa: BLE001
            errors.append({"model": model, "exception": str(exc)[:500]})
    if allow_local_fallback and ollama_engine.enabled(settings):
        ollama_result = await ollama_engine.chat_completion(messages, http_client, settings)
        if ollama_result.get("ok"):
            return {
                **ollama_result,
                "provider_chat_ok": False,
                "ollama_ok": True,
                "provider_errors": errors,
                "warning": "HF provider chat failed; using external Ollama fallback.",
            }
        errors.append({"model": "ollama", "error": ollama_result})
    if allow_local_fallback:
        return {
            "ok": True,
            "source": "local_rules_fallback",
            "provider_chat_ok": False,
            "ollama_ok": False,
            "model": "local-receptionist-rules",
            "content": local_receptionist_reply(messages, settings.business_name),
            "provider_errors": errors,
            "warning": "HF provider chat failed; using local beta rules fallback.",
        }
    return {"ok": False, "errors": errors}

async def chat_text(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
) -> str:
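    """Return just the reply text, raising RuntimeError if every backend failed."""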
    result = await chat_completion(messages, http_client, settings)
    if result.get("ok"):
        return str(result["content"])
    raise RuntimeError(f"All chat models failed: {result.get('errors')}")
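
# ---------------------------------------------------------------------------
# Usage sketch (commented out, not executed): how a caller might drive the
# fallback chain end to end via chat_text. Assumes Settings() can be built
# with the HF token, chat URL, and business name loaded from the environment
# via config.py; adjust construction to your setup.
#
#     import asyncio
#     import httpx
#     from config import Settings
#     import llm_engine
#
#     async def main() -> None:
#         settings = Settings()
#         async with httpx.AsyncClient() as client:
#             reply = await llm_engine.chat_text(
#                 [{"role": "user", "content": "Can I book an appointment for Friday?"}],
#                 client,
#                 settings,
#             )
#             print(reply)
#
#     asyncio.run(main())
# ---------------------------------------------------------------------------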