"""
llm_engine.py — Hugging Face router chat with Ollama + local rules fallback.
Primary path: Hugging Face router chat completions.
Fallback 1: optional external Ollama endpoint.
Fallback 2: deterministic local receptionist rules.
"""
from __future__ import annotations

import logging
from typing import Any

import httpx

from config import Settings
import ollama_engine

logger = logging.getLogger(__name__)

# Router models tried in order after any configured override; the ":cerebras"
# suffix asks the Hugging Face router to serve the model via the Cerebras provider.
DEFAULT_CHAT_FALLBACKS = [
    "meta-llama/Llama-3.1-8B-Instruct:cerebras",
    "openai/gpt-oss-120b:cerebras",
    "Qwen/Qwen3-4B-Thinking-2507:cerebras",
    "Qwen/Qwen2.5-7B-Instruct-1M:cerebras",
    "deepseek-ai/DeepSeek-R1:cerebras",
]


def candidate_models(settings: Settings) -> list[str]:
    models: list[str] = []
    if settings.hf_chat_model:
        models.append(settings.hf_chat_model)
    for model in DEFAULT_CHAT_FALLBACKS:
        if model not in models:
            models.append(model)
    return models
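
# Illustrative sketch only (not executed): assuming a Settings object whose
# hf_chat_model is "my-org/custom-model", the override is tried first and the
# defaults follow, deduplicated:
#
#   candidate_models(settings)
#   # -> ["my-org/custom-model",
#   #     "meta-llama/Llama-3.1-8B-Instruct:cerebras",
#   #     "openai/gpt-oss-120b:cerebras",
#   #     ...]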


def _headers(settings: Settings) -> dict[str, str]:
    return {
        "Authorization": f"Bearer {settings.hf_token}",
        "Content-Type": "application/json",
    }


def _short_error(resp: httpx.Response) -> str:
    try:
        return str(resp.json())[:500]
    except Exception:
        return resp.text[:500]


def _last_user_text(messages: list[dict[str, Any]]) -> str:
    for message in reversed(messages):
        if message.get("role") == "user":
            return str(message.get("content", "")).strip()
    return ""


def local_receptionist_reply(messages: list[dict[str, Any]], business_name: str = "the business") -> str:
    """Small deterministic fallback for beta phone-call continuity."""
    user_text = _last_user_text(messages)
    if not user_text:
        return "Hi, this is LINGO, your AI assistant. How can I help you today?"
    lower = user_text.lower()
    if "lingo_ok" in lower:
        return "LINGO_OK"
    if any(word in lower for word in ["human", "operator", "real person", "representative", "someone"]):
        return "Of course — let me connect you with a team member right away. Please hold for just a moment.\n[ESCALATE]"
    if any(word in lower for word in ["appointment", "book", "booking", "schedule", "reservation"]):
        return (
            "I can help take an appointment request. Please tell me your full name, preferred date and time, "
            "and the reason for the appointment. Someone from the team will confirm it shortly."
        )
    if any(word in lower for word in ["message", "call me", "call back", "voicemail"]):
        return (
            "I can take a message. Please share your name, the best callback number, "
            "and what you would like the team to know."
        )
    if any(word in lower for word in ["hours", "open", "close", "address", "price", "cost"]):
        return (
            f"I do not have that specific detail for {business_name} on hand yet. "
            "I can take a message and have someone follow up with you."
        )
    return (
        f"Thanks for calling {business_name}. I can answer basic questions, take a message, "
        "or collect an appointment request. How can I help?"
    )
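
# Hedged examples of the rules above; the inputs are invented for illustration
# and the outputs follow directly from the branches:
#
#   local_receptionist_reply([{"role": "user", "content": "Can I book a table?"}])
#   # -> the appointment-request prompt ("book" matches the booking branch)
#   local_receptionist_reply([{"role": "user", "content": "Get me a real person"}])
#   # -> the hold-for-a-team-member reply ending in "[ESCALATE]"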


async def chat_completion(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
    max_tokens: int = 256,
    temperature: float = 0.5,
    allow_local_fallback: bool = True,
) -> dict[str, Any]:
    """Try HF chat models in order, then Ollama, then local beta rules if needed."""
    errors: list[dict[str, Any]] = []
    for model in candidate_models(settings):
        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        try:
            resp = await http_client.post(
                settings.hf_chat_url,
                json=payload,
                headers=_headers(settings),
                timeout=60.0,
            )
            if not resp.is_success:
                errors.append({"model": model, "status_code": resp.status_code, "error": _short_error(resp)})
                continue
            data = resp.json()
            content = data["choices"][0]["message"]["content"]
            return {
                "ok": True,
                "source": "hf_provider",
                "provider_chat_ok": True,
                "ollama_ok": False,
                "model": model,
                "content": str(content).strip(),
                "raw_model": data.get("model"),
            }
        except Exception as exc:  # noqa: BLE001 - record the failure and try the next model
            errors.append({"model": model, "exception": str(exc)[:500]})
    # Fallback 1: optional external Ollama endpoint, if configured.
    if allow_local_fallback and ollama_engine.enabled(settings):
        ollama_result = await ollama_engine.chat_completion(messages, http_client, settings)
        if ollama_result.get("ok"):
            return {
                **ollama_result,
                "provider_chat_ok": False,
                "ollama_ok": True,
                "provider_errors": errors,
                "warning": "HF provider chat failed; using external Ollama fallback.",
            }
        errors.append({"model": "ollama", "error": ollama_result})
    # Fallback 2: deterministic local receptionist rules, so calls never go silent.
    if allow_local_fallback:
        return {
            "ok": True,
            "source": "local_rules_fallback",
            "provider_chat_ok": False,
            "ollama_ok": False,
            "model": "local-receptionist-rules",
            "content": local_receptionist_reply(messages, settings.business_name),
            "provider_errors": errors,
            "warning": "HF provider chat failed; using local beta rules fallback.",
        }
    return {"ok": False, "errors": errors}


async def chat_text(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
) -> str:
    result = await chat_completion(messages, http_client, settings)
    if result.get("ok"):
        return str(result["content"])
    raise RuntimeError(f"All chat models failed: {result.get('errors')}")
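

if __name__ == "__main__":
    # Minimal manual smoke test, not used by the service. It assumes Settings()
    # can be constructed as-is (e.g. from environment variables); the actual
    # constructor lives in config.py and may differ.
    import asyncio

    async def _demo() -> None:
        settings = Settings()  # assumption: default construction works locally
        async with httpx.AsyncClient() as client:
            reply = await chat_text(
                [{"role": "user", "content": "What are your opening hours?"}],
                client,
                settings,
            )
            print(reply)

    asyncio.run(_demo())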