""" llm_engine.py — Hugging Face router chat with Ollama + local rules fallback. Primary path: Hugging Face router chat completions. Fallback 1: optional external Ollama endpoint. Fallback 2: deterministic local receptionist rules. """ from __future__ import annotations import logging from typing import Any import httpx from config import Settings import ollama_engine logger = logging.getLogger(__name__) DEFAULT_CHAT_FALLBACKS = [ "meta-llama/Llama-3.1-8B-Instruct:cerebras", "openai/gpt-oss-120b:cerebras", "Qwen/Qwen3-4B-Thinking-2507:cerebras", "Qwen/Qwen2.5-7B-Instruct-1M:cerebras", "deepseek-ai/DeepSeek-R1:cerebras", ] def candidate_models(settings: Settings) -> list[str]: models: list[str] = [] if settings.hf_chat_model: models.append(settings.hf_chat_model) for model in DEFAULT_CHAT_FALLBACKS: if model not in models: models.append(model) return models def _headers(settings: Settings) -> dict[str, str]: return { "Authorization": f"Bearer {settings.hf_token}", "Content-Type": "application/json", } def _short_error(resp: httpx.Response) -> str: try: return str(resp.json())[:500] except Exception: return resp.text[:500] def _last_user_text(messages: list[dict[str, Any]]) -> str: for message in reversed(messages): if message.get("role") == "user": return str(message.get("content", "")).strip() return "" def local_receptionist_reply(messages: list[dict[str, Any]], business_name: str = "the business") -> str: """Small deterministic fallback for beta phone-call continuity.""" user_text = _last_user_text(messages) lower = user_text.lower() if "lingo_ok" in lower or "say lingo_ok" in lower: return "LINGO_OK" if any(word in lower for word in ["human", "operator", "real person", "representative", "someone"]): return "Of course — let me connect you with a team member right away. Please hold for just a moment.\n[ESCALATE]" if any(word in lower for word in ["appointment", "book", "booking", "schedule", "reservation"]): return ( "I can help take an appointment request. Please tell me your full name, preferred date and time, " "and the reason for the appointment. Someone from the team will confirm it shortly." ) if any(word in lower for word in ["message", "call me", "call back", "voicemail"]): return ( "I can take a message. Please share your name, the best callback number, " "and what you would like the team to know." ) if any(word in lower for word in ["hours", "open", "close", "address", "price", "cost"]): return ( f"I do not have that specific detail for {business_name} on hand yet. " "I can take a message and have someone follow up with you." ) if not user_text: return "Hi, this is LINGO, your AI assistant. How can I help you today?" return ( f"Thanks for calling {business_name}. I can answer basic questions, take a message, " "or collect an appointment request. How can I help?" 
)


async def chat_completion(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
    max_tokens: int = 256,
    temperature: float = 0.5,
    allow_local_fallback: bool = True,
) -> dict[str, Any]:
    """Try HF chat models in order, then Ollama, then local beta rules if needed."""
    errors: list[dict[str, Any]] = []
    for model in candidate_models(settings):
        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        try:
            resp = await http_client.post(
                settings.hf_chat_url,
                json=payload,
                headers=_headers(settings),
                timeout=60.0,
            )
            if not resp.is_success:
                errors.append({"model": model, "status_code": resp.status_code, "error": _short_error(resp)})
                continue
            data = resp.json()
            content = data["choices"][0]["message"]["content"]
            return {
                "ok": True,
                "source": "hf_provider",
                "provider_chat_ok": True,
                "ollama_ok": False,
                "model": model,
                "content": str(content).strip(),
                "raw_model": data.get("model"),
            }
        except Exception as exc:  # noqa: BLE001
            # Any failure (network, JSON shape, missing keys) just moves on to the next model.
            errors.append({"model": model, "exception": str(exc)[:500]})

    # Fallback 1: external Ollama endpoint, if configured.
    if allow_local_fallback and ollama_engine.enabled(settings):
        ollama_result = await ollama_engine.chat_completion(messages, http_client, settings)
        if ollama_result.get("ok"):
            logger.warning("HF provider chat failed; using external Ollama fallback.")
            return {
                **ollama_result,
                "provider_chat_ok": False,
                "ollama_ok": True,
                "provider_errors": errors,
                "warning": "HF provider chat failed; using external Ollama fallback.",
            }
        errors.append({"model": "ollama", "error": ollama_result})

    # Fallback 2: deterministic local rules, so a live call never goes silent.
    if allow_local_fallback:
        logger.warning("HF provider chat failed; using local beta rules fallback.")
        return {
            "ok": True,
            "source": "local_rules_fallback",
            "provider_chat_ok": False,
            "ollama_ok": False,
            "model": "local-receptionist-rules",
            "content": local_receptionist_reply(messages, settings.business_name),
            "provider_errors": errors,
            "warning": "HF provider chat failed; using local beta rules fallback.",
        }

    return {"ok": False, "errors": errors}


async def chat_text(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
) -> str:
    """Return only the reply text, raising if every path (including fallbacks) failed."""
    result = await chat_completion(messages, http_client, settings)
    if result.get("ok"):
        return str(result["content"])
    raise RuntimeError(f"All chat models failed: {result.get('errors')}")
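

# Minimal usage sketch for local experimentation, not a production entry point.
# Assumptions are labeled: Settings() is presumed constructible with no
# arguments (hypothetical; adjust to however config.Settings is actually
# built), and real HF/Ollama calls need valid credentials on that object.
# Even with no reachable provider, chat_text() still returns a local rules
# reply, because allow_local_fallback defaults to True in chat_completion().
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        settings = Settings()  # hypothetical no-argument construction
        async with httpx.AsyncClient() as client:
            reply = await chat_text(
                [{"role": "user", "content": "I'd like to schedule an appointment."}],
                client,
                settings,
            )
            print(reply)

    asyncio.run(_demo())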