| """ | |
| llm_engine.py — Hugging Face router chat with Ollama + local rules fallback. | |
| Primary path: Hugging Face router chat completions. | |
| Fallback 1: optional external Ollama endpoint. | |
| Fallback 2: deterministic local receptionist rules. | |
| """ | |
from __future__ import annotations

import logging
from typing import Any

import httpx

from config import Settings
import ollama_engine

logger = logging.getLogger(__name__)
# Models tried in order when no explicit model is configured. The ":cerebras"
# suffix pins the Hugging Face router to a specific inference provider.
DEFAULT_CHAT_FALLBACKS = [
    "meta-llama/Llama-3.1-8B-Instruct:cerebras",
    "openai/gpt-oss-120b:cerebras",
    "Qwen/Qwen3-4B-Thinking-2507:cerebras",
    "Qwen/Qwen2.5-7B-Instruct-1M:cerebras",
    "deepseek-ai/DeepSeek-R1:cerebras",
]


def candidate_models(settings: Settings) -> list[str]:
    """Return the configured model first, then the default fallbacks, deduplicated."""
    models: list[str] = []
    if settings.hf_chat_model:
        models.append(settings.hf_chat_model)
    for model in DEFAULT_CHAT_FALLBACKS:
        if model not in models:
            models.append(model)
    return models
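
# Example (illustrative): if settings.hf_chat_model is
# "openai/gpt-oss-120b:cerebras", candidate_models() returns it first and
# skips the duplicate entry in DEFAULT_CHAT_FALLBACKS.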


def _headers(settings: Settings) -> dict[str, str]:
    """Bearer-token headers for the Hugging Face router."""
    return {
        "Authorization": f"Bearer {settings.hf_token}",
        "Content-Type": "application/json",
    }


def _short_error(resp: httpx.Response) -> str:
    """Return a truncated error body, preferring JSON when the response parses."""
    try:
        return str(resp.json())[:500]
    except Exception:  # any parse failure falls back to the raw text body
        return resp.text[:500]


def _last_user_text(messages: list[dict[str, Any]]) -> str:
    """Return the content of the most recent user message, or an empty string."""
    for message in reversed(messages):
        if message.get("role") == "user":
            return str(message.get("content", "")).strip()
    return ""


def local_receptionist_reply(messages: list[dict[str, Any]], business_name: str = "the business") -> str:
    """Small deterministic fallback for beta phone-call continuity."""
    user_text = _last_user_text(messages)
    if not user_text:
        return "Hi, this is LINGO, your AI assistant. How can I help you today?"
    lower = user_text.lower()
    # "say lingo_ok" already contains "lingo_ok", so one substring check suffices.
    if "lingo_ok" in lower:
        return "LINGO_OK"
    if any(word in lower for word in ["human", "operator", "real person", "representative", "someone"]):
        return "Of course — let me connect you with a team member right away. Please hold for just a moment.\n[ESCALATE]"
    if any(word in lower for word in ["appointment", "book", "booking", "schedule", "reservation"]):
        return (
            "I can help take an appointment request. Please tell me your full name, preferred date and time, "
            "and the reason for the appointment. Someone from the team will confirm it shortly."
        )
    if any(word in lower for word in ["message", "call me", "call back", "voicemail"]):
        return (
            "I can take a message. Please share your name, the best callback number, "
            "and what you would like the team to know."
        )
    if any(word in lower for word in ["hours", "open", "close", "address", "price", "cost"]):
        return (
            f"I do not have that specific detail for {business_name} on hand yet. "
            "I can take a message and have someone follow up with you."
        )
    return (
        f"Thanks for calling {business_name}. I can answer basic questions, take a message, "
        "or collect an appointment request. How can I help?"
    )
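
# Illustrative behavior (not exhaustive): a booking request such as
#   local_receptionist_reply([{"role": "user", "content": "Can I book a table?"}])
# matches the appointment keywords and returns the appointment-request prompt,
# while an empty transcript yields the LINGO greeting.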


async def chat_completion(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
    max_tokens: int = 256,
    temperature: float = 0.5,
    allow_local_fallback: bool = True,
) -> dict[str, Any]:
    """Try HF chat models, then Ollama, then local beta rules if needed."""
    errors: list[dict[str, Any]] = []
    for model in candidate_models(settings):
        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        try:
            resp = await http_client.post(
                settings.hf_chat_url,
                json=payload,
                headers=_headers(settings),
                timeout=60.0,
            )
            if not resp.is_success:
                errors.append({"model": model, "status_code": resp.status_code, "error": _short_error(resp)})
                continue
            data = resp.json()
            content = data["choices"][0]["message"]["content"]
            return {
                "ok": True,
                "source": "hf_provider",
                "provider_chat_ok": True,
                "ollama_ok": False,
                "model": model,
                "content": str(content).strip(),
                "raw_model": data.get("model"),
            }
        except Exception as exc:  # noqa: BLE001 - record the failure and try the next model
            errors.append({"model": model, "exception": str(exc)[:500]})
    # Fallback 1: external Ollama endpoint, if configured.
    if allow_local_fallback and ollama_engine.enabled(settings):
        ollama_result = await ollama_engine.chat_completion(messages, http_client, settings)
        if ollama_result.get("ok"):
            return {
                **ollama_result,
                "provider_chat_ok": False,
                "ollama_ok": True,
                "provider_errors": errors,
                "warning": "HF provider chat failed; using external Ollama fallback.",
            }
        errors.append({"model": "ollama", "error": ollama_result})
    # Fallback 2: deterministic local receptionist rules.
    if allow_local_fallback:
        return {
            "ok": True,
            "source": "local_rules_fallback",
            "provider_chat_ok": False,
            "ollama_ok": False,
            "model": "local-receptionist-rules",
            "content": local_receptionist_reply(messages, settings.business_name),
            "provider_errors": errors,
            "warning": "HF provider chat failed; using local beta rules fallback.",
        }
    return {"ok": False, "errors": errors}


async def chat_text(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
) -> str:
    """Return just the reply text, raising if every backend failed."""
    result = await chat_completion(messages, http_client, settings)
    if result.get("ok"):
        return str(result["content"])
    raise RuntimeError(f"All chat models failed: {result.get('errors')}")
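

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the serving path. It assumes
    # config.Settings() can be constructed with no arguments (e.g. from
    # environment variables); adjust to the real constructor if that
    # assumption does not hold.
    import asyncio

    async def _demo() -> None:
        settings = Settings()  # assumption: env-driven defaults
        async with httpx.AsyncClient() as client:
            reply = await chat_text(
                [{"role": "user", "content": "What are your opening hours?"}],
                client,
                settings,
            )
            print(reply)

    asyncio.run(_demo())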