"""
llm_engine.py — Hugging Face router chat with Ollama + local rules fallback.

Primary path: Hugging Face router chat completions.
Fallback 1: optional external Ollama endpoint.
Fallback 2: deterministic local receptionist rules.
"""

from __future__ import annotations

import logging
from typing import Any

import httpx

from config import Settings
import ollama_engine

logger = logging.getLogger(__name__)

DEFAULT_CHAT_FALLBACKS = [
    "meta-llama/Llama-3.1-8B-Instruct:cerebras",
    "openai/gpt-oss-120b:cerebras",
    "Qwen/Qwen3-4B-Thinking-2507:cerebras",
    "Qwen/Qwen2.5-7B-Instruct-1M:cerebras",
    "deepseek-ai/DeepSeek-R1:cerebras",
]

def candidate_models(settings: Settings) -> list[str]:
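    """Return the ordered list of HF chat models to try, configured model first."""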
    models: list[str] = []
    if settings.hf_chat_model:
        models.append(settings.hf_chat_model)
    for model in DEFAULT_CHAT_FALLBACKS:
        if model not in models:
            models.append(model)
    return models

def _headers(settings: Settings) -> dict[str, str]:
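    """Build bearer-auth JSON headers for the HF router request."""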
    return {
        "Authorization": f"Bearer {settings.hf_token}",
        "Content-Type": "application/json",
    }

def _short_error(resp: httpx.Response) -> str:
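    """Return a truncated (500-char) error body for diagnostics."""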
    try:
        return str(resp.json())[:500]
    except Exception:
        return resp.text[:500]

def _last_user_text(messages: list[dict[str, Any]]) -> str:
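    """Return the most recent user message content, or an empty string."""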
    for message in reversed(messages):
        if message.get("role") == "user":
            return str(message.get("content", "")).strip()
    return ""

def local_receptionist_reply(messages: list[dict[str, Any]], business_name: str = "the business") -> str:
"""Small deterministic fallback for beta phone-call continuity."""
user_text = _last_user_text(messages)
lower = user_text.lower()
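    # Simple keyword routing: LINGO_OK sentinel, human escalation, appointments,
    # message-taking, and business-info questions, in that order.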
if "lingo_ok" in lower or "say lingo_ok" in lower:
return "LINGO_OK"
if any(word in lower for word in ["human", "operator", "real person", "representative", "someone"]):
return "Of course — let me connect you with a team member right away. Please hold for just a moment.\n[ESCALATE]"
if any(word in lower for word in ["appointment", "book", "booking", "schedule", "reservation"]):
return (
"I can help take an appointment request. Please tell me your full name, preferred date and time, "
"and the reason for the appointment. Someone from the team will confirm it shortly."
)
if any(word in lower for word in ["message", "call me", "call back", "voicemail"]):
return (
"I can take a message. Please share your name, the best callback number, "
"and what you would like the team to know."
)
if any(word in lower for word in ["hours", "open", "close", "address", "price", "cost"]):
return (
f"I do not have that specific detail for {business_name} on hand yet. "
"I can take a message and have someone follow up with you."
)
if not user_text:
return "Hi, this is LINGO, your AI assistant. How can I help you today?"
return (
f"Thanks for calling {business_name}. I can answer basic questions, take a message, "
"or collect an appointment request. How can I help?"
)
async def chat_completion(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
    max_tokens: int = 256,
    temperature: float = 0.5,
    allow_local_fallback: bool = True,
) -> dict[str, Any]:
"""Try HF chat models, then Ollama, then local beta rules if needed."""
errors: list[dict[str, Any]] = []
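    # Primary path: try each candidate HF router model in order until one succeeds.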
    for model in candidate_models(settings):
        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        try:
            resp = await http_client.post(
                settings.hf_chat_url,
                json=payload,
                headers=_headers(settings),
                timeout=60.0,
            )
            if not resp.is_success:
                errors.append({"model": model, "status_code": resp.status_code, "error": _short_error(resp)})
                continue
            data = resp.json()
            content = data["choices"][0]["message"]["content"]
            return {
                "ok": True,
                "source": "hf_provider",
                "provider_chat_ok": True,
                "ollama_ok": False,
                "model": model,
                "content": str(content).strip(),
                "raw_model": data.get("model"),
            }
        except Exception as exc:  # noqa: BLE001
            errors.append({"model": model, "exception": str(exc)[:500]})
    if allow_local_fallback and ollama_engine.enabled(settings):
        ollama_result = await ollama_engine.chat_completion(messages, http_client, settings)
        if ollama_result.get("ok"):
            return {
                **ollama_result,
                "provider_chat_ok": False,
                "ollama_ok": True,
                "provider_errors": errors,
                "warning": "HF provider chat failed; using external Ollama fallback.",
            }
        errors.append({"model": "ollama", "error": ollama_result})
    if allow_local_fallback:
        return {
            "ok": True,
            "source": "local_rules_fallback",
            "provider_chat_ok": False,
            "ollama_ok": False,
            "model": "local-receptionist-rules",
            "content": local_receptionist_reply(messages, settings.business_name),
            "provider_errors": errors,
            "warning": "HF provider chat failed; using local beta rules fallback.",
        }
    return {"ok": False, "errors": errors}

async def chat_text(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
) -> str:
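    """Return just the reply text, raising RuntimeError if every backend failed."""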
    result = await chat_completion(messages, http_client, settings)
    if result.get("ok"):
        return str(result["content"])
    raise RuntimeError(f"All chat models failed: {result.get('errors')}")
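
# ---------------------------------------------------------------------------
# Usage sketch (commented out, not executed): how a caller might drive the
# fallback chain end to end via chat_text. Assumes Settings() can be built
# with the HF token, chat URL, and business name loaded from the environment
# via config.py; adjust construction to your setup.
#
#     import asyncio
#     import httpx
#     from config import Settings
#     import llm_engine
#
#     async def main() -> None:
#         settings = Settings()
#         async with httpx.AsyncClient() as client:
#             reply = await llm_engine.chat_text(
#                 [{"role": "user", "content": "Can I book an appointment for Friday?"}],
#                 client,
#                 settings,
#             )
#             print(reply)
#
#     asyncio.run(main())
# ---------------------------------------------------------------------------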