"""
llm_engine.py — Hugging Face router chat with Ollama + local rules fallback.

Primary path: Hugging Face router chat completions.
Fallback 1: optional external Ollama endpoint.
Fallback 2: deterministic local receptionist rules.
"""

from __future__ import annotations

import logging
from typing import Any

import httpx

from config import Settings
import ollama_engine

logger = logging.getLogger(__name__)

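# Tried in order after any explicitly configured model; the ":cerebras" suffix
# pins each model to the Cerebras provider on the Hugging Face router.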
DEFAULT_CHAT_FALLBACKS = [
    "meta-llama/Llama-3.1-8B-Instruct:cerebras",
    "openai/gpt-oss-120b:cerebras",
    "Qwen/Qwen3-4B-Thinking-2507:cerebras",
    "Qwen/Qwen2.5-7B-Instruct-1M:cerebras",
    "deepseek-ai/DeepSeek-R1:cerebras",
]


def candidate_models(settings: Settings) -> list[str]:
    """Return the configured model (if any) first, then the defaults, preserving order without duplicates."""
    models: list[str] = []
    if settings.hf_chat_model:
        models.append(settings.hf_chat_model)
    for model in DEFAULT_CHAT_FALLBACKS:
        if model not in models:
            models.append(model)
    return models


def _headers(settings: Settings) -> dict[str, str]:
    return {
        "Authorization": f"Bearer {settings.hf_token}",
        "Content-Type": "application/json",
    }


def _short_error(resp: httpx.Response) -> str:
    try:
        return str(resp.json())[:500]
    except Exception:
        return resp.text[:500]


def _last_user_text(messages: list[dict[str, Any]]) -> str:
    for message in reversed(messages):
        if message.get("role") == "user":
            return str(message.get("content", "")).strip()
    return ""


def local_receptionist_reply(messages: list[dict[str, Any]], business_name: str = "the business") -> str:
    """Small deterministic fallback for beta phone-call continuity."""
    user_text = _last_user_text(messages)
    lower = user_text.lower()

    if "lingo_ok" in lower or "say lingo_ok" in lower:
        return "LINGO_OK"

    if any(word in lower for word in ["human", "operator", "real person", "representative", "someone"]):
        return "Of course — let me connect you with a team member right away. Please hold for just a moment.\n[ESCALATE]"

    if any(word in lower for word in ["appointment", "book", "booking", "schedule", "reservation"]):
        return (
            "I can help take an appointment request. Please tell me your full name, preferred date and time, "
            "and the reason for the appointment. Someone from the team will confirm it shortly."
        )

    if any(word in lower for word in ["message", "call me", "call back", "voicemail"]):
        return (
            "I can take a message. Please share your name, the best callback number, "
            "and what you would like the team to know."
        )

    if any(word in lower for word in ["hours", "open", "close", "address", "price", "cost"]):
        return (
            f"I do not have that specific detail for {business_name} on hand yet. "
            "I can take a message and have someone follow up with you."
        )

    if not user_text:
        return "Hi, this is LINGO, your AI assistant. How can I help you today?"

    return (
        f"Thanks for calling {business_name}. I can answer basic questions, take a message, "
        "or collect an appointment request. How can I help?"
    )


async def chat_completion(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
    max_tokens: int = 256,
    temperature: float = 0.5,
    allow_local_fallback: bool = True,
) -> dict[str, Any]:
    """Try HF chat models, then Ollama, then local beta rules if needed."""
    errors: list[dict[str, Any]] = []
    for model in candidate_models(settings):
        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
        try:
            resp = await http_client.post(
                settings.hf_chat_url,
                json=payload,
                headers=_headers(settings),
                timeout=60.0,
            )
            if not resp.is_success:
                errors.append({"model": model, "status_code": resp.status_code, "error": _short_error(resp)})
                continue
            data = resp.json()
            content = data["choices"][0]["message"]["content"]
            return {
                "ok": True,
                "source": "hf_provider",
                "provider_chat_ok": True,
                "ollama_ok": False,
                "model": model,
                "content": str(content).strip(),
                "raw_model": data.get("model"),
            }
        except Exception as exc:  # noqa: BLE001
            errors.append({"model": model, "exception": str(exc)[:500]})

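    # Fallback 1: an external Ollama endpoint, when one is configured.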
    if allow_local_fallback and ollama_engine.enabled(settings):
        ollama_result = await ollama_engine.chat_completion(messages, http_client, settings)
        if ollama_result.get("ok"):
            return {
                **ollama_result,
                "provider_chat_ok": False,
                "ollama_ok": True,
                "provider_errors": errors,
                "warning": "HF provider chat failed; using external Ollama fallback.",
            }
        errors.append({"model": "ollama", "error": ollama_result})

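    # Fallback 2: deterministic local rules keep the beta call flowing.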
    if allow_local_fallback:
        return {
            "ok": True,
            "source": "local_rules_fallback",
            "provider_chat_ok": False,
            "ollama_ok": False,
            "model": "local-receptionist-rules",
            "content": local_receptionist_reply(messages, settings.business_name),
            "provider_errors": errors,
            "warning": "HF provider chat failed; using local beta rules fallback.",
        }

    return {"ok": False, "errors": errors}


async def chat_text(
    messages: list[dict[str, Any]],
    http_client: httpx.AsyncClient,
    settings: Settings,
) -> str:
    """Return just the assistant reply text, raising if every chat path fails."""
    result = await chat_completion(messages, http_client, settings)
    if result.get("ok"):
        return str(result["content"])
    raise RuntimeError(f"All chat models failed: {result.get('errors')}")
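

# --- Usage sketch ---
# A minimal smoke test for the fallback chain, run directly from the command
# line. Assumption: config.Settings() can be constructed with no arguments
# (e.g. it reads environment variables); adjust to however Settings is
# actually loaded in this project.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        settings = Settings()  # assumption: zero-arg, env-driven construction
        async with httpx.AsyncClient() as client:
            reply = await chat_text(
                [{"role": "user", "content": "Can I book an appointment for Friday at 3pm?"}],
                client,
                settings,
            )
            print(reply)

    asyncio.run(_demo())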