rsafier committed on
Commit
a1df84d
·
verified ·
1 Parent(s): b0ac406

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. Dockerfile +17 -0
  2. README.md +45 -4
  3. app.py +254 -0
  4. requirements.txt +6 -0
  5. server.py +403 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Minimal serving image for the randyGPT OpenAI-compatible API (server.py).
FROM python:3.11-slim

# HuggingFace Spaces requires user ID 1000
RUN useradd -m -u 1000 user

WORKDIR /home/user/app
# Install dependencies first so Docker layer caching survives code-only edits.
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Only server.py is deployed; app.py (the Gradio UI) is not part of this image.
COPY --chown=user server.py .

# Run as the unprivileged Spaces user.
USER user
ENV HOME=/home/user

# Spaces routes external traffic to port 7860.
EXPOSE 7860

CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,51 @@
1
  ---
2
- title: Randygpt S Space
3
- emoji: ⚑
4
  colorFrom: blue
5
- colorTo: yellow
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: randyGPT
3
+ emoji: 📖
4
  colorFrom: blue
5
+ colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
+ # randyGPT — OpenAI-compatible API
11
+
12
+ A GPT trained from scratch in Rust on 114 Project Gutenberg books.
13
+ Model weights load from [MonumentalSystems/randygpt-s](https://huggingface.co/MonumentalSystems/randygpt-s).
14
+
15
+ ## Endpoints
16
+
17
+ | Method | Path | Description |
18
+ |--------|------|-------------|
19
+ | GET | `/v1/models` | List available models |
20
+ | POST | `/v1/chat/completions` | Generate text (OpenAI-compatible) |
21
+
22
+ ## Usage
23
+
24
+ ```bash
25
+ curl https://monumentalsystems-randygpt-space.hf.space/v1/chat/completions \
26
+ -H 'Content-Type: application/json' \
27
+ -d '{
28
+ "model": "randygpt-s",
29
+ "messages": [{"role": "user", "content": "Once upon a time"}],
30
+ "max_tokens": 200,
31
+ "temperature": 0.8
32
+ }'
33
+ ```
34
+
35
+ ### OpenAI SDK
36
+
37
+ ```python
38
+ from openai import OpenAI
39
+
40
+ client = OpenAI(
41
+ base_url="https://monumentalsystems-randygpt-space.hf.space/v1",
42
+ api_key="none",
43
+ )
44
+
45
+ response = client.chat.completions.create(
46
+ model="randygpt-s",
47
+ messages=[{"role": "user", "content": "Once upon a time"}],
48
+ max_tokens=200,
49
+ )
50
+ print(response.choices[0].message.content)
51
+ ```
app.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py β€” randyGPT HuggingFace Space
3
+ Loads model weights from the Hub; HF hosts the compute.
4
+
5
+ Repo: MonumentalSystems/randygpt-s
6
+ """
7
+
8
+ import json
9
+ import math
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ import gradio as gr
14
+ from pathlib import Path
15
+ from huggingface_hub import hf_hub_download
16
+ from safetensors.torch import load_file
17
+
18
+ # ── Inline model definition (no external import needed in the Space) ──────────
19
+
20
class RandyGPTConfig:
    """Hyper-parameters for the RandyGPT model, with training-time defaults."""

    def __init__(self, **kw):
        # Fall back to the training defaults for any missing key.
        defaults = {
            "vocab_size": 1500,
            "n_embd": 128,
            "n_head": 4,
            "n_layer": 8,
            "block_size": 256,
        }
        for name, value in defaults.items():
            setattr(self, name, kw.get(name, value))
        # Derived sizes.
        self.head_dim = self.n_embd // self.n_head
        self.mlp_dim = 4 * self.n_embd

    @classmethod
    def from_json(cls, path):
        """Build a config from a JSON file of keyword arguments."""
        return cls(**json.loads(Path(path).read_text()))
34
+
35
+
36
def rmsnorm(x, eps=1e-5):
    """RMS-normalise the last dimension (no learned gain)."""
    inv_rms = (x.pow(2).mean(-1, keepdim=True) + eps).rsqrt()
    return x * inv_rms
38
+
39
+
40
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a causal (lower-triangular) mask."""

    def __init__(self, cfg):
        super().__init__()
        self.n_head = cfg.n_head
        self.head_dim = cfg.head_dim
        self.scale = 1.0 / math.sqrt(cfg.head_dim)
        self.wq = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.wk = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.wv = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.wo = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)

    def _split_heads(self, t, B, T):
        # (B, T, C) -> (B, n_head, T, head_dim)
        return t.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

    def forward(self, x):
        B, T, C = x.shape
        q = self._split_heads(self.wq(x), B, T)
        k = self._split_heads(self.wk(x), B, T)
        v = self._split_heads(self.wv(x), B, T)
        scores = q @ k.transpose(-2, -1) * self.scale
        # Disallow attending to future positions.
        mask = torch.full((T, T), float('-inf'), device=x.device).triu(1)
        weights = F.softmax(scores + mask, dim=-1)
        merged = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.wo(merged)
61
+
62
+
63
class MLP(nn.Module):
    """Feed-forward block: squared-ReLU activation between two projections."""

    def __init__(self, cfg):
        super().__init__()
        self.fc1 = nn.Linear(cfg.n_embd, cfg.mlp_dim, bias=False)
        self.fc2 = nn.Linear(cfg.mlp_dim, cfg.n_embd, bias=False)

    def forward(self, x):
        activated = F.relu(self.fc1(x)).pow(2)
        return self.fc2(activated)
71
+
72
+
73
class TransformerBlock(nn.Module):
    """Pre-norm residual block: attention sub-layer, then MLP sub-layer."""

    def __init__(self, cfg):
        super().__init__()
        self.attn = CausalSelfAttention(cfg)
        self.mlp = MLP(cfg)

    def forward(self, x):
        attended = x + self.attn(rmsnorm(x))
        return attended + self.mlp(rmsnorm(attended))
83
+
84
+
85
class RandyGPT(nn.Module):
    """Minimal decoder-only transformer with learned positional embeddings."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.wte = nn.Embedding(cfg.vocab_size, cfg.n_embd)   # token embeddings
        self.wpe = nn.Embedding(cfg.block_size, cfg.n_embd)   # position embeddings
        self.layers = nn.ModuleList([TransformerBlock(cfg) for _ in range(cfg.n_layer)])
        self.lm_head = nn.Linear(cfg.n_embd, cfg.vocab_size, bias=False)

    def forward(self, ids):
        """Return next-token logits of shape (B, T, vocab_size)."""
        B, T = ids.shape
        pos = torch.arange(T, device=ids.device).unsqueeze(0)
        x = self.wte(ids) + self.wpe(pos)
        for block in self.layers:
            x = block(x)
        return self.lm_head(x)

    @torch.no_grad()
    def generate(self, ids, max_new_tokens=200, temperature=0.8, top_p=0.9):
        """Autoregressively sample with nucleus (top-p) sampling.

        Stops early when the <|eos|> token (id 1) is produced.  The
        temperature is floored at 1e-6 to avoid division by zero for
        temperature == 0 (matches the guard already used in server.py).
        """
        self.eval()
        for _ in range(max_new_tokens):
            ctx = ids[:, -self.cfg.block_size:]
            # Guard against temperature == 0 (previously divided unchecked).
            logits = self(ctx)[:, -1, :] / max(temperature, 1e-6)
            probs = F.softmax(logits, dim=-1)
            sp, si = torch.sort(probs, descending=True)
            cum = sp.cumsum(-1)
            # Zero out tokens outside the nucleus; the top token always stays.
            sp[cum - sp > top_p] = 0.0
            sp /= sp.sum()
            nxt = si[0, torch.multinomial(sp[0], 1)]
            ids = torch.cat([ids, nxt.view(1, 1)], dim=1)
            if nxt.item() == 1:  # <|eos|>
                break
        return ids
118
+
119
+
120
+ # ── Tokenizer ─────────────────────────────────────────────────────────────────
121
+
122
class Tokenizer:
    """Greedy BPE tokenizer built from an explicit vocab + merge list."""

    def __init__(self, vocab, merges):
        self.vocab = vocab
        self.t2i = {tok: idx for idx, tok in enumerate(vocab)}
        self.bos = self.t2i.get("<|bos|>", 0)
        self.eos = self.t2i.get("<|eos|>", 1)
        # Map (left_id, right_id) -> merged_id; the first entry per pair wins.
        self.merge_map = {}
        for left, right in merges:
            ids = (self.t2i.get(left), self.t2i.get(right), self.t2i.get(left + right))
            if None not in ids:
                self.merge_map.setdefault((ids[0], ids[1]), ids[2])

    @classmethod
    def from_json(cls, path):
        """Load vocab and merges from a tokenizer.json file."""
        data = json.loads(Path(path).read_text(encoding="utf-8"))
        return cls(data["vocab"], [tuple(m) for m in data["merges"]])

    def _encode_chunk(self, text):
        # Start from single-character tokens, dropping unknown characters.
        tokens = [self.t2i[ch] for ch in text if ch in self.t2i]
        while len(tokens) >= 2:
            # Find the lowest-id (earliest-learned) merge among adjacent pairs.
            candidates = [self.merge_map.get(pair) for pair in zip(tokens, tokens[1:])]
            applicable = [m for m in candidates if m is not None]
            if not applicable:
                break
            best = min(applicable)
            merged, i = [], 0
            while i < len(tokens):
                if i + 1 < len(tokens) and self.merge_map.get((tokens[i], tokens[i + 1])) == best:
                    merged.append(best)
                    i += 2
                else:
                    merged.append(tokens[i])
                    i += 1
            tokens = merged
        return tokens

    def encode(self, text):
        """Encode text; newlines become their own token when in the vocab."""
        nl = self.t2i.get("\n")
        result = []
        lines = text.split("\n")
        for i, line in enumerate(lines):
            result.extend(self._encode_chunk(line))
            if nl is not None and i < len(lines) - 1:
                result.append(nl)
        return result

    def decode(self, ids):
        """Decode ids to text, skipping bos/eos and out-of-range ids."""
        pieces = [self.vocab[i] for i in ids
                  if i not in (self.bos, self.eos) and 0 <= i < len(self.vocab)]
        return "".join(pieces)
172
+
173
+
174
# ── Load model once at startup ────────────────────────────────────────────────

REPO = "MonumentalSystems/randygpt-s"
DEVICE = "cpu"  # HF free-tier Spaces use CPU

print(f"Loading model from {REPO} …")
_cfg_file = hf_hub_download(repo_id=REPO, filename="config.json")
_weights_file = hf_hub_download(repo_id=REPO, filename="model.safetensors")
_tok_file = hf_hub_download(repo_id=REPO, filename="tokenizer.json")

cfg = RandyGPTConfig.from_json(_cfg_file)
tok = Tokenizer.from_json(_tok_file)
model = RandyGPT(cfg)
model.load_state_dict(load_file(_weights_file, device=DEVICE))
model.eval()
print(f"Model ready β€” vocab {cfg.vocab_size}, {cfg.n_layer}LΓ—{cfg.n_embd}D")
190
+
191
+
192
+ # ── Inference ─────────────────────────────────────────────────────────────────
193
+
194
def generate(prompt: str, max_tokens: int, temperature: float, top_p: float) -> str:
    """Run the model on `prompt` and return prompt + continuation as text."""
    cleaned = prompt.strip()
    if not cleaned:
        return "(enter a prompt)"
    token_ids = tok.encode(cleaned)
    if not token_ids:
        return "(could not tokenize prompt)"
    batch = torch.tensor([token_ids], dtype=torch.long)
    generated = model.generate(
        batch,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    return tok.decode(generated[0].tolist())
206
+
207
+
208
# ── Gradio UI ─────────────────────────────────────────────────────────────────

with gr.Blocks(title="randyGPT") as demo:
    gr.Markdown(
        "# randyGPT\n"
        "A GPT-style language model trained from scratch in Rust on 114 Project Gutenberg books.\n\n"
        f"**Model:** `{REPO}` Β· {cfg.n_layer} layers Β· {cfg.n_embd}-dim Β· {cfg.vocab_size}-token BPE vocab"
    )

    with gr.Row():
        with gr.Column(scale=3):
            prompt_box = gr.Textbox(
                label="Prompt",
                placeholder="Once upon a time",
                lines=3,
            )
            output_box = gr.Textbox(label="Generated text", lines=10, interactive=False)
            run_btn = gr.Button("Generate", variant="primary")

        with gr.Column(scale=1):
            max_tok = gr.Slider(20, 200, value=150, step=10, label="Max new tokens")
            temp = gr.Slider(0.1, 2.0, value=0.8, step=0.05, label="Temperature")
            topp = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="Top-p")

    # Button click and textbox submit share identical wiring.
    for trigger in (run_btn.click, prompt_box.submit):
        trigger(fn=generate, inputs=[prompt_box, max_tok, temp, topp], outputs=output_box)

    gr.Examples(
        examples=[
            ["Once upon a time in a land far away"],
            ["It was the best of times, it was the worst of times"],
            ["The old man sat by the fire and"],
            ["She looked out across the sea and wondered"],
        ],
        inputs=prompt_box,
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Runtime dependencies for server.py (the deployed FastAPI service).
fastapi>=0.110.0
uvicorn>=0.29.0
torch>=2.0.0
safetensors>=0.4.0
huggingface_hub>=0.20.0
pydantic>=2.0.0
# NOTE(review): app.py also imports gradio, which is not listed here.  That is
# fine for the Docker image (it only runs server.py), but gradio must be added
# if app.py is ever deployed - TODO confirm intent.
server.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ server.py β€” randyGPT OpenAI-compatible inference server
3
+ Serves POST /v1/chat/completions and GET /v1/models on port 7860.
4
+
5
+ Loads model weights from HuggingFace Hub at startup.
6
+ Compatible with OpenAI SDK, OpenRouter, LangChain, etc.
7
+ """
8
+
9
+ import json
10
+ import math
11
+ import time
12
+ import uuid
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+ from pathlib import Path
17
+ from fastapi import FastAPI, HTTPException, Request
18
+ from fastapi.responses import JSONResponse, StreamingResponse
19
+ from fastapi.middleware.cors import CORSMiddleware
20
+ from fastapi.exceptions import RequestValidationError
21
+ from pydantic import BaseModel
22
+ from typing import List, Optional
23
+ from huggingface_hub import hf_hub_download
24
+ from safetensors.torch import load_file
25
+
26
+ # ── Inline model ──────────────────────────────────────────────────────────────
27
+
28
class Cfg:
    """Model hyper-parameters with training-time defaults."""

    _DEFAULTS = {
        "vocab_size": 1500,
        "n_embd": 128,
        "n_head": 4,
        "n_layer": 8,
        "block_size": 256,
    }

    def __init__(self, **kw):
        for key, default in self._DEFAULTS.items():
            setattr(self, key, kw.get(key, default))
        # Derived sizes.
        self.head_dim = self.n_embd // self.n_head
        self.mlp_dim = 4 * self.n_embd
37
+
38
def rmsnorm(x, eps=1e-5):
    """RMS-normalise the last dimension (no learned gain)."""
    mean_sq = x.pow(2).mean(-1, keepdim=True)
    return x * (mean_sq + eps).rsqrt()
40
+
41
class Attn(nn.Module):
    """Causal multi-head self-attention."""

    def __init__(self, c):
        super().__init__()
        self.nh, self.hd = c.n_head, c.head_dim
        self.sc = 1.0 / math.sqrt(c.head_dim)
        # Parameter names (wq/wk/wv/wo) must stay fixed for load_state_dict.
        self.wq = nn.Linear(c.n_embd, c.n_embd, bias=False)
        self.wk = nn.Linear(c.n_embd, c.n_embd, bias=False)
        self.wv = nn.Linear(c.n_embd, c.n_embd, bias=False)
        self.wo = nn.Linear(c.n_embd, c.n_embd, bias=False)

    def forward(self, x):
        B, T, C = x.shape

        def heads(t):
            # (B, T, C) -> (B, nh, T, hd)
            return t.view(B, T, self.nh, self.hd).transpose(1, 2)

        q, k, v = heads(self.wq(x)), heads(self.wk(x)), heads(self.wv(x))
        scores = (q @ k.transpose(-2, -1)) * self.sc
        # Mask out attention to future positions.
        causal = torch.full((T, T), float('-inf'), device=x.device).triu(1)
        weights = F.softmax(scores + causal, dim=-1)
        merged = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.wo(merged)
58
+
59
class MLP(nn.Module):
    """Feed-forward block with a squared-ReLU activation."""

    def __init__(self, c):
        super().__init__()
        self.fc1 = nn.Linear(c.n_embd, c.mlp_dim, bias=False)
        self.fc2 = nn.Linear(c.mlp_dim, c.n_embd, bias=False)

    def forward(self, x):
        hidden = F.relu(self.fc1(x)).pow(2)
        return self.fc2(hidden)
66
+
67
class Block(nn.Module):
    """Pre-norm residual block: attention sub-layer, then MLP sub-layer."""

    def __init__(self, c):
        super().__init__()
        self.attn = Attn(c)
        self.mlp = MLP(c)

    def forward(self, x):
        attended = x + self.attn(rmsnorm(x))
        return attended + self.mlp(rmsnorm(attended))
75
+
76
class RandyGPT(nn.Module):
    """Decoder-only transformer used by the API server."""

    def __init__(self, c):
        super().__init__()
        self.c = c
        self.wte = nn.Embedding(c.vocab_size, c.n_embd)   # token embeddings
        self.wpe = nn.Embedding(c.block_size, c.n_embd)   # learned positions
        self.layers = nn.ModuleList([Block(c) for _ in range(c.n_layer)])
        self.lm_head = nn.Linear(c.n_embd, c.vocab_size, bias=False)

    def forward(self, ids):
        """Return next-token logits of shape (B, T, vocab_size)."""
        B, T = ids.shape
        x = self.wte(ids) + self.wpe(torch.arange(T, device=ids.device).unsqueeze(0))
        for b in self.layers:
            x = b(x)
        return self.lm_head(x)

    def _sample_next(self, ids, temperature, top_p):
        """Nucleus-sample one next-token id from the running context.

        Shared by generate() and generate_stream(), which previously
        duplicated this logic line for line and could drift apart.
        """
        ctx = ids[:, -self.c.block_size:]
        # Floor the temperature to avoid division by zero.
        logits = self(ctx)[:, -1, :] / max(temperature, 1e-6)
        probs = F.softmax(logits, dim=-1)
        sp, si = torch.sort(probs, descending=True)
        cum = sp.cumsum(-1)
        # Drop tokens outside the top-p nucleus (the top token always stays).
        sp[cum - sp > top_p] = 0.0
        sp /= sp.sum()
        return si[0, torch.multinomial(sp[0], 1)]

    @torch.no_grad()
    def generate(self, ids, max_new_tokens=200, temperature=0.8, top_p=0.9):
        """Sample up to max_new_tokens ids; stops early at <|eos|> (id 1)."""
        self.eval()
        for _ in range(max_new_tokens):
            nxt = self._sample_next(ids, temperature, top_p)
            ids = torch.cat([ids, nxt.view(1, 1)], dim=1)
            if nxt.item() == 1:
                break
        return ids

    @torch.no_grad()
    def generate_stream(self, ids, max_new_tokens=200, temperature=0.8, top_p=0.9):
        """Yield (token_id, is_last) one sampled token at a time."""
        self.eval()
        for i in range(max_new_tokens):
            nxt = self._sample_next(ids, temperature, top_p)
            ids = torch.cat([ids, nxt.view(1, 1)], dim=1)
            token_id = nxt.item()
            is_last = (token_id == 1) or (i == max_new_tokens - 1)
            yield token_id, is_last
            if token_id == 1:
                break
128
+
129
+
130
+ # ── Tokenizer ─────────────────────────────────────────────────────────────────
131
+
132
class Tokenizer:
    """Greedy BPE tokenizer (twin of the one defined in app.py)."""

    def __init__(self, vocab, merges):
        self.vocab = vocab
        self.t2i = {s: i for i, s in enumerate(vocab)}
        self.bos = self.t2i.get("<|bos|>", 0)
        self.eos = self.t2i.get("<|eos|>", 1)
        # (left_id, right_id) -> merged_id; the first entry per pair wins.
        self.mmap = {}
        for l, r in merges:
            li = self.t2i.get(l)
            ri = self.t2i.get(r)
            mi = self.t2i.get(l + r)
            if li is not None and ri is not None and mi is not None:
                self.mmap.setdefault((li, ri), mi)

    @classmethod
    def from_json(cls, path):
        """Load vocab and merges from a tokenizer.json file."""
        d = json.loads(Path(path).read_text(encoding="utf-8"))
        return cls(d["vocab"], [tuple(m) for m in d["merges"]])

    def _chunk(self, text):
        # Seed with per-character ids; unknown characters are dropped.
        tokens = [self.t2i[c] for c in text if c in self.t2i]
        while len(tokens) > 1:
            # Lowest merged-id among adjacent pairs = earliest-learned merge.
            pair_ids = (self.mmap.get(p) for p in zip(tokens, tokens[1:]))
            best = min((m for m in pair_ids if m is not None), default=None)
            if best is None:
                break
            rebuilt, i = [], 0
            while i < len(tokens):
                if i + 1 < len(tokens) and self.mmap.get((tokens[i], tokens[i + 1])) == best:
                    rebuilt.append(best)
                    i += 2
                else:
                    rebuilt.append(tokens[i])
                    i += 1
            tokens = rebuilt
        return tokens

    def encode(self, text):
        """Encode text; newlines become their own token when in the vocab."""
        nl = self.t2i.get("\n")
        out = []
        lines = text.split("\n")
        last = len(lines) - 1
        for i, line in enumerate(lines):
            out.extend(self._chunk(line))
            if nl is not None and i < last:
                out.append(nl)
        return out

    def decode(self, ids):
        """Decode ids, skipping bos/eos and ids outside the vocab."""
        return "".join(self.vocab[i] for i in ids
                       if i not in (self.bos, self.eos) and 0 <= i < len(self.vocab))
182
+
183
+
184
# ── Load model at startup ──────────────────────────────────────────────────────

import os
import threading

# Model repo is overridable via the environment for testing/forks.
REPO = os.environ.get("MODEL_REPO", "MonumentalSystems/randygpt-s")
MODEL_ID = REPO.split("/")[-1]     # e.g. "randygpt-s"

_model_lock = threading.Lock()     # guards the swap of cfg/tok/model
_reload_lock = threading.Lock()    # only one reload at a time
_is_reloading = False              # debounce flag
195
+
196
def _get_remote_sha() -> str:
    """Return the LFS sha256 of model.safetensors on the Hub ("" if unknown)."""
    from huggingface_hub import get_paths_info
    infos = list(get_paths_info(REPO, ["model.safetensors"], repo_type="model"))
    if infos and infos[0].lfs:
        return infos[0].lfs.sha256
    return ""
201
+
202
def load_model(force_weights=False):
    """Download config/weights/tokenizer from the Hub and build the model.

    force_weights=True forces a re-download of model.safetensors only
    (used by /reload).  Returns (cfg, tok, model) with the model in eval mode.
    """
    print(f"Loading {REPO} …")
    paths = {
        name: hf_hub_download(
            repo_id=REPO,
            filename=name,
            force_download=(force_weights and name == "model.safetensors"),
        )
        for name in ("config.json", "model.safetensors", "tokenizer.json")
    }
    loaded_cfg = Cfg(**json.loads(Path(paths["config.json"]).read_text()))
    loaded_tok = Tokenizer.from_json(paths["tokenizer.json"])
    net = RandyGPT(loaded_cfg)
    net.load_state_dict(load_file(paths["model.safetensors"], device="cpu"))
    net.eval()
    print("Model ready.")
    return loaded_cfg, loaded_tok, net

cfg, tok, model = load_model()
_current_sha = _get_remote_sha()
217
+
218
+
219
# ── FastAPI app ────────────────────────────────────────────────────────────────

app = FastAPI(title="randyGPT", version="0.9.6")

# Public demo API: allow browser clients from any origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
229
+
230
def _openai_error(status: int, message: str, err_type: str = "invalid_request_error",
                  code: Optional[str] = None):
    """Build an OpenAI-style error response.

    The annotation for `code` is Optional[str] (it was previously
    `str = None`, which contradicts the default).  The envelope is
    {"error": {"message", "type"[, "code"]}}.
    """
    body = {"error": {"message": message, "type": err_type}}
    if code:
        body["error"]["code"] = code
    return JSONResponse(status_code=status, content=body)
235
+
236
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    # Re-shape FastAPI's default HTTP errors into the OpenAI error envelope.
    return _openai_error(exc.status_code, str(exc.detail))

@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
    # Summarise pydantic validation errors as "field: message; ...".
    details = [f"{e['loc'][-1]}: {e['msg']}" for e in exc.errors()]
    return _openai_error(422, "; ".join(details), code="invalid_request_error")
244
+
245
+
246
+ @app.get("/v1/models")
247
+ def list_models():
248
+ return {
249
+ "object": "list",
250
+ "data": [{
251
+ "id": MODEL_ID,
252
+ "object": "model",
253
+ "created": 1700000000,
254
+ "owned_by": "MonumentalSystems",
255
+ }]
256
+ }
257
+
258
+
259
class Message(BaseModel):
    """One chat message in the OpenAI format."""
    role: str
    content: str

class ChatRequest(BaseModel):
    """Body of POST /v1/chat/completions (OpenAI-compatible subset)."""
    model: Optional[str] = MODEL_ID
    messages: List[Message]
    max_tokens: Optional[int] = 200
    temperature: Optional[float] = 0.8
    top_p: Optional[float] = 0.9
    n: Optional[int] = 1
    stream: Optional[bool] = False
271
+
272
+
273
+ def _sse(data: dict) -> str:
274
+ return f"data: {json.dumps(data)}\n\n"
275
+
276
+
277
def _stream_completion(ids, max_tokens, temperature, top_p, completion_id, _model, _tok):
    """Generator yielding OpenAI chat.completion.chunk SSE frames.

    Model and tokenizer are passed in (snapshotted at request time) so a
    hot reload mid-stream cannot affect this request.
    """
    tensor = torch.tensor([ids], dtype=torch.long)
    emitted = 0

    for token_id, is_last in _model.generate_stream(
        tensor, max_new_tokens=max_tokens,
        temperature=temperature, top_p=top_p
    ):
        emitted += 1
        if is_last:
            finish = "length" if emitted >= max_tokens else "stop"
        else:
            finish = None
        yield _sse({
            "id": completion_id,
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": MODEL_ID,
            "choices": [{
                "index": 0,
                "delta": {"content": _tok.decode([token_id])},
                "finish_reason": finish,
            }],
        })

    yield "data: [DONE]\n\n"
306
+
307
+
308
+ @app.post("/v1/chat/completions")
309
+ def chat_completions(req: ChatRequest):
310
+ # Snapshot globals at request start β€” concurrent requests and reloads
311
+ # are both safe because each request holds its own references.
312
+ _m, _t, _c = model, tok, cfg
313
+
314
+ prompt = req.messages[-1].content.strip() if req.messages else ""
315
+ if not prompt:
316
+ raise HTTPException(status_code=400, detail="No content in messages")
317
+
318
+ ids = _t.encode(prompt)
319
+ if not ids:
320
+ raise HTTPException(status_code=400, detail="Prompt tokenized to empty sequence")
321
+
322
+ max_tokens = max(1, min(req.max_tokens or 200, _c.block_size))
323
+ temperature = max(0.01, min(req.temperature or 0.8, 2.0))
324
+ top_p = req.top_p or 0.9
325
+ n = max(1, min(req.n or 1, 4))
326
+ completion_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
327
+
328
+ # ── Streaming ─────────────────────────────────────────────────────────────
329
+ if req.stream:
330
+ return StreamingResponse(
331
+ _stream_completion(ids, max_tokens, temperature, top_p, completion_id, _m, _t),
332
+ media_type="text/event-stream",
333
+ headers={"X-Accel-Buffering": "no"},
334
+ )
335
+
336
+ # ── Non-streaming ─────────────────────────────────────────────────────────
337
+ choices = []
338
+ total_completion_tokens = 0
339
+
340
+ for i in range(n):
341
+ tensor = torch.tensor([ids], dtype=torch.long)
342
+ out = _m.generate(tensor, max_new_tokens=max_tokens,
343
+ temperature=temperature, top_p=top_p)
344
+ full = _t.decode(out[0].tolist())
345
+ completion = full[len(prompt):].lstrip() if full.startswith(prompt) else full
346
+ comp_tokens = len(_t.encode(completion))
347
+ total_completion_tokens += comp_tokens
348
+ choices.append({
349
+ "index": i,
350
+ "message": {"role": "assistant", "content": completion},
351
+ "finish_reason": "length" if comp_tokens >= max_tokens else "stop",
352
+ })
353
+
354
+ return {
355
+ "id": completion_id,
356
+ "object": "chat.completion",
357
+ "created": int(time.time()),
358
+ "model": MODEL_ID,
359
+ "system_fingerprint": f"{MODEL_ID}-v0.9.6",
360
+ "choices": choices,
361
+ "usage": {
362
+ "prompt_tokens": len(ids),
363
+ "completion_tokens": total_completion_tokens,
364
+ "total_tokens": len(ids) + total_completion_tokens,
365
+ },
366
+ }
367
+
368
+
369
+ @app.post("/reload")
370
+ def reload_weights():
371
+ """Hot-reload model weights from Hub. Debounced β€” returns 200 immediately if already reloading.
372
+ Only swaps weights if Hub has a newer version of model.safetensors."""
373
+ global cfg, tok, model, _current_sha, _is_reloading
374
+
375
+ # Debounce: if already reloading, return immediately
376
+ if _is_reloading:
377
+ return {"status": "ok", "model": MODEL_ID, "reloaded": False, "reason": "already reloading"}
378
+
379
+ with _reload_lock:
380
+ if _is_reloading:
381
+ return {"status": "ok", "model": MODEL_ID, "reloaded": False, "reason": "already reloading"}
382
+ _is_reloading = True
383
+
384
+ try:
385
+ new_sha = _get_remote_sha()
386
+ if new_sha == _current_sha:
387
+ return {"status": "ok", "model": MODEL_ID, "reloaded": False, "reason": "weights unchanged"}
388
+
389
+ print(f"New weights detected ({_current_sha[:8]} β†’ {new_sha[:8]}), reloading…")
390
+ new_cfg, new_tok, new_model = load_model(force_weights=True)
391
+
392
+ with _model_lock:
393
+ cfg, tok, model = new_cfg, new_tok, new_model
394
+ _current_sha = new_sha
395
+
396
+ return {"status": "ok", "model": MODEL_ID, "reloaded": True, "sha": new_sha[:16]}
397
+ finally:
398
+ _is_reloading = False
399
+
400
+
401
+ @app.get("/")
402
+ def root():
403
+ return {"model": MODEL_ID, "endpoints": ["/v1/models", "/v1/chat/completions", "/reload"]}