rsafier committed on
Commit
a1df84d
·
verified ·
1 Parent(s): b0ac406

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. Dockerfile +17 -0
  2. README.md +45 -4
  3. app.py +254 -0
  4. requirements.txt +6 -0
  5. server.py +403 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Minimal serving image for the randyGPT OpenAI-compatible API (server.py).
FROM python:3.11-slim

# HuggingFace Spaces requires user ID 1000
RUN useradd -m -u 1000 user

WORKDIR /home/user/app
# Install dependencies first so Docker layer caching survives code-only edits.
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Only server.py is deployed; app.py (the Gradio UI) is not part of this image.
COPY --chown=user server.py .

# Run as the unprivileged Spaces user.
USER user
ENV HOME=/home/user

# Spaces routes external traffic to port 7860.
EXPOSE 7860

CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,51 @@
1
  ---
2
- title: Randygpt S Space
3
- emoji: ⚑
4
  colorFrom: blue
5
- colorTo: yellow
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: randyGPT
3
+ emoji: 📖
4
  colorFrom: blue
5
+ colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
+ # randyGPT — OpenAI-compatible API
11
+
12
+ A GPT trained from scratch in Rust on 114 Project Gutenberg books.
13
+ Model weights load from [MonumentalSystems/randygpt-s](https://huggingface.co/MonumentalSystems/randygpt-s).
14
+
15
+ ## Endpoints
16
+
17
+ | Method | Path | Description |
18
+ |--------|------|-------------|
19
+ | GET | `/v1/models` | List available models |
20
+ | POST | `/v1/chat/completions` | Generate text (OpenAI-compatible) |
21
+
22
+ ## Usage
23
+
24
+ ```bash
25
+ curl https://monumentalsystems-randygpt-space.hf.space/v1/chat/completions \
26
+ -H 'Content-Type: application/json' \
27
+ -d '{
28
+ "model": "randygpt-s",
29
+ "messages": [{"role": "user", "content": "Once upon a time"}],
30
+ "max_tokens": 200,
31
+ "temperature": 0.8
32
+ }'
33
+ ```
34
+
35
+ ### OpenAI SDK
36
+
37
+ ```python
38
+ from openai import OpenAI
39
+
40
+ client = OpenAI(
41
+ base_url="https://monumentalsystems-randygpt-space.hf.space/v1",
42
+ api_key="none",
43
+ )
44
+
45
+ response = client.chat.completions.create(
46
+ model="randygpt-s",
47
+ messages=[{"role": "user", "content": "Once upon a time"}],
48
+ max_tokens=200,
49
+ )
50
+ print(response.choices[0].message.content)
51
+ ```
app.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py β€” randyGPT HuggingFace Space
3
+ Loads model weights from the Hub; HF hosts the compute.
4
+
5
+ Repo: MonumentalSystems/randygpt-s
6
+ """
7
+
8
+ import json
9
+ import math
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ import gradio as gr
14
+ from pathlib import Path
15
+ from huggingface_hub import hf_hub_download
16
+ from safetensors.torch import load_file
17
+
18
+ # ── Inline model definition (no external import needed in the Space) ──────────
19
+
20
class RandyGPTConfig:
    """Hyper-parameters for the RandyGPT model, with training-time defaults."""

    def __init__(self, **kw):
        # Fall back to the training defaults for any missing key.
        defaults = {
            "vocab_size": 1500,
            "n_embd": 128,
            "n_head": 4,
            "n_layer": 8,
            "block_size": 256,
        }
        for name, value in defaults.items():
            setattr(self, name, kw.get(name, value))
        # Derived sizes.
        self.head_dim = self.n_embd // self.n_head
        self.mlp_dim = 4 * self.n_embd

    @classmethod
    def from_json(cls, path):
        """Build a config from a JSON file of keyword arguments."""
        return cls(**json.loads(Path(path).read_text()))
34
+
35
+
36
def rmsnorm(x, eps=1e-5):
    """RMS-normalise the last dimension (no learned gain)."""
    inv_rms = (x.pow(2).mean(-1, keepdim=True) + eps).rsqrt()
    return x * inv_rms
38
+
39
+
40
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a causal (lower-triangular) mask."""

    def __init__(self, cfg):
        super().__init__()
        self.n_head = cfg.n_head
        self.head_dim = cfg.head_dim
        self.scale = 1.0 / math.sqrt(cfg.head_dim)
        self.wq = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.wk = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.wv = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.wo = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)

    def _split_heads(self, t, B, T):
        # (B, T, C) -> (B, n_head, T, head_dim)
        return t.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

    def forward(self, x):
        B, T, C = x.shape
        q = self._split_heads(self.wq(x), B, T)
        k = self._split_heads(self.wk(x), B, T)
        v = self._split_heads(self.wv(x), B, T)
        scores = q @ k.transpose(-2, -1) * self.scale
        # Disallow attending to future positions.
        mask = torch.full((T, T), float('-inf'), device=x.device).triu(1)
        weights = F.softmax(scores + mask, dim=-1)
        merged = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.wo(merged)
61
+
62
+
63
class MLP(nn.Module):
    """Feed-forward block: squared-ReLU activation between two projections."""

    def __init__(self, cfg):
        super().__init__()
        self.fc1 = nn.Linear(cfg.n_embd, cfg.mlp_dim, bias=False)
        self.fc2 = nn.Linear(cfg.mlp_dim, cfg.n_embd, bias=False)

    def forward(self, x):
        activated = F.relu(self.fc1(x)).pow(2)
        return self.fc2(activated)
71
+
72
+
73
class TransformerBlock(nn.Module):
    """Pre-norm residual block: attention sub-layer, then MLP sub-layer."""

    def __init__(self, cfg):
        super().__init__()
        self.attn = CausalSelfAttention(cfg)
        self.mlp = MLP(cfg)

    def forward(self, x):
        attended = x + self.attn(rmsnorm(x))
        return attended + self.mlp(rmsnorm(attended))
83
+
84
+
85
class RandyGPT(nn.Module):
    """Minimal decoder-only transformer with learned positional embeddings."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.wte = nn.Embedding(cfg.vocab_size, cfg.n_embd)   # token embeddings
        self.wpe = nn.Embedding(cfg.block_size, cfg.n_embd)   # position embeddings
        self.layers = nn.ModuleList([TransformerBlock(cfg) for _ in range(cfg.n_layer)])
        self.lm_head = nn.Linear(cfg.n_embd, cfg.vocab_size, bias=False)

    def forward(self, ids):
        """Return next-token logits of shape (B, T, vocab_size)."""
        B, T = ids.shape
        pos = torch.arange(T, device=ids.device).unsqueeze(0)
        x = self.wte(ids) + self.wpe(pos)
        for block in self.layers:
            x = block(x)
        return self.lm_head(x)

    @torch.no_grad()
    def generate(self, ids, max_new_tokens=200, temperature=0.8, top_p=0.9):
        """Autoregressively sample with nucleus (top-p) sampling.

        Stops early when the <|eos|> token (id 1) is produced.  The
        temperature is floored at 1e-6 to avoid division by zero for
        temperature == 0 (matches the guard already used in server.py).
        """
        self.eval()
        for _ in range(max_new_tokens):
            ctx = ids[:, -self.cfg.block_size:]
            # Guard against temperature == 0 (previously divided unchecked).
            logits = self(ctx)[:, -1, :] / max(temperature, 1e-6)
            probs = F.softmax(logits, dim=-1)
            sp, si = torch.sort(probs, descending=True)
            cum = sp.cumsum(-1)
            # Zero out tokens outside the nucleus; the top token always stays.
            sp[cum - sp > top_p] = 0.0
            sp /= sp.sum()
            nxt = si[0, torch.multinomial(sp[0], 1)]
            ids = torch.cat([ids, nxt.view(1, 1)], dim=1)
            if nxt.item() == 1:  # <|eos|>
                break
        return ids
118
+
119
+
120
+ # ── Tokenizer ─────────────────────────────────────────────────────────────────
121
+
122
class Tokenizer:
    """Greedy BPE tokenizer built from an explicit vocab + merge list."""

    def __init__(self, vocab, merges):
        self.vocab = vocab
        self.t2i = {tok: idx for idx, tok in enumerate(vocab)}
        self.bos = self.t2i.get("<|bos|>", 0)
        self.eos = self.t2i.get("<|eos|>", 1)
        # Map (left_id, right_id) -> merged_id; the first entry per pair wins.
        self.merge_map = {}
        for left, right in merges:
            ids = (self.t2i.get(left), self.t2i.get(right), self.t2i.get(left + right))
            if None not in ids:
                self.merge_map.setdefault((ids[0], ids[1]), ids[2])

    @classmethod
    def from_json(cls, path):
        """Load vocab and merges from a tokenizer.json file."""
        data = json.loads(Path(path).read_text(encoding="utf-8"))
        return cls(data["vocab"], [tuple(m) for m in data["merges"]])

    def _encode_chunk(self, text):
        # Start from single-character tokens, dropping unknown characters.
        tokens = [self.t2i[ch] for ch in text if ch in self.t2i]
        while len(tokens) >= 2:
            # Find the lowest-id (earliest-learned) merge among adjacent pairs.
            candidates = [self.merge_map.get(pair) for pair in zip(tokens, tokens[1:])]
            applicable = [m for m in candidates if m is not None]
            if not applicable:
                break
            best = min(applicable)
            merged, i = [], 0
            while i < len(tokens):
                if i + 1 < len(tokens) and self.merge_map.get((tokens[i], tokens[i + 1])) == best:
                    merged.append(best)
                    i += 2
                else:
                    merged.append(tokens[i])
                    i += 1
            tokens = merged
        return tokens

    def encode(self, text):
        """Encode text; newlines become their own token when in the vocab."""
        nl = self.t2i.get("\n")
        result = []
        lines = text.split("\n")
        for i, line in enumerate(lines):
            result.extend(self._encode_chunk(line))
            if nl is not None and i < len(lines) - 1:
                result.append(nl)
        return result

    def decode(self, ids):
        """Decode ids to text, skipping bos/eos and out-of-range ids."""
        pieces = [self.vocab[i] for i in ids
                  if i not in (self.bos, self.eos) and 0 <= i < len(self.vocab)]
        return "".join(pieces)
172
+
173
+
174
# ── Load model once at startup ────────────────────────────────────────────────

REPO = "MonumentalSystems/randygpt-s"
DEVICE = "cpu"  # HF free-tier Spaces use CPU

print(f"Loading model from {REPO} …")
_cfg_file = hf_hub_download(repo_id=REPO, filename="config.json")
_weights_file = hf_hub_download(repo_id=REPO, filename="model.safetensors")
_tok_file = hf_hub_download(repo_id=REPO, filename="tokenizer.json")

cfg = RandyGPTConfig.from_json(_cfg_file)
tok = Tokenizer.from_json(_tok_file)
model = RandyGPT(cfg)
model.load_state_dict(load_file(_weights_file, device=DEVICE))
model.eval()
print(f"Model ready β€” vocab {cfg.vocab_size}, {cfg.n_layer}LΓ—{cfg.n_embd}D")
190
+
191
+
192
+ # ── Inference ─────────────────────────────────────────────────────────────────
193
+
194
def generate(prompt: str, max_tokens: int, temperature: float, top_p: float) -> str:
    """Run the model on `prompt` and return prompt + continuation as text."""
    cleaned = prompt.strip()
    if not cleaned:
        return "(enter a prompt)"
    token_ids = tok.encode(cleaned)
    if not token_ids:
        return "(could not tokenize prompt)"
    batch = torch.tensor([token_ids], dtype=torch.long)
    generated = model.generate(
        batch,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    return tok.decode(generated[0].tolist())
206
+
207
+
208
# ── Gradio UI ─────────────────────────────────────────────────────────────────

with gr.Blocks(title="randyGPT") as demo:
    gr.Markdown(
        "# randyGPT\n"
        "A GPT-style language model trained from scratch in Rust on 114 Project Gutenberg books.\n\n"
        f"**Model:** `{REPO}` Β· {cfg.n_layer} layers Β· {cfg.n_embd}-dim Β· {cfg.vocab_size}-token BPE vocab"
    )

    with gr.Row():
        with gr.Column(scale=3):
            prompt_box = gr.Textbox(
                label="Prompt",
                placeholder="Once upon a time",
                lines=3,
            )
            output_box = gr.Textbox(label="Generated text", lines=10, interactive=False)
            run_btn = gr.Button("Generate", variant="primary")

        with gr.Column(scale=1):
            max_tok = gr.Slider(20, 200, value=150, step=10, label="Max new tokens")
            temp = gr.Slider(0.1, 2.0, value=0.8, step=0.05, label="Temperature")
            topp = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="Top-p")

    # Button click and textbox submit share identical wiring.
    for trigger in (run_btn.click, prompt_box.submit):
        trigger(fn=generate, inputs=[prompt_box, max_tok, temp, topp], outputs=output_box)

    gr.Examples(
        examples=[
            ["Once upon a time in a land far away"],
            ["It was the best of times, it was the worst of times"],
            ["The old man sat by the fire and"],
            ["She looked out across the sea and wondered"],
        ],
        inputs=prompt_box,
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Runtime dependencies for server.py (the deployed FastAPI service).
fastapi>=0.110.0
uvicorn>=0.29.0
torch>=2.0.0
safetensors>=0.4.0
huggingface_hub>=0.20.0
pydantic>=2.0.0
# NOTE(review): app.py also imports gradio, which is not listed here.  That is
# fine for the Docker image (it only runs server.py), but gradio must be added
# if app.py is ever deployed - TODO confirm intent.
server.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ server.py β€” randyGPT OpenAI-compatible inference server
3
+ Serves POST /v1/chat/completions and GET /v1/models on port 7860.
4
+
5
+ Loads model weights from HuggingFace Hub at startup.
6
+ Compatible with OpenAI SDK, OpenRouter, LangChain, etc.
7
+ """
8
+
9
+ import json
10
+ import math
11
+ import time
12
+ import uuid
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+ from pathlib import Path
17
+ from fastapi import FastAPI, HTTPException, Request
18
+ from fastapi.responses import JSONResponse, StreamingResponse
19
+ from fastapi.middleware.cors import CORSMiddleware
20
+ from fastapi.exceptions import RequestValidationError
21
+ from pydantic import BaseModel
22
+ from typing import List, Optional
23
+ from huggingface_hub import hf_hub_download
24
+ from safetensors.torch import load_file
25
+
26
+ # ── Inline model ──────────────────────────────────────────────────────────────
27
+
28
class Cfg:
    """Model hyper-parameters with training-time defaults."""

    _DEFAULTS = {
        "vocab_size": 1500,
        "n_embd": 128,
        "n_head": 4,
        "n_layer": 8,
        "block_size": 256,
    }

    def __init__(self, **kw):
        for key, default in self._DEFAULTS.items():
            setattr(self, key, kw.get(key, default))
        # Derived sizes.
        self.head_dim = self.n_embd // self.n_head
        self.mlp_dim = 4 * self.n_embd
37
+
38
def rmsnorm(x, eps=1e-5):
    """RMS-normalise the last dimension (no learned gain)."""
    mean_sq = x.pow(2).mean(-1, keepdim=True)
    return x * (mean_sq + eps).rsqrt()
40
+
41
class Attn(nn.Module):
    """Causal multi-head self-attention."""

    def __init__(self, c):
        super().__init__()
        self.nh, self.hd = c.n_head, c.head_dim
        self.sc = 1.0 / math.sqrt(c.head_dim)
        # Parameter names (wq/wk/wv/wo) must stay fixed for load_state_dict.
        self.wq = nn.Linear(c.n_embd, c.n_embd, bias=False)
        self.wk = nn.Linear(c.n_embd, c.n_embd, bias=False)
        self.wv = nn.Linear(c.n_embd, c.n_embd, bias=False)
        self.wo = nn.Linear(c.n_embd, c.n_embd, bias=False)

    def forward(self, x):
        B, T, C = x.shape

        def heads(t):
            # (B, T, C) -> (B, nh, T, hd)
            return t.view(B, T, self.nh, self.hd).transpose(1, 2)

        q, k, v = heads(self.wq(x)), heads(self.wk(x)), heads(self.wv(x))
        scores = (q @ k.transpose(-2, -1)) * self.sc
        # Mask out attention to future positions.
        causal = torch.full((T, T), float('-inf'), device=x.device).triu(1)
        weights = F.softmax(scores + causal, dim=-1)
        merged = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.wo(merged)
58
+
59
class MLP(nn.Module):
    """Feed-forward block with a squared-ReLU activation."""

    def __init__(self, c):
        super().__init__()
        self.fc1 = nn.Linear(c.n_embd, c.mlp_dim, bias=False)
        self.fc2 = nn.Linear(c.mlp_dim, c.n_embd, bias=False)

    def forward(self, x):
        hidden = F.relu(self.fc1(x)).pow(2)
        return self.fc2(hidden)
66
+
67
class Block(nn.Module):
    """Pre-norm residual block: attention sub-layer, then MLP sub-layer."""

    def __init__(self, c):
        super().__init__()
        self.attn = Attn(c)
        self.mlp = MLP(c)

    def forward(self, x):
        attended = x + self.attn(rmsnorm(x))
        return attended + self.mlp(rmsnorm(attended))
75
+
76
class RandyGPT(nn.Module):
    """Decoder-only transformer used by the API server."""

    def __init__(self, c):
        super().__init__()
        self.c = c
        self.wte = nn.Embedding(c.vocab_size, c.n_embd)   # token embeddings
        self.wpe = nn.Embedding(c.block_size, c.n_embd)   # learned positions
        self.layers = nn.ModuleList([Block(c) for _ in range(c.n_layer)])
        self.lm_head = nn.Linear(c.n_embd, c.vocab_size, bias=False)

    def forward(self, ids):
        """Return next-token logits of shape (B, T, vocab_size)."""
        B, T = ids.shape
        x = self.wte(ids) + self.wpe(torch.arange(T, device=ids.device).unsqueeze(0))
        for b in self.layers:
            x = b(x)
        return self.lm_head(x)

    def _sample_next(self, ids, temperature, top_p):
        """Nucleus-sample one next-token id from the running context.

        Shared by generate() and generate_stream(), which previously
        duplicated this logic line for line and could drift apart.
        """
        ctx = ids[:, -self.c.block_size:]
        # Floor the temperature to avoid division by zero.
        logits = self(ctx)[:, -1, :] / max(temperature, 1e-6)
        probs = F.softmax(logits, dim=-1)
        sp, si = torch.sort(probs, descending=True)
        cum = sp.cumsum(-1)
        # Drop tokens outside the top-p nucleus (the top token always stays).
        sp[cum - sp > top_p] = 0.0
        sp /= sp.sum()
        return si[0, torch.multinomial(sp[0], 1)]

    @torch.no_grad()
    def generate(self, ids, max_new_tokens=200, temperature=0.8, top_p=0.9):
        """Sample up to max_new_tokens ids; stops early at <|eos|> (id 1)."""
        self.eval()
        for _ in range(max_new_tokens):
            nxt = self._sample_next(ids, temperature, top_p)
            ids = torch.cat([ids, nxt.view(1, 1)], dim=1)
            if nxt.item() == 1:
                break
        return ids

    @torch.no_grad()
    def generate_stream(self, ids, max_new_tokens=200, temperature=0.8, top_p=0.9):
        """Yield (token_id, is_last) one sampled token at a time."""
        self.eval()
        for i in range(max_new_tokens):
            nxt = self._sample_next(ids, temperature, top_p)
            ids = torch.cat([ids, nxt.view(1, 1)], dim=1)
            token_id = nxt.item()
            is_last = (token_id == 1) or (i == max_new_tokens - 1)
            yield token_id, is_last
            if token_id == 1:
                break
128
+
129
+
130
+ # ── Tokenizer ─────────────────────────────────────────────────────────────────
131
+
132
class Tokenizer:
    """Greedy BPE tokenizer (twin of the one defined in app.py)."""

    def __init__(self, vocab, merges):
        self.vocab = vocab
        self.t2i = {s: i for i, s in enumerate(vocab)}
        self.bos = self.t2i.get("<|bos|>", 0)
        self.eos = self.t2i.get("<|eos|>", 1)
        # (left_id, right_id) -> merged_id; the first entry per pair wins.
        self.mmap = {}
        for l, r in merges:
            li = self.t2i.get(l)
            ri = self.t2i.get(r)
            mi = self.t2i.get(l + r)
            if li is not None and ri is not None and mi is not None:
                self.mmap.setdefault((li, ri), mi)

    @classmethod
    def from_json(cls, path):
        """Load vocab and merges from a tokenizer.json file."""
        d = json.loads(Path(path).read_text(encoding="utf-8"))
        return cls(d["vocab"], [tuple(m) for m in d["merges"]])

    def _chunk(self, text):
        # Seed with per-character ids; unknown characters are dropped.
        tokens = [self.t2i[c] for c in text if c in self.t2i]
        while len(tokens) > 1:
            # Lowest merged-id among adjacent pairs = earliest-learned merge.
            pair_ids = (self.mmap.get(p) for p in zip(tokens, tokens[1:]))
            best = min((m for m in pair_ids if m is not None), default=None)
            if best is None:
                break
            rebuilt, i = [], 0
            while i < len(tokens):
                if i + 1 < len(tokens) and self.mmap.get((tokens[i], tokens[i + 1])) == best:
                    rebuilt.append(best)
                    i += 2
                else:
                    rebuilt.append(tokens[i])
                    i += 1
            tokens = rebuilt
        return tokens

    def encode(self, text):
        """Encode text; newlines become their own token when in the vocab."""
        nl = self.t2i.get("\n")
        out = []
        lines = text.split("\n")
        last = len(lines) - 1
        for i, line in enumerate(lines):
            out.extend(self._chunk(line))
            if nl is not None and i < last:
                out.append(nl)
        return out

    def decode(self, ids):
        """Decode ids, skipping bos/eos and ids outside the vocab."""
        return "".join(self.vocab[i] for i in ids
                       if i not in (self.bos, self.eos) and 0 <= i < len(self.vocab))
182
+
183
+
184
# ── Load model at startup ──────────────────────────────────────────────────────

import os
import threading

# Model repo is overridable via the environment for testing/forks.
REPO = os.environ.get("MODEL_REPO", "MonumentalSystems/randygpt-s")
MODEL_ID = REPO.split("/")[-1]     # e.g. "randygpt-s"

_model_lock = threading.Lock()     # guards the swap of cfg/tok/model
_reload_lock = threading.Lock()    # only one reload at a time
_is_reloading = False              # debounce flag
195
+
196
def _get_remote_sha() -> str:
    """Return the LFS sha256 of model.safetensors on the Hub ("" if unknown)."""
    from huggingface_hub import get_paths_info
    infos = list(get_paths_info(REPO, ["model.safetensors"], repo_type="model"))
    if infos and infos[0].lfs:
        return infos[0].lfs.sha256
    return ""
201
+
202
def load_model(force_weights=False):
    """Download config/weights/tokenizer from the Hub and build the model.

    force_weights=True forces a re-download of model.safetensors only
    (used by /reload).  Returns (cfg, tok, model) with the model in eval mode.
    """
    print(f"Loading {REPO} …")
    paths = {
        name: hf_hub_download(
            repo_id=REPO,
            filename=name,
            force_download=(force_weights and name == "model.safetensors"),
        )
        for name in ("config.json", "model.safetensors", "tokenizer.json")
    }
    loaded_cfg = Cfg(**json.loads(Path(paths["config.json"]).read_text()))
    loaded_tok = Tokenizer.from_json(paths["tokenizer.json"])
    net = RandyGPT(loaded_cfg)
    net.load_state_dict(load_file(paths["model.safetensors"], device="cpu"))
    net.eval()
    print("Model ready.")
    return loaded_cfg, loaded_tok, net

cfg, tok, model = load_model()
_current_sha = _get_remote_sha()
217
+
218
+
219
# ── FastAPI app ────────────────────────────────────────────────────────────────

app = FastAPI(title="randyGPT", version="0.9.6")

# Public demo API: allow browser clients from any origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
229
+
230
def _openai_error(status: int, message: str, err_type: str = "invalid_request_error",
                  code: Optional[str] = None):
    """Build an OpenAI-style error response.

    The annotation for `code` is Optional[str] (it was previously
    `str = None`, which contradicts the default).  The envelope is
    {"error": {"message", "type"[, "code"]}}.
    """
    body = {"error": {"message": message, "type": err_type}}
    if code:
        body["error"]["code"] = code
    return JSONResponse(status_code=status, content=body)
235
+
236
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException):
    # Re-shape FastAPI's default HTTP errors into the OpenAI error envelope.
    return _openai_error(exc.status_code, str(exc.detail))

@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
    # Summarise pydantic validation errors as "field: message; ...".
    details = [f"{e['loc'][-1]}: {e['msg']}" for e in exc.errors()]
    return _openai_error(422, "; ".join(details), code="invalid_request_error")
244
+
245
+
246
+ @app.get("/v1/models")
247
+ def list_models():
248
+ return {
249
+ "object": "list",
250
+ "data": [{
251
+ "id": MODEL_ID,
252
+ "object": "model",
253
+ "created": 1700000000,
254
+ "owned_by": "MonumentalSystems",
255
+ }]
256
+ }
257
+
258
+
259
class Message(BaseModel):
    """One chat message in the OpenAI format."""
    role: str
    content: str

class ChatRequest(BaseModel):
    """Body of POST /v1/chat/completions (OpenAI-compatible subset)."""
    model: Optional[str] = MODEL_ID
    messages: List[Message]
    max_tokens: Optional[int] = 200
    temperature: Optional[float] = 0.8
    top_p: Optional[float] = 0.9
    n: Optional[int] = 1
    stream: Optional[bool] = False
271
+
272
+
273
+ def _sse(data: dict) -> str:
274
+ return f"data: {json.dumps(data)}\n\n"
275
+
276
+
277
def _stream_completion(ids, max_tokens, temperature, top_p, completion_id, _model, _tok):
    """Generator yielding OpenAI chat.completion.chunk SSE frames.

    Model and tokenizer are passed in (snapshotted at request time) so a
    hot reload mid-stream cannot affect this request.
    """
    tensor = torch.tensor([ids], dtype=torch.long)
    emitted = 0

    for token_id, is_last in _model.generate_stream(
        tensor, max_new_tokens=max_tokens,
        temperature=temperature, top_p=top_p
    ):
        emitted += 1
        if is_last:
            finish = "length" if emitted >= max_tokens else "stop"
        else:
            finish = None
        yield _sse({
            "id": completion_id,
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": MODEL_ID,
            "choices": [{
                "index": 0,
                "delta": {"content": _tok.decode([token_id])},
                "finish_reason": finish,
            }],
        })

    yield "data: [DONE]\n\n"
306
+
307
+
308
+ @app.post("/v1/chat/completions")
309
+ def chat_completions(req: ChatRequest):
310
+ # Snapshot globals at request start β€” concurrent requests and reloads
311
+ # are both safe because each request holds its own references.
312
+ _m, _t, _c = model, tok, cfg
313
+
314
+ prompt = req.messages[-1].content.strip() if req.messages else ""
315
+ if not prompt:
316
+ raise HTTPException(status_code=400, detail="No content in messages")
317
+
318
+ ids = _t.encode(prompt)
319
+ if not ids:
320
+ raise HTTPException(status_code=400, detail="Prompt tokenized to empty sequence")
321
+
322
+ max_tokens = max(1, min(req.max_tokens or 200, _c.block_size))
323
+ temperature = max(0.01, min(req.temperature or 0.8, 2.0))
324
+ top_p = req.top_p or 0.9
325
+ n = max(1, min(req.n or 1, 4))
326
+ completion_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
327
+
328
+ # ── Streaming ─────────────────────────────────────────────────────────────
329
+ if req.stream:
330
+ return StreamingResponse(
331
+ _stream_completion(ids, max_tokens, temperature, top_p, completion_id, _m, _t),
332
+ media_type="text/event-stream",
333
+ headers={"X-Accel-Buffering": "no"},
334
+ )
335
+
336
+ # ── Non-streaming ─────────────────────────────────────────────────────────
337
+ choices = []
338
+ total_completion_tokens = 0
339
+
340
+ for i in range(n):
341
+ tensor = torch.tensor([ids], dtype=torch.long)
342
+ out = _m.generate(tensor, max_new_tokens=max_tokens,
343
+ temperature=temperature, top_p=top_p)
344
+ full = _t.decode(out[0].tolist())
345
+ completion = full[len(prompt):].lstrip() if full.startswith(prompt) else full
346
+ comp_tokens = len(_t.encode(completion))
347
+ total_completion_tokens += comp_tokens
348
+ choices.append({
349
+ "index": i,
350
+ "message": {"role": "assistant", "content": completion},
351
+ "finish_reason": "length" if comp_tokens >= max_tokens else "stop",
352
+ })
353
+
354
+ return {
355
+ "id": completion_id,
356
+ "object": "chat.completion",
357
+ "created": int(time.time()),
358
+ "model": MODEL_ID,
359
+ "system_fingerprint": f"{MODEL_ID}-v0.9.6",
360
+ "choices": choices,
361
+ "usage": {
362
+ "prompt_tokens": len(ids),
363
+ "completion_tokens": total_completion_tokens,
364
+ "total_tokens": len(ids) + total_completion_tokens,
365
+ },
366
+ }
367
+
368
+
369
+ @app.post("/reload")
370
+ def reload_weights():
371
+ """Hot-reload model weights from Hub. Debounced β€” returns 200 immediately if already reloading.
372
+ Only swaps weights if Hub has a newer version of model.safetensors."""
373
+ global cfg, tok, model, _current_sha, _is_reloading
374
+
375
+ # Debounce: if already reloading, return immediately
376
+ if _is_reloading:
377
+ return {"status": "ok", "model": MODEL_ID, "reloaded": False, "reason": "already reloading"}
378
+
379
+ with _reload_lock:
380
+ if _is_reloading:
381
+ return {"status": "ok", "model": MODEL_ID, "reloaded": False, "reason": "already reloading"}
382
+ _is_reloading = True
383
+
384
+ try:
385
+ new_sha = _get_remote_sha()
386
+ if new_sha == _current_sha:
387
+ return {"status": "ok", "model": MODEL_ID, "reloaded": False, "reason": "weights unchanged"}
388
+
389
+ print(f"New weights detected ({_current_sha[:8]} β†’ {new_sha[:8]}), reloading…")
390
+ new_cfg, new_tok, new_model = load_model(force_weights=True)
391
+
392
+ with _model_lock:
393
+ cfg, tok, model = new_cfg, new_tok, new_model
394
+ _current_sha = new_sha
395
+
396
+ return {"status": "ok", "model": MODEL_ID, "reloaded": True, "sha": new_sha[:16]}
397
+ finally:
398
+ _is_reloading = False
399
+
400
+
401
+ @app.get("/")
402
+ def root():
403
+ return {"model": MODEL_ID, "endpoints": ["/v1/models", "/v1/chat/completions", "/reload"]}