lordx64 committed on
Commit 1b4d02f · verified · 1 Parent(s): 638f112

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +183 -0
app.py ADDED
@@ -0,0 +1,183 @@
"""Gradio chat demo for Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled.

Runs on HF Spaces ZeroGPU. The model is loaded in 4-bit (bitsandbytes NF4) to
fit in the ZeroGPU tier's memory budget; the base Qwen3.6-35B-A3B activates
only ~3B parameters per forward pass, so the quality cost of quantization is
small.

The distilled model produces <think>...</think> chain-of-thought before the
final answer. We surface that transparently: the thinking is shown in a
collapsed <details> block, and the final answer becomes the body of the chat
message. This matches how Claude (the teacher) presents its reasoning.
"""
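
# Raw model output therefore looks roughly like this (illustrative, not a
# verbatim transcript):
#   <think> step-by-step reasoning ... </think>
#   Final answer text.
# render_response() below splits those two parts for display.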

from __future__ import annotations

import os
import re
import threading

import gradio as gr
import spaces  # HF ZeroGPU shim — safe to import on CPU
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)
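
# Assumed Space dependencies (not pinned in this file): gradio, spaces, torch,
# transformers, accelerate, and bitsandbytes.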

MODEL_ID = "lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled"
MAX_NEW_TOKENS = 8192  # room for thinking + answer; very long problems may still truncate
GEN_DURATION_SECONDS = 180  # ZeroGPU attach budget per call

DESCRIPTION = """\
# Qwen3.6-35B-A3B · Claude-4.7-Opus Reasoning Distilled

**A 35B-parameter MoE (with only ~3B active per token) fine-tuned to imitate the chain-of-thought style of Claude Opus 4.7.** The model thinks in explicit `<think>…</think>` blocks before producing the final answer, in the same way frontier reasoning systems do.

> Running in 4-bit (NF4) on ZeroGPU. The first message per session may pause a few seconds while the GPU attaches. Long reasoning can take 30–60 s; the model genuinely spends thousands of tokens of thinking on hard problems.

Model: [lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled](https://huggingface.co/lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled)
"""

EXAMPLES = [
    ["How many positive integers less than 1000 have digits that sum to 20?"],
    ["Prove that for any positive integer n, the sum 1 + 2 + ... + n equals n(n+1)/2."],
    ["A snail climbs a 10m wall, going up 3m during the day and slipping 2m at night. How many days to reach the top?"],
    ["Explain, at a graduate level, why photosynthesis requires light of specific wavelengths."],
    ["Write a Python function that efficiently finds the k-th smallest element in a sorted matrix."],
]

# ---------------------------------------------------------------------------
# Model load (happens on Space startup, once per replica). BnB 4-bit keeps
# the weight footprint around ~18 GB, comfortable within ZeroGPU memory.
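# Rough arithmetic behind that figure: 35B params at ~0.5 bytes/param (4-bit)
# is roughly 17.5 GB of weights, plus quantization constants and the KV cache.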
# ---------------------------------------------------------------------------

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model.eval()


# ---------------------------------------------------------------------------
# Rendering helper: separate <think>…</think> from the final answer so the
# user sees the answer prominently and can expand the thinking to inspect it.
# ---------------------------------------------------------------------------

_THINK_RE = re.compile(r"<think\b[^>]*>(.*?)</think>\s*", flags=re.DOTALL | re.IGNORECASE)


def render_response(text: str) -> str:
    """Convert raw model output into Markdown with a collapsible thinking
    section. Handles streaming partials gracefully (an unclosed <think> becomes
    a "Thinking…" spinner until the closing tag arrives)."""
    thinks = _THINK_RE.findall(text)
    answer = _THINK_RE.sub("", text).strip()

    blocks: list[str] = []

    # Show any completed thinking blocks, collapsed.
    for i, t in enumerate(thinks, start=1):
        label = "Reasoning" if len(thinks) == 1 else f"Reasoning (step {i})"
        blocks.append(
            f"<details><summary>💭 {label}</summary>\n\n{t.strip()}\n\n</details>"
        )

    # Still-streaming thinking (opened but not yet closed) — show as spinner.
    open_idx = text.rfind("<think")
    close_idx = text.rfind("</think>")
    if open_idx > close_idx:
        unclosed = text[open_idx:]
        # strip opening tag for display
        unclosed = re.sub(r"^<think\b[^>]*>", "", unclosed, flags=re.IGNORECASE)
        blocks.append(
            "<details open><summary>💭 Thinking…</summary>\n\n"
            f"{unclosed.strip()}\n\n</details>"
        )

    if answer:
        blocks.append(answer)

    return "\n\n".join(blocks) or text
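
# Illustrative example of the transformation (hypothetical input, not from a
# real model run):
#   render_response("<think>2+2=4</think>The answer is 4.")
#   -> "<details><summary>💭 Reasoning</summary>\n\n2+2=4\n\n</details>\n\nThe answer is 4."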


# ---------------------------------------------------------------------------
# Generation — wrapped in @spaces.GPU so ZeroGPU attaches a GPU for the call.
# ---------------------------------------------------------------------------

@spaces.GPU(duration=GEN_DURATION_SECONDS)
def chat(message: str, history: list[dict]):
    # `history` is already in OpenAI-style {"role","content"} form because
    # we set `type="messages"` on gr.ChatInterface below.
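    # e.g. history == [{"role": "user", "content": "Hi"},
    #                  {"role": "assistant", "content": "Hello! How can I help?"}]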
    messages: list[dict] = []
    for turn in history:
        if turn.get("role") in ("user", "assistant") and turn.get("content"):
            messages.append({"role": turn["role"], "content": turn["content"]})
    messages.append({"role": "user", "content": message})

    prompt_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    gen_kwargs = dict(
        input_ids=prompt_ids,
        streamer=streamer,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.0,
    )
    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield render_response(partial)

    thread.join(timeout=1.0)
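    # The worker thread normally finishes by the time the streamer is
    # exhausted; the short timeout just keeps this generator from blocking
    # indefinitely if generate() has not quite returned yet.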


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------

with gr.Blocks(fill_height=True, title="Qwen3.6 · Claude-4.7-Opus Distilled") as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        chat,
        type="messages",
        examples=EXAMPLES,
        cache_examples=False,
        fill_height=True,
        chatbot=gr.Chatbot(
            type="messages",
            render_markdown=True,
            sanitize_html=False,  # we emit <details>; trusted because we control the content
            height=600,
        ),
    )

if __name__ == "__main__":
    demo.queue(max_size=32).launch()