"""Gradio chat demo for Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled.
Runs on HF Spaces ZeroGPU. The model is loaded in 4-bit (bitsandbytes NF4) to
fit the ZeroGPU tier's memory budget; the base Qwen3.6-35B-A3B activates only
~3B parameters per forward pass, so the quality cost of quantization is small.
The distilled model produces <think>...</think> chain-of-thought before the
final answer. We surface that transparently: the thinking appears in a
collapsed <details> block and the final answer forms the body of the chat
message. This matches how Claude (the teacher) presents its reasoning.
"""
from __future__ import annotations
import os
import re
import threading
import gradio as gr
import spaces # HF ZeroGPU shim — safe to import on CPU
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TextIteratorStreamer,
)
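# Typical requirements.txt for this Space (an assumption; pin versions to what
# the Space actually ships): gradio, spaces, torch, transformers, accelerate,
# bitsandbytes. `accelerate` backs device_map; `bitsandbytes` backs NF4.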
MODEL_ID = "lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled"
MAX_NEW_TOKENS = 2048 # demo latency cap; raise to 8192 for full reasoning
# The first call also performs the lazy model load (~2 min to fetch and
# quantize the weights to 4-bit on the GPU); subsequent calls only generate.
# ZeroGPU keeps the loaded model resident across calls.
GEN_DURATION_SECONDS = 300
DESCRIPTION = """\
# Qwen3.6-35B-A3B · Claude-4.7-Opus Reasoning Distilled
**A 35B-parameter MoE (with only ~3B active per token) fine-tuned to imitate the chain-of-thought style of Claude Opus 4.7.** The model thinks in explicit `<think>…</think>` blocks before producing the final answer, same as frontier reasoning systems.
> Running in 4-bit (NF4) on ZeroGPU. **First message of a session may take 2–3 minutes** (the ~70GB checkpoint is lazily loaded and quantized on the GPU). **Subsequent messages take 30–90 seconds** because the model genuinely emits thousands of thinking tokens; that's the reasoning distillation working as intended, not a bug.
>
> Responses capped at 2048 tokens for demo latency. For full-length reasoning, run the model locally with vLLM at 64k context.
Model: [lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled](https://huggingface.co/lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled)
"""
EXAMPLES = [
["How many positive integers less than 1000 have digits that sum to 20?"],
["Prove that for any positive integer n, the sum 1 + 2 + ... + n equals n(n+1)/2."],
["A snail climbs a 10m wall, going up 3m during the day and slipping 2m at night. How many days to reach the top?"],
["Explain, at a graduate level, why photosynthesis requires light of specific wavelengths."],
["Write a Python function that efficiently finds the k-th smallest element in a sorted matrix."],
]
# ---------------------------------------------------------------------------
# Lazy model load. ZeroGPU only attaches a GPU inside @spaces.GPU functions;
# loading at module-import time would run CPU-only, and bnb-4bit refuses to
# offload modules to CPU. So we defer the load until the first chat call and
# keep module-level references; subsequent calls reuse the loaded model
# (ZeroGPU keeps the process and its GPU state alive across requests).
# ---------------------------------------------------------------------------
_model = None
_tokenizer = None
def _ensure_model_loaded() -> None:
"""Load weights on the currently-attached GPU. MUST only be called from
inside a @spaces.GPU-decorated function."""
global _model, _tokenizer
if _model is not None:
return
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
)
_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if _tokenizer.pad_token_id is None:
_tokenizer.pad_token_id = _tokenizer.eos_token_id
# device_map="cuda" forces all layers onto the single attached GPU, no
# CPU/disk offload. bnb-4bit only supports this mode.
_model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
quantization_config=bnb_config,
device_map="cuda",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
)
_model.eval()
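# Back-of-envelope memory budget (an estimate, not a measurement): 35B params
# at 4 bits/param is ~17.5 GB of weights, and NF4 double quantization trims the
# quantization-constant overhead further. Weights + KV cache + activations fit
# comfortably on a single ZeroGPU-attached device.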
# ---------------------------------------------------------------------------
# Rendering helper: separate <think>…</think> from the final answer so the
# user sees the answer prominently and can expand the thinking to inspect it.
# ---------------------------------------------------------------------------
_THINK_RE = re.compile(r"<think\b[^>]*>(.*?)</think>\s*", flags=re.DOTALL | re.IGNORECASE)
def render_response(text: str) -> str:
    """Convert raw model output into Markdown with a collapsible thinking
    section. Handles streaming partials gracefully: an unclosed <think> is
    rendered as an expanded "Thinking…" details block until the closing tag
    arrives."""
thinks = _THINK_RE.findall(text)
answer = _THINK_RE.sub("", text).strip()
blocks: list[str] = []
# Show any completed thinking blocks, collapsed.
for i, t in enumerate(thinks, start=1):
label = "Reasoning" if len(thinks) == 1 else f"Reasoning (step {i})"
blocks.append(
f"<details><summary>💭 {label}</summary>\n\n{t.strip()}\n\n</details>"
)
    # Still-streaming thinking (opened but not yet closed): shown expanded.
open_idx = text.rfind("<think")
close_idx = text.rfind("</think>")
if open_idx > close_idx:
unclosed = text[open_idx:]
# strip opening tag for display
unclosed = re.sub(r"^<think\b[^>]*>", "", unclosed, flags=re.IGNORECASE)
blocks.append(
"<details open><summary>💭 Thinking…</summary>\n\n"
f"{unclosed.strip()}\n\n</details>"
)
if answer:
blocks.append(answer)
return "\n\n".join(blocks) or text
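# Illustrative behavior of render_response (hypothetical model output):
#   render_response("<think>2 + 2 = 4.</think>The answer is **4**.")
# returns a collapsed reasoning block followed by the answer:
#   <details><summary>💭 Reasoning</summary>
#
#   2 + 2 = 4.
#
#   </details>
#
#   The answer is **4**.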
# ---------------------------------------------------------------------------
# Generation — wrapped in @spaces.GPU so ZeroGPU attaches a GPU for the call.
# ---------------------------------------------------------------------------
@spaces.GPU(duration=GEN_DURATION_SECONDS)
def chat(message: str, history: list[dict]):
# Load weights on first call (on the attached GPU); no-op after that.
_ensure_model_loaded()
# `history` is already in OpenAI-style {"role","content"} form because
# we set `type="messages"` on gr.ChatInterface below.
messages: list[dict] = []
for turn in history:
if turn.get("role") in ("user", "assistant") and turn.get("content"):
messages.append({"role": turn["role"], "content": turn["content"]})
messages.append({"role": "user", "content": message})
    # return_dict=True gives a BatchEncoding with input_ids + attention_mask;
    # spreading it into generate() with ** works with or without the mask.
inputs = _tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
return_tensors="pt",
return_dict=True,
)
inputs = {
k: (v.to(_model.device) if hasattr(v, "to") else v)
for k, v in inputs.items()
}
streamer = TextIteratorStreamer(
_tokenizer, skip_prompt=True, skip_special_tokens=True
)
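    # Note (assumption about the tokenizer): skip_special_tokens strips EOS/pad
    # markers, while Qwen3-style tokenizers register <think>/</think> as added
    # tokens with special=False, so the tags survive for render_response().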
gen_kwargs = dict(
**inputs,
streamer=streamer,
max_new_tokens=MAX_NEW_TOKENS,
do_sample=True,
temperature=0.7,
top_p=0.9,
repetition_penalty=1.0,
)
thread = threading.Thread(target=_model.generate, kwargs=gen_kwargs)
thread.start()
partial = ""
for new_text in streamer:
partial += new_text
yield render_response(partial)
thread.join(timeout=1.0)
# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
with gr.Blocks(fill_height=True, title="Qwen3.6 · Claude-4.7-Opus Distilled") as demo:
gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        chat,
        type="messages",
        examples=EXAMPLES,
        cache_examples=False,
        fill_height=True,
    )
if __name__ == "__main__":
demo.queue(max_size=32).launch()
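# Local full-length run, as DESCRIPTION suggests (a sketch; exact flags depend
# on your vLLM version and hardware):
#   vllm serve lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled \
#       --max-model-len 65536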