"""Gradio chat demo for Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled.
Runs on HF Spaces ZeroGPU. Model is loaded in 4-bit (bitsandbytes NF4) to fit
in the ZeroGPU tier's memory budget; the base Qwen3.6-35B-A3B activates only
~3B parameters per forward pass, so the quality cost of quantization is small.
The distilled model produces <think>...</think> chain-of-thought before the
final answer. We surface that transparently: thinking is shown in a collapsed
<details> block and the final answer is the body of the chat message. This matches
how Claude (the teacher) presents its reasoning.
"""
from __future__ import annotations
import os
import re
import threading
import gradio as gr
import spaces # HF ZeroGPU shim — safe to import on CPU
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)
MODEL_ID = "lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled"
MAX_NEW_TOKENS = 8192  # headroom for thinking + answer; very long problems may still truncate
GEN_DURATION_SECONDS = 180 # ZeroGPU attach budget per call
DESCRIPTION = """\
# Qwen3.6-35B-A3B · Claude-4.7-Opus Reasoning Distilled
**A 35B-parameter MoE (with only ~3B active per token) fine-tuned to imitate the chain-of-thought style of Claude Opus 4.7.** The model thinks in explicit `<think>…</think>` blocks before producing the final answer, in the same way frontier reasoning systems do.
> Running in 4-bit (NF4) on ZeroGPU. The first message per session may pause for a few seconds while the GPU attaches. Long reasoning can take 30–60s; the model genuinely uses thousands of tokens of thinking on hard problems.
Model: [lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled](https://huggingface.co/lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled)
"""
EXAMPLES = [
["How many positive integers less than 1000 have digits that sum to 20?"],
["Prove that for any positive integer n, the sum 1 + 2 + ... + n equals n(n+1)/2."],
["A snail climbs a 10m wall, going up 3m during the day and slipping 2m at night. How many days to reach the top?"],
["Explain, at a graduate level, why photosynthesis requires light of specific wavelengths."],
["Write a Python function that efficiently finds the k-th smallest element in a sorted matrix."],
]
# ---------------------------------------------------------------------------
# Model load (happens on Space startup, once per replica). BnB 4-bit keeps
# the weight footprint around 18 GB, comfortably within ZeroGPU memory.
# ---------------------------------------------------------------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model.eval()
# ---------------------------------------------------------------------------
# Rendering helper: separate <think>…</think> from the final answer so the
# user sees the answer prominently and can expand the thinking to inspect it.
# ---------------------------------------------------------------------------
_THINK_RE = re.compile(r"<think\b[^>]*>(.*?)</think>\s*", flags=re.DOTALL | re.IGNORECASE)
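# Matches one complete <think>…</think> block (the opening tag may carry
# attributes) plus any trailing whitespace, so it can be stripped from the answer.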
def render_response(text: str) -> str:
"""Convert raw model output into Markdown with a collapsible thinking
section. Handles streaming partials gracefully (unclosed <think> becomes
a "Thinking…" spinner until the closing tag arrives)."""
thinks = _THINK_RE.findall(text)
answer = _THINK_RE.sub("", text).strip()
blocks: list[str] = []
# Show any completed thinking blocks, collapsed.
for i, t in enumerate(thinks, start=1):
label = "Reasoning" if len(thinks) == 1 else f"Reasoning (step {i})"
blocks.append(
f"<details><summary>💭 {label}</summary>\n\n{t.strip()}\n\n</details>"
)
# Still-streaming thinking (opened but not yet closed) — show as spinner.
open_idx = text.rfind("<think")
close_idx = text.rfind("</think>")
if open_idx > close_idx:
unclosed = text[open_idx:]
# strip opening tag for display
unclosed = re.sub(r"^<think\b[^>]*>", "", unclosed, flags=re.IGNORECASE)
blocks.append(
"<details open><summary>💭 Thinking…</summary>\n\n"
f"{unclosed.strip()}\n\n</details>"
)
if answer:
blocks.append(answer)
return "\n\n".join(blocks) or text
# ---------------------------------------------------------------------------
# Generation — wrapped in @spaces.GPU so ZeroGPU attaches a GPU for the call.
# ---------------------------------------------------------------------------
@spaces.GPU(duration=GEN_DURATION_SECONDS)
def chat(message: str, history: list[dict]):
    # `history` is already in OpenAI-style {"role", "content"} dicts because
    # we set `type="messages"` on gr.ChatInterface below.
    messages: list[dict] = []
    for turn in history:
        if turn.get("role") in ("user", "assistant") and turn.get("content"):
            messages.append({"role": turn["role"], "content": turn["content"]})
    messages.append({"role": "user", "content": message})
    prompt_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    gen_kwargs = dict(
        input_ids=prompt_ids,
        streamer=streamer,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.0,  # 1.0 disables the penalty
    )
    # Generate in a background thread so tokens can be streamed as they arrive.
    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield render_response(partial)
    thread.join(timeout=1.0)
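# Local smoke test (illustrative, not run on Spaces; assumes the model loaded
# above and a GPU is available):
#   for chunk in chat("What is 17 * 24?", history=[]):
#       print(chunk)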
# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
with gr.Blocks(fill_height=True, title="Qwen3.6 · Claude-4.7-Opus Distilled") as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        chat,
        type="messages",
        examples=EXAMPLES,
        cache_examples=False,
        fill_height=True,
        chatbot=gr.Chatbot(
            type="messages",
            render_markdown=True,
            sanitize_html=False,  # we emit <details>; trusted because we control the content
            height=600,
        ),
    )
if __name__ == "__main__":
    demo.queue(max_size=32).launch()