"""Gradio chat demo for Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled.

Runs on HF Spaces ZeroGPU. The model is loaded in 4-bit (bitsandbytes NF4) to
fit the ZeroGPU tier's memory budget; since the base Qwen3.6-35B-A3B MoE
activates only ~3B parameters per forward pass, the quality cost of
quantization stays small.

The distilled model produces <think>...</think> chain-of-thought before the
final answer. We surface that transparently: the thinking is shown in a
collapsed <details> block, and the final answer forms the body of the chat
message. This matches how Claude (the teacher) presents its reasoning.
"""

from __future__ import annotations

import os
import re
import threading

import gradio as gr
import spaces  # HF ZeroGPU shim — safe to import on CPU
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

MODEL_ID = "lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled"
MAX_NEW_TOKENS = 2048   # demo latency cap; raise to 8192 for full reasoning
# First call includes the lazy model load (~2 min to fetch and quantize to
# 4-bit on the GPU); subsequent calls just generate. ZeroGPU keeps the loaded
# model resident across calls.
GEN_DURATION_SECONDS = 300
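# 300 s has to cover the worst case: the one-time lazy load plus a full
# 2048-token generation. Steady-state calls finish well inside the budget.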

DESCRIPTION = """\
# Qwen3.6-35B-A3B · Claude-4.7-Opus Reasoning Distilled

**A 35B-parameter MoE (with only ~3B active per token) fine-tuned to imitate the chain-of-thought style of Claude Opus 4.7.** The model thinks in explicit `<think>…</think>` blocks before producing the final answer, same as frontier reasoning systems.

> Running in 4-bit (NF4) on ZeroGPU. **First message of a session may take 2–3 minutes** while the ~70GB bf16 checkpoint is lazily fetched and quantized to 4-bit on the GPU. **Subsequent messages take 30–90 seconds** because the model genuinely emits thousands of thinking tokens — that's the reasoning distillation working as intended, not a bug.
>
> Responses capped at 2048 tokens for demo latency. For full-length reasoning, run the model locally with vLLM at 64k context.

Model: [lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled](https://huggingface.co/lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled)
"""

EXAMPLES = [
    ["How many positive integers less than 1000 have digits that sum to 20?"],
    ["Prove that for any positive integer n, the sum 1 + 2 + ... + n equals n(n+1)/2."],
    ["A snail climbs a 10m wall, going up 3m during the day and slipping 2m at night. How many days to reach the top?"],
    ["Explain, at a graduate level, why photosynthesis requires light of specific wavelengths."],
    ["Write a Python function that efficiently finds the k-th smallest element in a sorted matrix."],
]

# ---------------------------------------------------------------------------
# Lazy model load. ZeroGPU only attaches a GPU inside @spaces.GPU functions;
# loading at module-import time would run on CPU-only, and bnb-4bit refuses
# to offload any modules to CPU. So we defer the load until the first chat
# call and keep module-level references; subsequent calls reuse the loaded
# model (ZeroGPU keeps the process + its GPU state across requests).
# ---------------------------------------------------------------------------

_model = None
_tokenizer = None


def _ensure_model_loaded() -> None:
    """Load weights on the currently-attached GPU. MUST only be called from
    inside a @spaces.GPU-decorated function."""
    global _model, _tokenizer
    if _model is not None:
        return

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )

    _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    if _tokenizer.pad_token_id is None:
        _tokenizer.pad_token_id = _tokenizer.eos_token_id
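    # Many causal-LM tokenizers ship without a pad token; reusing EOS is the
    # standard fallback and is harmless here since we never pad a batch.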

    # device_map="cuda" forces all layers onto the single attached GPU, no
    # CPU/disk offload. bnb-4bit only supports this mode.
    _model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="cuda",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )
    _model.eval()
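
    # Rough sizing (back-of-envelope, not measured): 35B params at 4 bits is
    # ~17.5 GB of weights, plus quantization constants and KV cache, which
    # fits a single ZeroGPU device with headroom.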


# ---------------------------------------------------------------------------
# Rendering helper: separate <think>…</think> from the final answer so the
# user sees the answer prominently and can expand the thinking to inspect it.
# ---------------------------------------------------------------------------

_THINK_RE = re.compile(r"<think\b[^>]*>(.*?)</think>\s*", flags=re.DOTALL | re.IGNORECASE)
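# DOTALL lets a block span newlines, the non-greedy body keeps consecutive
# blocks separate, `[^>]*` tolerates attributes on the opening tag, and the
# trailing \s* swallows the whitespace gap after each closed block.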


def render_response(text: str) -> str:
    """Convert raw model output into Markdown with a collapsible thinking
    section. Handles streaming partials gracefully (an unclosed <think>
    becomes a "Thinking…" spinner until the closing tag arrives)."""
    thinks = _THINK_RE.findall(text)

    # Detect a still-streaming thinking block (opened but not yet closed)
    # first, and cut it out of `text`; otherwise the partial thinking would
    # leak into the answer body and be rendered twice.
    open_idx = text.rfind("<think")
    close_idx = text.rfind("</think>")
    still_thinking = open_idx > close_idx
    unclosed = ""
    if still_thinking:
        # strip the opening tag for display
        unclosed = re.sub(
            r"^<think\b[^>]*>", "", text[open_idx:], flags=re.IGNORECASE
        )
        text = text[:open_idx]

    answer = _THINK_RE.sub("", text).strip()

    blocks: list[str] = []

    # Show any completed thinking blocks, collapsed.
    for i, t in enumerate(thinks, start=1):
        label = "Reasoning" if len(thinks) == 1 else f"Reasoning (step {i})"
        blocks.append(
            f"<details><summary>💭 {label}</summary>\n\n{t.strip()}\n\n</details>"
        )

    # Still-streaming thinking — show expanded, as a spinner.
    if still_thinking:
        blocks.append(
            "<details open><summary>💭 Thinking…</summary>\n\n"
            f"{unclosed.strip()}\n\n</details>"
        )

    if answer:
        blocks.append(answer)

    return "\n\n".join(blocks) or text


# ---------------------------------------------------------------------------
# Generation — wrapped in @spaces.GPU so ZeroGPU attaches a GPU for the call.
# ---------------------------------------------------------------------------

@spaces.GPU(duration=GEN_DURATION_SECONDS)
def chat(message: str, history: list[dict]):
    # Load weights on first call (on the attached GPU); no-op after that.
    _ensure_model_loaded()

    # `history` is already in OpenAI-style {"role","content"} form because
    # we set `type="messages"` on gr.ChatInterface below.
    messages: list[dict] = []
    for turn in history:
        if turn.get("role") in ("user", "assistant") and turn.get("content"):
            messages.append({"role": turn["role"], "content": turn["content"]})
    messages.append({"role": "user", "content": message})

    # return_dict=True so we get a BatchEncoding with input_ids + attention_mask;
    # spreading with ** into generate() is correct both with and without mask.
    inputs = _tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )
    inputs = {
        k: (v.to(_model.device) if hasattr(v, "to") else v)
        for k, v in inputs.items()
    }

    streamer = TextIteratorStreamer(
        _tokenizer, skip_prompt=True, skip_special_tokens=True
    )
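    # Sampling is a judgment call: Qwen3's model card suggests roughly
    # temperature 0.6 / top_p 0.95 for thinking mode; 0.7 / 0.9 here trades a
    # little of that for livelier demo answers.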
    gen_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.0,
    )
    thread = threading.Thread(target=_model.generate, kwargs=gen_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield render_response(partial)

    thread.join(timeout=1.0)


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------

with gr.Blocks(fill_height=True, title="Qwen3.6 · Claude-4.7-Opus Distilled") as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        chat,
        type="messages",  # history arrives as {"role","content"} dicts (see chat())
        examples=EXAMPLES,
        cache_examples=False,
        fill_height=True,
    )

if __name__ == "__main__":
    demo.queue(max_size=32).launch()
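
# Off-Spaces smoke test: the `spaces` package makes @spaces.GPU a no-op when
# not running on ZeroGPU, so `python app.py` should also work on a local CUDA
# box with ~20 GB of free VRAM (assumes bitsandbytes + recent transformers).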