"""Gradio chat demo for Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled.

Runs on HF Spaces ZeroGPU. Model is loaded in 4-bit (bitsandbytes NF4) to fit
in the ZeroGPU tier's memory budget; the base Qwen3.6-35B-A3B activates only
~3B parameters per forward, so quantization cost on quality is small.

The distilled model produces <think>...</think> chain-of-thought before the
final answer. We surface that transparently: thinking is shown in a collapsed
<details> block, final answer is the body of the chat message. This matches
how Claude (the teacher) presents its reasoning.
"""

from __future__ import annotations

import os
import re
import threading

import gradio as gr
import spaces  # HF ZeroGPU shim — safe to import on CPU
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

MODEL_ID = "lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled"
MAX_NEW_TOKENS = 8192   # room for thinking + answer; very long problems may truncate
GEN_DURATION_SECONDS = 180  # ZeroGPU attach budget per call
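# If long problems hit the cap, raise GEN_DURATION_SECONDS or lower
# MAX_NEW_TOKENS; ZeroGPU aborts the call once the requested duration elapses.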

DESCRIPTION = """\
# Qwen3.6-35B-A3B · Claude-4.7-Opus Reasoning Distilled

**A 35B-parameter MoE (with only ~3B active per token) fine-tuned to imitate the chain-of-thought style of Claude Opus 4.7.** The model thinks in explicit `<think>…</think>` blocks before producing the final answer, as frontier reasoning systems do.

> Running in 4-bit (NF4) on ZeroGPU. First message per session may pause a few seconds while the GPU attaches. Long reasoning can take 30–60s — the model genuinely uses thousands of tokens of thinking on hard problems.

Model: [lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled](https://huggingface.co/lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled)
"""

EXAMPLES = [
    ["How many positive integers less than 1000 have digits that sum to 20?"],
    ["Prove that for any positive integer n, the sum 1 + 2 + ... + n equals n(n+1)/2."],
    ["A snail climbs a 10m wall, going up 3m during the day and slipping 2m at night. How many days to reach the top?"],
    ["Explain, at a graduate level, why photosynthesis requires light of specific wavelengths."],
    ["Write a Python function that efficiently finds the k-th smallest element in a sorted matrix."],
]

# ---------------------------------------------------------------------------
# Model load (happens on Space startup, once per replica). BnB 4-bit keeps
# the weight footprint around ~18 GB, comfortable within ZeroGPU memory.
# ---------------------------------------------------------------------------

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
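# Qwen tokenizers may ship without a pad token; falling back to EOS is the
# standard workaround. Because pad == eos after this, chat() below passes an
# explicit attention mask rather than letting generate() infer one from pads.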
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model.eval()


# ---------------------------------------------------------------------------
# Rendering helper: separate <think>…</think> from the final answer so the
# user sees the answer prominently and can expand the thinking to inspect it.
# ---------------------------------------------------------------------------
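# Matches one complete <think …>…</think> block (non-greedy; dot matches
# newlines), including any whitespace trailing the closing tag.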

_THINK_RE = re.compile(r"<think\b[^>]*>(.*?)</think>\s*", flags=re.DOTALL | re.IGNORECASE)


def render_response(text: str) -> str:
    """Convert raw model output into Markdown with a collapsible thinking
    section. Handles streaming partials gracefully (an unclosed <think>
    becomes an open "Thinking…" block until the closing tag arrives)."""
    # Split off a still-streaming (opened but not yet closed) thinking block
    # first, so its text cannot leak into the answer computed below.
    visible, unclosed = text, None
    open_idx = text.rfind("<think")
    close_idx = text.rfind("</think>")
    if open_idx > close_idx:
        # strip the opening tag for display
        unclosed = re.sub(r"^<think\b[^>]*>", "", text[open_idx:], flags=re.IGNORECASE)
        visible = text[:open_idx]

    thinks = _THINK_RE.findall(visible)
    answer = _THINK_RE.sub("", visible).strip()

    blocks: list[str] = []

    # Show any completed thinking blocks, collapsed.
    for i, t in enumerate(thinks, start=1):
        label = "Reasoning" if len(thinks) == 1 else f"Reasoning (step {i})"
        blocks.append(
            f"<details><summary>💭 {label}</summary>\n\n{t.strip()}\n\n</details>"
        )

    # Still-streaming thinking, shown expanded as a live spinner.
    if unclosed is not None:
        blocks.append(
            "<details open><summary>💭 Thinking…</summary>\n\n"
            f"{unclosed.strip()}\n\n</details>"
        )

    if answer:
        blocks.append(answer)

    return "\n\n".join(blocks) or text
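
# Illustrative behaviour (comments only, not executed):
#   render_response("<think>2+2=4</think>The answer is 4.")
#     -> a collapsed "💭 Reasoning" details block, then "The answer is 4."
#   render_response("<think>still working")
#     -> an open "💭 Thinking…" block (streaming partial; no answer yet)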


# ---------------------------------------------------------------------------
# Generation — wrapped in @spaces.GPU so ZeroGPU attaches a GPU for the call.
# ---------------------------------------------------------------------------

@spaces.GPU(duration=GEN_DURATION_SECONDS)
def chat(message: str, history: list[dict]):
    # `history` is already in OpenAI-style {"role","content"} form because
    # we set `type="messages"` on gr.ChatInterface below.
    messages: list[dict] = []
    for turn in history:
        if turn.get("role") in ("user", "assistant") and turn.get("content"):
            messages.append({"role": turn["role"], "content": turn["content"]})
    messages.append({"role": "user", "content": message})

    # return_dict=True also yields an attention mask. Since pad == eos here,
    # letting generate() infer the mask from pad tokens would be unreliable.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    gen_kwargs = dict(
        **inputs,  # input_ids + attention_mask
        streamer=streamer,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.0,
        pad_token_id=tokenizer.pad_token_id,
    )
    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield render_response(partial)

    thread.join(timeout=1.0)


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------

with gr.Blocks(fill_height=True, title="Qwen3.6 · Claude-4.7-Opus Distilled") as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        chat,
        type="messages",
        examples=EXAMPLES,
        cache_examples=False,
        fill_height=True,
        chatbot=gr.Chatbot(
            type="messages",
            render_markdown=True,
            sanitize_html=False,  # needed for our <details> markup; note model text also renders unsanitized
            height=600,
        ),
    )

if __name__ == "__main__":
    demo.queue(max_size=32).launch()
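
# --------------------------------------------------------------------------
# Programmatic access (illustrative sketch, not executed by this app): the
# deployed Space can be queried with gradio_client. The Space id below is a
# placeholder assumption; substitute the actual "<user>/<space>" id.
#
#   from gradio_client import Client
#   client = Client("<user>/<space>")
#   reply = client.predict("What is 17 * 23?", api_name="/chat")
#   print(reply)  # Markdown string, including the <details> reasoning block
# --------------------------------------------------------------------------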