"""Gradio chat demo for Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled. Runs on HF Spaces ZeroGPU. Model is loaded in 4-bit (bitsandbytes NF4) to fit in the ZeroGPU tier's memory budget; the base Qwen3.6-35B-A3B activates only ~3B parameters per forward, so quantization cost on quality is small. The distilled model produces ... chain-of-thought before the final answer. We surface that transparently: thinking is shown in a collapsed
block, final answer is the body of the chat message. This matches how Claude (the teacher) presents its reasoning. """ from __future__ import annotations import os import re import threading import gradio as gr import spaces # HF ZeroGPU shim — safe to import on CPU import torch from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer, ) MODEL_ID = "lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled" MAX_NEW_TOKENS = 8192 # room for ~thinking + answer; long problems may truncate GEN_DURATION_SECONDS = 180 # ZeroGPU attach budget per call DESCRIPTION = """\ # Qwen3.6-35B-A3B · Claude-4.7-Opus Reasoning Distilled **A 35B-parameter MoE (with only ~3B active per token) fine-tuned to imitate the chain-of-thought style of Claude Opus 4.7.** The model thinks in explicit `` blocks before producing the final answer, same as frontier reasoning systems. > Running in 4-bit (NF4) on ZeroGPU. First message per session may pause a few seconds while the GPU attaches. Long reasoning can take 30–60s — the model genuinely uses thousands of tokens of thinking on hard problems. Model: [lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled](https://huggingface.co/lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled) """ EXAMPLES = [ ["How many positive integers less than 1000 have digits that sum to 20?"], ["Prove that for any positive integer n, the sum 1 + 2 + ... + n equals n(n+1)/2."], ["A snail climbs a 10m wall, going up 3m during the day and slipping 2m at night. How many days to reach the top?"], ["Explain, at a graduate level, why photosynthesis requires light of specific wavelengths."], ["Write a Python function that efficiently finds the k-th smallest element in a sorted matrix."], ] # --------------------------------------------------------------------------- # Model load (happens on Space startup, once per replica). BnB 4-bit keeps # the weight footprint around ~18 GB, comfortable within ZeroGPU memory. 
# ---------------------------------------------------------------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token_id is None:
    # Some causal-LM tokenizers ship without a pad token; reuse EOS so
    # generation utilities that pad don't crash.
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model.eval()

# ---------------------------------------------------------------------------
# Rendering helper: separate <think>…</think> reasoning from the final answer
# so the user sees the answer prominently and can expand the thinking to
# inspect it.
# ---------------------------------------------------------------------------
_THINK_RE = re.compile(r"<think[^>]*>(.*?)</think>\s*", flags=re.DOTALL | re.IGNORECASE)


def render_response(text: str) -> str:
    """Convert raw model output into Markdown with a collapsible thinking section.

    Handles streaming partials gracefully: an opened-but-unclosed <think>
    block is shown as a "Thinking…" spinner until the closing tag arrives.

    Args:
        text: Raw (possibly partial) decoded model output.

    Returns:
        Markdown string with completed thinking in collapsed <details>
        blocks, in-progress thinking in an open <details> block, and the
        final answer as the message body.
    """
    thinks = _THINK_RE.findall(text)
    answer = _THINK_RE.sub("", text).strip()

    blocks: list[str] = []
    # Show any completed thinking blocks, collapsed.
    for i, t in enumerate(thinks, start=1):
        label = "Reasoning" if len(thinks) == 1 else f"Reasoning (step {i})"
        blocks.append(
            f"<details>\n<summary>💭 {label}</summary>\n\n{t.strip()}\n\n</details>"
        )

    # Still-streaming thinking (opened but not yet closed) — show as spinner.
    # A <think> that appears after the last </think> means the model is still
    # inside a reasoning block.
    open_idx = text.rfind("<think")
    close_idx = text.rfind("</think>")
    if open_idx > close_idx:
        unclosed = text[open_idx:]
        # Strip the opening tag for display.
        unclosed = re.sub(r"^<think[^>]*>", "", unclosed, flags=re.IGNORECASE)
        blocks.append(
            "<details open>\n<summary>💭 Thinking…</summary>\n\n"
            f"{unclosed.strip()}\n\n</details>"
        )

    if answer:
        blocks.append(answer)

    return "\n\n".join(blocks) or text


# ---------------------------------------------------------------------------
# Generation — wrapped in @spaces.GPU so ZeroGPU attaches a GPU for the call.
# ---------------------------------------------------------------------------
@spaces.GPU(duration=GEN_DURATION_SECONDS)
def chat(message: str, history: list[dict]):
    """Stream a rendered assistant reply for `message` given chat `history`.

    Yields progressively longer Markdown strings (Gradio streaming protocol).
    """
    # `history` is already in OpenAI-style {"role","content"} form because
    # we set `type="messages"` on gr.ChatInterface below.
    messages: list[dict] = []
    for turn in history:
        if turn.get("role") in ("user", "assistant") and turn.get("content"):
            messages.append({"role": turn["role"], "content": turn["content"]})
    messages.append({"role": "user", "content": message})

    prompt_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    gen_kwargs = dict(
        input_ids=prompt_ids,
        streamer=streamer,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.0,
    )
    # Generate on a worker thread so we can consume the streamer here.
    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield render_response(partial)
    thread.join(timeout=1.0)


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
with gr.Blocks(fill_height=True, title="Qwen3.6 · Claude-4.7-Opus Distilled") as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        chat,
        type="messages",
        examples=EXAMPLES,
        cache_examples=False,
        fill_height=True,
        chatbot=gr.Chatbot(
            type="messages",
            render_markdown=True,
            sanitize_html=False,  # we emit <details>; trusted because we control the content
            height=600,
        ),
    )

if __name__ == "__main__":
    demo.queue(max_size=32).launch()