| """Gradio chat demo for Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled. | |
| Runs on HF Spaces ZeroGPU. Model is loaded in 4-bit (bitsandbytes NF4) to fit | |
| in the ZeroGPU tier's memory budget; the base Qwen3.6-35B-A3B activates only | |
| ~3B parameters per forward, so quantization cost on quality is small. | |
| The distilled model produces <think>...</think> chain-of-thought before the | |
| final answer. We surface that transparently: thinking is shown in a collapsed | |
| <details> block, final answer is the body of the chat message. This matches | |
| how Claude (the teacher) presents its reasoning. | |
| """ | |
from __future__ import annotations

import os
import re
import threading

import gradio as gr
import spaces  # HF ZeroGPU shim — safe to import on CPU
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)
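# Assumed Space dependencies (an illustrative, unpinned requirements.txt; not
# taken from the original repo): gradio, spaces, torch, transformers,
# accelerate, bitsandbytes. accelerate and bitsandbytes back the 4-bit NF4
# load further below.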
| MODEL_ID = "lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled" | |
| MAX_NEW_TOKENS = 2048 # demo latency cap; raise to 8192 for full reasoning | |
| # First call includes lazy model load (~2 min to 4-bit on GPU); subsequent | |
| # calls just generate. ZeroGPU keeps the loaded model resident across calls. | |
| GEN_DURATION_SECONDS = 300 | |
| DESCRIPTION = """\ | |
| # Qwen3.6-35B-A3B · Claude-4.7-Opus Reasoning Distilled | |
| **A 35B-parameter MoE (with only ~3B active per token) fine-tuned to imitate the chain-of-thought style of Claude Opus 4.7.** The model thinks in explicit `<think>…</think>` blocks before producing the final answer, same as frontier reasoning systems. | |
| > Running in 4-bit (NF4) on ZeroGPU. **First message of a session may take 2–3 minutes** (lazy model load of ~70GB on GPU). **Subsequent messages take 30–90 seconds** because the model genuinely emits thousands of thinking tokens — that's the reasoning distillation working as intended, not a bug. | |
| > | |
| > Responses capped at 2048 tokens for demo latency. For full-length reasoning, run the model locally with vLLM at 64k context. | |
| Model: [lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled](https://huggingface.co/lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled) | |
| """ | |
EXAMPLES = [
    ["How many positive integers less than 1000 have digits that sum to 20?"],
    ["Prove that for any positive integer n, the sum 1 + 2 + ... + n equals n(n+1)/2."],
    ["A snail climbs a 10m wall, going up 3m during the day and slipping 2m at night. How many days does it take to reach the top?"],
    ["Explain, at a graduate level, why photosynthesis requires light of specific wavelengths."],
    ["Write a Python function that efficiently finds the k-th smallest element in a sorted matrix."],
]
# ---------------------------------------------------------------------------
# Lazy model load. ZeroGPU only attaches a GPU inside @spaces.GPU functions;
# loading at module-import time would run on CPU only, and bnb 4-bit refuses
# to offload any modules to CPU. So we defer the load until the first chat
# call and keep module-level references; subsequent calls reuse the loaded
# model (ZeroGPU keeps the process and its GPU state across requests).
# ---------------------------------------------------------------------------
_model = None
_tokenizer = None
def _ensure_model_loaded() -> None:
    """Load weights on the currently-attached GPU. Must only be called from
    inside a @spaces.GPU-decorated function."""
    global _model, _tokenizer
    if _model is not None:
        return

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    if _tokenizer.pad_token_id is None:
        _tokenizer.pad_token_id = _tokenizer.eos_token_id

    # device_map="cuda" forces all layers onto the single attached GPU, with no
    # CPU/disk offload. bnb 4-bit only supports this mode.
    _model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="cuda",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )
    _model.eval()
# ---------------------------------------------------------------------------
# Rendering helper: separate <think>…</think> from the final answer so the
# user sees the answer prominently and can expand the thinking to inspect it.
# ---------------------------------------------------------------------------
_THINK_RE = re.compile(r"<think\b[^>]*>(.*?)</think>\s*", flags=re.DOTALL | re.IGNORECASE)
def render_response(text: str) -> str:
    """Convert raw model output into Markdown with a collapsible thinking
    section. Handles streaming partials gracefully (an unclosed <think> is
    shown as an expanded "Thinking…" block until the closing tag arrives)."""
    thinks = _THINK_RE.findall(text)
    answer = _THINK_RE.sub("", text).strip()
    blocks: list[str] = []

    # Show any completed thinking blocks, collapsed.
    for i, t in enumerate(thinks, start=1):
        label = "Reasoning" if len(thinks) == 1 else f"Reasoning (step {i})"
        blocks.append(
            f"<details><summary>💭 {label}</summary>\n\n{t.strip()}\n\n</details>"
        )

    # Still-streaming thinking (opened but not yet closed) — show it expanded.
    open_idx = text.rfind("<think")
    close_idx = text.rfind("</think>")
    if open_idx > close_idx:
        unclosed = text[open_idx:]
        # Strip the opening tag for display.
        unclosed = re.sub(r"^<think\b[^>]*>", "", unclosed, flags=re.IGNORECASE)
        blocks.append(
            "<details open><summary>💭 Thinking…</summary>\n\n"
            f"{unclosed.strip()}\n\n</details>"
        )
        # Recompute the answer from the text before the unclosed block so the
        # in-progress thinking is not duplicated in the answer body.
        answer = _THINK_RE.sub("", text[:open_idx]).strip()

    if answer:
        blocks.append(answer)
    return "\n\n".join(blocks) or text
# ---------------------------------------------------------------------------
# Generation — wrapped in @spaces.GPU so ZeroGPU attaches a GPU for the call.
# ---------------------------------------------------------------------------
@spaces.GPU(duration=GEN_DURATION_SECONDS)
def chat(message: str, history: list[dict]):
    # Load weights on first call (on the attached GPU); no-op after that.
    _ensure_model_loaded()

    # `history` is already in OpenAI-style {"role", "content"} form because
    # we set `type="messages"` on gr.ChatInterface below.
    messages: list[dict] = []
    for turn in history:
        if turn.get("role") in ("user", "assistant") and turn.get("content"):
            messages.append({"role": turn["role"], "content": turn["content"]})
    messages.append({"role": "user", "content": message})

    # return_dict=True gives a BatchEncoding with input_ids + attention_mask;
    # spreading it with ** into generate() is correct with or without the mask.
    inputs = _tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )
    inputs = {
        k: (v.to(_model.device) if hasattr(v, "to") else v)
        for k, v in inputs.items()
    }

    streamer = TextIteratorStreamer(
        _tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    gen_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.0,
    )
    thread = threading.Thread(target=_model.generate, kwargs=gen_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield render_response(partial)
    thread.join(timeout=1.0)
# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
with gr.Blocks(fill_height=True, title="Qwen3.6 · Claude-4.7-Opus Distilled") as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        chat,
        type="messages",
        examples=EXAMPLES,
        cache_examples=False,
        fill_height=True,
    )

if __name__ == "__main__":
    demo.queue(max_size=32).launch()