lordx64 committed
Commit f08cae3 · verified · 1 Parent(s): 7033e34

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +45 -24
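The commit message says app.py was pushed with `huggingface_hub`. For context, a minimal sketch of how such an upload is typically scripted; the repo id placeholder is hypothetical (the Space id is not shown on this page), and token setup via `huggingface-cli login` or `HF_TOKEN` is assumed:

```python
# Sketch: pushing app.py to a Space with huggingface_hub.
# Assumes credentials are already configured (huggingface-cli login / HF_TOKEN).
from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="app.py",        # local file to push
    path_in_repo="app.py",           # destination path inside the repo
    repo_id="lordx64/<space-name>",  # hypothetical Space id; not shown on this page
    repo_type="space",
    commit_message="Upload app.py with huggingface_hub",
)
```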
app.py CHANGED
@@ -28,7 +28,9 @@ from transformers import (
 
 MODEL_ID = "lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled"
 MAX_NEW_TOKENS = 8192  # room for ~thinking + answer; long problems may truncate
-GEN_DURATION_SECONDS = 180  # ZeroGPU attach budget per call
+# First call includes lazy model load (~2 min to 4-bit on GPU); subsequent
+# calls just generate. ZeroGPU keeps the loaded model resident across calls.
+GEN_DURATION_SECONDS = 300
 
 DESCRIPTION = """\
 # Qwen3.6-35B-A3B · Claude-4.7-Opus Reasoning Distilled
@@ -49,29 +51,45 @@ EXAMPLES = [
 ]
 
 # ---------------------------------------------------------------------------
-# Model load (happens on Space startup, once per replica). BnB 4-bit keeps
-# the weight footprint around ~18 GB, comfortable within ZeroGPU memory.
+# Lazy model load. ZeroGPU only attaches a GPU inside @spaces.GPU functions;
+# loading at module-import time would run on CPU only, and bnb-4bit refuses
+# to offload any modules to CPU. So we defer the load until the first chat
+# call and keep module-level references; subsequent calls reuse the loaded
+# model (ZeroGPU keeps the process + its GPU state across requests).
 # ---------------------------------------------------------------------------
 
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-)
-
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-if tokenizer.pad_token_id is None:
-    tokenizer.pad_token_id = tokenizer.eos_token_id
-
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    quantization_config=bnb_config,
-    device_map="auto",
-    trust_remote_code=True,
-    torch_dtype=torch.bfloat16,
-)
-model.eval()
+_model = None
+_tokenizer = None
+
+
+def _ensure_model_loaded() -> None:
+    """Load weights on the currently-attached GPU. MUST only be called from
+    inside a @spaces.GPU-decorated function."""
+    global _model, _tokenizer
+    if _model is not None:
+        return
+
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=torch.bfloat16,
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_quant_type="nf4",
+    )
+
+    _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+    if _tokenizer.pad_token_id is None:
+        _tokenizer.pad_token_id = _tokenizer.eos_token_id
+
+    # device_map="cuda" forces all layers onto the single attached GPU, no
+    # CPU/disk offload. bnb-4bit only supports this mode.
+    _model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        quantization_config=bnb_config,
+        device_map="cuda",
+        trust_remote_code=True,
+        torch_dtype=torch.bfloat16,
+    )
+    _model.eval()
 
 
 # ---------------------------------------------------------------------------
@@ -122,6 +140,9 @@ def render_response(text: str) -> str:
 
 @spaces.GPU(duration=GEN_DURATION_SECONDS)
 def chat(message: str, history: list[dict]):
+    # Load weights on first call (on the attached GPU); no-op after that.
+    _ensure_model_loaded()
+
     # `history` is already in OpenAI-style {"role","content"} form because
     # we set `type="messages"` on gr.ChatInterface below.
     messages: list[dict] = []
@@ -130,14 +151,14 @@ def chat(message: str, history: list[dict]):
         messages.append({"role": turn["role"], "content": turn["content"]})
     messages.append({"role": "user", "content": message})
 
-    prompt_ids = tokenizer.apply_chat_template(
+    prompt_ids = _tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
        return_tensors="pt",
-    ).to(model.device)
+    ).to(_model.device)
 
     streamer = TextIteratorStreamer(
-        tokenizer, skip_prompt=True, skip_special_tokens=True
+        _tokenizer, skip_prompt=True, skip_special_tokens=True
     )
     gen_kwargs = dict(
         input_ids=prompt_ids,
@@ -148,7 +169,7 @@
         top_p=0.9,
         repetition_penalty=1.0,
     )
-    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
+    thread = threading.Thread(target=_model.generate, kwargs=gen_kwargs)
     thread.start()
 
    partial = ""
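The hunk ends at `partial = ""`, so the loop that consumes the `TextIteratorStreamer` falls outside the diff context. A sketch of what such a consumer typically looks like in a Gradio streaming chat function, assuming the body continues directly from the lines above; `render_response` is the helper named in the hunk header, but the exact loop is an assumption, not shown in this commit:

```python
    # Sketch of the streamer consumer (not part of this diff).
    # TextIteratorStreamer yields decoded text chunks as generate() produces
    # tokens on the background thread; yielding the growing string lets
    # gr.ChatInterface re-render the partial answer after every chunk.
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield render_response(partial)
    thread.join()  # ensure generate() has finished before the GPU detaches
```

Running `generate()` on a background thread is what makes this pattern work: the main thread stays free to iterate the streamer and yield partial output to Gradio while tokens are still being produced.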