lordx64 committed
Commit f08cae3 · verified · 1 Parent(s): 7033e34

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +45 -24
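The commit message says app.py was pushed with `huggingface_hub`. For context, a minimal sketch of how such an upload is typically scripted; the repo id placeholder is hypothetical (the Space id is not shown on this page), and token setup via `huggingface-cli login` or `HF_TOKEN` is assumed:

```python
# Sketch: pushing app.py to a Space with huggingface_hub.
# Assumes credentials are already configured (huggingface-cli login / HF_TOKEN).
from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="app.py",        # local file to push
    path_in_repo="app.py",           # destination path inside the repo
    repo_id="lordx64/<space-name>",  # hypothetical Space id; not shown on this page
    repo_type="space",
    commit_message="Upload app.py with huggingface_hub",
)
```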
app.py CHANGED
@@ -28,7 +28,9 @@ from transformers import (
 
 MODEL_ID = "lordx64/Qwen3.6-35B-A3B-Claude-4.7-Opus-Reasoning-Distilled"
 MAX_NEW_TOKENS = 8192  # room for ~thinking + answer; long problems may truncate
-GEN_DURATION_SECONDS = 180  # ZeroGPU attach budget per call
+# First call includes lazy model load (~2 min to 4-bit on GPU); subsequent
+# calls just generate. ZeroGPU keeps the loaded model resident across calls.
+GEN_DURATION_SECONDS = 300
 
 DESCRIPTION = """\
 # Qwen3.6-35B-A3B · Claude-4.7-Opus Reasoning Distilled
@@ -49,29 +51,45 @@ EXAMPLES = [
 ]
 
 # ---------------------------------------------------------------------------
-# Model load (happens on Space startup, once per replica). BnB 4-bit keeps
-# the weight footprint around ~18 GB, comfortable within ZeroGPU memory.
+# Lazy model load. ZeroGPU only attaches a GPU inside @spaces.GPU functions;
+# loading at module-import time would run on CPU only, and bnb-4bit refuses
+# to offload any modules to CPU. So we defer the load until the first chat
+# call and keep module-level references; subsequent calls reuse the loaded
+# model (ZeroGPU keeps the process + its GPU state across requests).
 # ---------------------------------------------------------------------------
 
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-)
-
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-if tokenizer.pad_token_id is None:
-    tokenizer.pad_token_id = tokenizer.eos_token_id
-
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    quantization_config=bnb_config,
-    device_map="auto",
-    trust_remote_code=True,
-    torch_dtype=torch.bfloat16,
-)
-model.eval()
+_model = None
+_tokenizer = None
+
+
+def _ensure_model_loaded() -> None:
+    """Load weights on the currently-attached GPU. MUST only be called from
+    inside a @spaces.GPU-decorated function."""
+    global _model, _tokenizer
+    if _model is not None:
+        return
+
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=torch.bfloat16,
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_quant_type="nf4",
+    )
+
+    _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+    if _tokenizer.pad_token_id is None:
+        _tokenizer.pad_token_id = _tokenizer.eos_token_id
+
+    # device_map="cuda" forces all layers onto the single attached GPU, no
+    # CPU/disk offload. bnb-4bit only supports this mode.
+    _model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        quantization_config=bnb_config,
+        device_map="cuda",
+        trust_remote_code=True,
+        torch_dtype=torch.bfloat16,
+    )
+    _model.eval()
 
 
 # ---------------------------------------------------------------------------
@@ -122,6 +140,9 @@ def render_response(text: str) -> str:
 
 @spaces.GPU(duration=GEN_DURATION_SECONDS)
 def chat(message: str, history: list[dict]):
+    # Load weights on first call (on the attached GPU); no-op after that.
+    _ensure_model_loaded()
+
     # `history` is already in OpenAI-style {"role","content"} form because
     # we set `type="messages"` on gr.ChatInterface below.
     messages: list[dict] = []
@@ -130,14 +151,14 @@ def chat(message: str, history: list[dict]):
         messages.append({"role": turn["role"], "content": turn["content"]})
     messages.append({"role": "user", "content": message})
 
-    prompt_ids = tokenizer.apply_chat_template(
+    prompt_ids = _tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
        return_tensors="pt",
-    ).to(model.device)
+    ).to(_model.device)
 
     streamer = TextIteratorStreamer(
-        tokenizer, skip_prompt=True, skip_special_tokens=True
+        _tokenizer, skip_prompt=True, skip_special_tokens=True
     )
     gen_kwargs = dict(
         input_ids=prompt_ids,
@@ -148,7 +169,7 @@
         top_p=0.9,
         repetition_penalty=1.0,
     )
-    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
+    thread = threading.Thread(target=_model.generate, kwargs=gen_kwargs)
     thread.start()
 
    partial = ""
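The hunk ends at `partial = ""`, so the loop that consumes the `TextIteratorStreamer` falls outside the diff context. A sketch of what such a consumer typically looks like in a Gradio streaming chat function, assuming the body continues directly from the lines above; `render_response` is the helper named in the hunk header, but the exact loop is an assumption, not shown in this commit:

```python
    # Sketch of the streamer consumer (not part of this diff).
    # TextIteratorStreamer yields decoded text chunks as generate() produces
    # tokens on the background thread; yielding the growing string lets
    # gr.ChatInterface re-render the partial answer after every chunk.
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield render_response(partial)
    thread.join()  # ensure generate() has finished before the GPU detaches
```

Running `generate()` on a background thread is what makes this pattern work: the main thread stays free to iterate the streamer and yield partial output to Gradio while tokens are still being produced.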