Text Generation
Transformers
Safetensors
multilingual
gemma4
image-text-to-text
nvidia
nvfp4
modelopt
quantized
Mixture of Experts
dgx-spark
blackwell
W4A16
post-training-quantization
conversational
8-bit precision
Instructions to use bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)

# Load model directly
from transformers import AutoProcessor, AutoModelForMultimodalLM

processor = AutoProcessor.from_pretrained("bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16")
model = AutoModelForMultimodalLM.from_pretrained("bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16 with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker
docker model run hf.co/bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16
- SGLang
How to use bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16 with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
- Docker Model Runner
How to use bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16 with Docker Model Runner:
docker model run hf.co/bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16
Upload gemma4_patched.py with huggingface_hub
Browse files
- gemma4_patched.py +0 -5
gemma4_patched.py
CHANGED
|
@@ -1027,9 +1027,6 @@ class Gemma4Model(nn.Module):
|
|
| 1027 |
if is_pp_missing_parameter(moe_name, self):
|
| 1028 |
continue
|
| 1029 |
param = params_dict[moe_name]
|
| 1030 |
-
# Debug: log first expert of first layer
|
| 1031 |
-
if expert_id == 0 and "layers.0." in name:
|
| 1032 |
-
print(f"[PATCH LOAD] {name} -> {moe_name} | wl={wl_name} shard={shard_id} eid={expert_id} | weight:{list(loaded_weight.shape)} param:{list(param.shape)}")
|
| 1033 |
weight_loader = param.weight_loader
|
| 1034 |
weight_loader(
|
| 1035 |
param,
|
|
@@ -1049,7 +1046,6 @@ class Gemma4Model(nn.Module):
|
|
| 1049 |
if is_pp_missing_parameter(name, self):
|
| 1050 |
continue
|
| 1051 |
if name not in params_dict:
|
| 1052 |
-
print(f"[PATCH DEBUG] Skipping unknown key: {name}")
|
| 1053 |
continue
|
| 1054 |
param = params_dict[name]
|
| 1055 |
weight_loader = getattr(
|
|
@@ -1060,7 +1056,6 @@ class Gemma4Model(nn.Module):
|
|
| 1060 |
import inspect
|
| 1061 |
sig = inspect.signature(weight_loader)
|
| 1062 |
if len(sig.parameters) > 3 and "expert_id" in sig.parameters:
|
| 1063 |
-
print(f"[PATCH DEBUG] Skipping FusedMoE param that fell through mapping: {name}")
|
| 1064 |
continue
|
| 1065 |
weight_loader(param, loaded_weight)
|
| 1066 |
loaded_params.add(name)
|
|
|
|
| 1027 |
if is_pp_missing_parameter(moe_name, self):
|
| 1028 |
continue
|
| 1029 |
param = params_dict[moe_name]
|
|
|
|
|
|
|
|
|
|
| 1030 |
weight_loader = param.weight_loader
|
| 1031 |
weight_loader(
|
| 1032 |
param,
|
|
|
|
| 1046 |
if is_pp_missing_parameter(name, self):
|
| 1047 |
continue
|
| 1048 |
if name not in params_dict:
|
|
|
|
| 1049 |
continue
|
| 1050 |
param = params_dict[name]
|
| 1051 |
weight_loader = getattr(
|
|
|
|
| 1056 |
import inspect
|
| 1057 |
sig = inspect.signature(weight_loader)
|
| 1058 |
if len(sig.parameters) > 3 and "expert_id" in sig.parameters:
|
|
|
|
| 1059 |
continue
|
| 1060 |
weight_loader(param, loaded_weight)
|
| 1061 |
loaded_params.add(name)
|