Great fit for 2x4060 Ti - 83 tok/s

by mrchuy

Thanks for another tight quant. Here's the full podman run on 2x RTX 4060 Ti 16GB:

podman run -d \
  --name vllm-qwen36-35b-awq \
  --device nvidia.com/gpu=all \
  -v /data/models:/root/.cache/huggingface:Z \
  -v /data/vllm_cache:/root/.cache/vllm:Z \
  -p 8001:8000 \
  --env HF_HUB_OFFLINE=0 \
  --env TRANSFORMERS_OFFLINE=0 \
  --env HF_TOKEN=$HF_TOKEN \
  --ipc=host \
  -e NVIDIA_VISIBLE_DEVICES=all \
  -e LD_LIBRARY_PATH=/usr/lib64:/usr/local/nvidia/lib64 \
  -e VLLM_LOGGING_LEVEL=INFO \
  --restart=unless-stopped \
  docker.io/vllm/vllm-openai:latest \
  cyankiwi/Qwen3.6-35B-A3B-AWQ-4bit \
  --gpu-memory-utilization 0.95 \
  --tensor-parallel-size 2 \
  --dtype half \
  --kv-cache-dtype fp8 \
  --max-model-len 131072 \
  --max-num-batched-tokens 2096 \
  --max-num-seqs 2 \
  --enable-prefix-caching \
  --compilation-config '{"cudagraph_capture_sizes":[1,2,4,8],"compile_sizes":[1,2,4,8]}' \
  --reasoning-parser qwen3 \
  --enable-auto-tool-choice \
  --tool-call-parser qwen3_coder \
  --enable-log-requests \
  --no-enable-log-outputs \
  --no-enable-log-deltas \
  --max-log-len 12000 \
  --disable-access-log-for-endpoints /health,/metrics,/ping \
  --host 0.0.0.0 \
  --port 8000
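
Once the container is up, the OpenAI-compatible API is on host port 8001 (mapped from 8000 inside). A quick smoke test from the host (/health and /v1/chat/completions are vLLM's standard endpoints; the prompt is just a placeholder):

curl -s http://localhost:8001/health && echo "server up"

curl -s http://localhost:8001/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "cyankiwi/Qwen3.6-35B-A3B-AWQ-4bit",
        "messages": [{"role": "user", "content": "Reply with one word."}],
        "max_tokens": 32,
        "temperature": 0.1
      }'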

# Hermes tool calls seem to work with and without the 3.5 template; more testing to do on cache efficiency.
#   --chat-template /root/.cache/vllm/chat_template_dev.jinja \
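
If you want to sanity-check the qwen3_coder parser yourself, a minimal tool-call request like this should come back with a structured tool_calls entry (get_weather is a made-up function just for the test):

curl -s http://localhost:8001/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "cyankiwi/Qwen3.6-35B-A3B-AWQ-4bit",
        "messages": [{"role": "user", "content": "What is the weather in Tokyo?"}],
        "tool_choice": "auto",
        "tools": [{
          "type": "function",
          "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
              "type": "object",
              "properties": {"city": {"type": "string"}},
              "required": ["city"]
            }
          }
        }]
      }'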

(APIServer pid=1) INFO 04-17 11:44:59 [utils.py:299] 
(APIServer pid=1) INFO 04-17 11:44:59 [utils.py:299]        β–ˆ     β–ˆ     β–ˆβ–„   β–„β–ˆ
(APIServer pid=1) INFO 04-17 11:44:59 [utils.py:299]  β–„β–„ β–„β–ˆ β–ˆ     β–ˆ     β–ˆ β–€β–„β–€ β–ˆ  version 0.19.0
(APIServer pid=1) INFO 04-17 11:44:59 [utils.py:299]   β–ˆβ–„β–ˆβ–€ β–ˆ     β–ˆ     β–ˆ     β–ˆ  model   cyankiwi/Qwen3.6-35B-A3B-AWQ-4bit
(APIServer pid=1) INFO 04-17 11:44:59 [utils.py:299]    β–€β–€  β–€β–€β–€β–€β–€ β–€β–€β–€β–€β–€ β–€     β–€
(APIServer pid=1) INFO 04-17 11:44:59 [utils.py:299] 
(APIServer pid=1) INFO 04-17 11:44:59 [utils.py:233] non-default args: {'model_tag': 'cyankiwi/Qwen3.6-35B-A3B-AWQ-4bit', 'enable_auto_tool_choice': True, 'tool_call_parser': 'qwen3_coder', 'max_log_len': 12000, 'enable_log_deltas': False, 'host': '0.0.0.0', 'disable_access_log_for_endpoints': '/health,/metrics,/ping', 'model': 'cyankiwi/Qwen3.6-35B-A3B-AWQ-4bit', 'dtype': 'half', 'max_model_len': 131072, 'reasoning_parser': 'qwen3', 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'kv_cache_dtype': 'fp8', 'enable_prefix_caching': True, 'max_num_batched_tokens': 2096, 'max_num_seqs': 2, 'compilation_config': {'mode': None, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': [], 'splitting_ops': None, 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [1, 2, 4, 8], 'compile_ranges_endpoints': None, 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': None, 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [1, 2, 4, 8], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': None, 'pass_config': {}, 'max_cudagraph_capture_size': None, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': None, 'static_all_moe_layers': []}, 'enable_log_requests': True}
(APIServer pid=1) INFO 04-17 11:45:00 [model.py:549] Resolved architecture: Qwen3_5MoeForConditionalGeneration
(APIServer pid=1) WARNING 04-17 11:45:00 [model.py:2016] Casting torch.bfloat16 to torch.float16.
(APIServer pid=1) INFO 04-17 11:45:00 [model.py:1678] Using max model len 131072

(APIServer pid=1) INFO 04-17 11:49:59 [async_llm.py:420] Added request chatcmpl-b14d492a1c5816a6-bab5e844.
(APIServer pid=1) INFO 04-17 11:50:01 [logger.py:63] Received request chatcmpl-9d7b7bb8a7026509: params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=0.95, top_k=20, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=2000, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None), lora_request: None.
(APIServer pid=1) INFO 04-17 11:50:01 [async_llm.py:420] Added request chatcmpl-9d7b7bb8a7026509-8742bf3c.
(APIServer pid=1) INFO 04-17 11:50:07 [loggers.py:259] Engine 000: Avg prompt throughput: 424.9 tokens/s, Avg generation throughput: 55.6 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 2.6%, Prefix cache hit rate: 72.4%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:50:17 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 87.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 3.1%, Prefix cache hit rate: 72.4%, MM cache hit rate: 0.0%
...
(APIServer pid=1) INFO 04-17 11:50:45 [logger.py:63] Received request chatcmpl-9efdee3f97f1cfb3: params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=0.95, top_k=20, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=103814, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None), lora_request: None.
(APIServer pid=1) INFO 04-17 11:50:47 [loggers.py:259] Engine 000: Avg prompt throughput: 578.1 tokens/s, Avg generation throughput: 60.8 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 8.9%, Prefix cache hit rate: 85.1%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:50:57 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 83.8 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 8.9%, Prefix cache hit rate: 85.1%, MM cache hit rate: 0.0%

(APIServer pid=1) INFO 04-17 11:51:04 [logger.py:63] Received request chatcmpl-b7f0110e09cfa00f: params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=0.95, top_k=20, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=103645, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None), lora_request: None.
(APIServer pid=1) INFO 04-17 11:51:07 [loggers.py:259] Engine 000: Avg prompt throughput: 17.9 tokens/s, Avg generation throughput: 81.9 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 8.9%, Prefix cache hit rate: 86.3%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:51:17 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 83.7 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 8.9%, Prefix cache hit rate: 86.3%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:51:27 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 83.7 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 8.9%, Prefix cache hit rate: 86.3%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:51:37 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 83.6 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 9.4%, Prefix cache hit rate: 86.3%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:51:47 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 83.3 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 9.4%, Prefix cache hit rate: 86.3%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:51:57 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 83.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 9.9%, Prefix cache hit rate: 86.3%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:52:07 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 83.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 9.9%, Prefix cache hit rate: 86.3%, MM cache hit rate: 0.0%
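
The loggers.py lines above are the easiest way to eyeball throughput, but the Prometheus endpoint has cumulative counters too. The metric names below are what recent vLLM builds expose; check curl -s localhost:8001/metrics for the exact set on your version:

curl -s http://localhost:8001/metrics | grep -E 'vllm:(prompt_tokens_total|generation_tokens_total|gpu_cache_usage_perc)'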
Fri Apr 17 04:59:06 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 595.58.03              Driver Version: 595.58.03      CUDA Version: 13.2     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 4060 Ti     Off |   00000000:04:00.0 Off |                  N/A |
| 46%   66C    P2            117W /  165W |   15592MiB /  16380MiB |     98%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 4060 Ti     Off |   00000000:0A:00.0 Off |                  N/A |
| 68%   74C    P2            121W /  165W |   15592MiB /  16380MiB |     99%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A          500547      C   VLLM::Worker_TP0                      15582MiB |
|    1   N/A  N/A          500548      C   VLLM::Worker_TP1                      15582MiB |
+-----------------------------------------------------------------------------------------+
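
For the GPU side I just poll nvidia-smi in query mode (plain nvidia-smi, nothing vLLM-specific):

nvidia-smi --query-gpu=index,utilization.gpu,memory.used,power.draw,temperature.gpu --format=csv -l 5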

What tool was used to quantize this AWQ model?
