Great fit for 2x 4060 Ti - 83 tok/s
#4
by mrchuy - opened
Thanks for another tight quant.
podman run -d \
--name vllm-qwen36-35b-awq \
--device nvidia.com/gpu=all \
-v /data/models:/root/.cache/huggingface:Z \
-v /data/vllm_cache:/root/.cache/vllm:Z \
-p 8001:8000 \
--env HF_HUB_OFFLINE=0 \
--env TRANSFORMERS_OFFLINE=0 \
--env HF_TOKEN=$HF_TOKEN \
--ipc=host \
-e NVIDIA_VISIBLE_DEVICES=all \
-e LD_LIBRARY_PATH=/usr/lib64:/usr/local/nvidia/lib64 \
-e VLLM_LOGGING_LEVEL=INFO \
--restart=unless-stopped \
docker.io/vllm/vllm-openai:latest \
cyankiwi/Qwen3.6-35B-A3B-AWQ-4bit \
--gpu-memory-utilization 0.95 \
--tensor-parallel-size 2 \
--dtype half \
--kv-cache-dtype fp8 \
--max-model-len 131072 \
--max-num-batched-tokens 2096 \
--max-num-seqs 2 \
--enable-prefix-caching \
--compilation-config '{"cudagraph_capture_sizes":[1,2,4,8],"compile_sizes":[1,2,4,8]}' \
--reasoning-parser qwen3 \
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--enable-log-requests \
--no-enable-log-outputs \
--no-enable-log-deltas \
--max-log-len 12000 \
--disable-access-log-for-endpoints /health,/metrics,/ping \
--host 0.0.0.0 \
--port 8000
# Hermes tool calls seem to work with and without the 3.5 template; more testing to do for cache efficiency
# --chat-template /root/.cache/vllm/chat_template_dev.jinja \
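Once the container is up, a quick smoke test from the host against the mapped port (a minimal sketch; the prompt is arbitrary and the sampling values just mirror the first request in the logs below):

curl http://localhost:8001/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "cyankiwi/Qwen3.6-35B-A3B-AWQ-4bit",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "temperature": 0.1,
        "top_p": 0.95,
        "max_tokens": 2000
      }'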
(APIServer pid=1) INFO 04-17 11:44:59 [utils.py:299] [vLLM ASCII-art startup banner]
(APIServer pid=1) INFO 04-17 11:44:59 [utils.py:299]   version 0.19.0
(APIServer pid=1) INFO 04-17 11:44:59 [utils.py:299]   model cyankiwi/Qwen3.6-35B-A3B-AWQ-4bit
(APIServer pid=1) INFO 04-17 11:44:59 [utils.py:233] non-default args: {'model_tag': 'cyankiwi/Qwen3.6-35B-A3B-AWQ-4bit', 'enable_auto_tool_choice': True, 'tool_call_parser': 'qwen3_coder', 'max_log_len': 12000, 'enable_log_deltas': False, 'host': '0.0.0.0', 'disable_access_log_for_endpoints': '/health,/metrics,/ping', 'model': 'cyankiwi/Qwen3.6-35B-A3B-AWQ-4bit', 'dtype': 'half', 'max_model_len': 131072, 'reasoning_parser': 'qwen3', 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'kv_cache_dtype': 'fp8', 'enable_prefix_caching': True, 'max_num_batched_tokens': 2096, 'max_num_seqs': 2, 'compilation_config': {'mode': None, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': [], 'splitting_ops': None, 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [1, 2, 4, 8], 'compile_ranges_endpoints': None, 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': None, 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [1, 2, 4, 8], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': None, 'pass_config': {}, 'max_cudagraph_capture_size': None, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': None, 'static_all_moe_layers': []}, 'enable_log_requests': True}
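Since no --served-model-name is set, vLLM serves the model under its full path, which can be confirmed from the host:

curl http://localhost:8001/v1/models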
(APIServer pid=1) INFO 04-17 11:45:00 [model.py:549] Resolved architecture: Qwen3_5MoeForConditionalGeneration
(APIServer pid=1) WARNING 04-17 11:45:00 [model.py:2016] Casting torch.bfloat16 to torch.float16.
(APIServer pid=1) INFO 04-17 11:45:00 [model.py:1678] Using max model len 131072
(APIServer pid=1) INFO 04-17 11:49:59 [async_llm.py:420] Added request chatcmpl-b14d492a1c5816a6-bab5e844.
(APIServer pid=1) INFO 04-17 11:50:01 [logger.py:63] Received request chatcmpl-9d7b7bb8a7026509: params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.1, top_p=0.95, top_k=20, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=2000, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None), lora_request: None.
(APIServer pid=1) INFO 04-17 11:50:01 [async_llm.py:420] Added request chatcmpl-9d7b7bb8a7026509-8742bf3c.
(APIServer pid=1) INFO 04-17 11:50:07 [loggers.py:259] Engine 000: Avg prompt throughput: 424.9 tokens/s, Avg generation throughput: 55.6 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 2.6%, Prefix cache hit rate: 72.4%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:50:17 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 87.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 3.1%, Prefix cache hit rate: 72.4%, MM cache hit rate: 0.0%
...
(APIServer pid=1) INFO 04-17 11:50:45 [logger.py:63] Received request chatcmpl-9efdee3f97f1cfb3: params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=0.95, top_k=20, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=103814, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None), lora_request: None.
(APIServer pid=1) INFO 04-17 11:50:47 [loggers.py:259] Engine 000: Avg prompt throughput: 578.1 tokens/s, Avg generation throughput: 60.8 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 8.9%, Prefix cache hit rate: 85.1%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:50:57 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 83.8 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 8.9%, Prefix cache hit rate: 85.1%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:51:04 [logger.py:63] Received request chatcmpl-b7f0110e09cfa00f: params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=0.95, top_k=20, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], thinking_token_budget=None, include_stop_str_in_output=False, ignore_eos=False, max_tokens=103645, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, structured_outputs=None, extra_args=None), lora_request: None.
(APIServer pid=1) INFO 04-17 11:51:07 [loggers.py:259] Engine 000: Avg prompt throughput: 17.9 tokens/s, Avg generation throughput: 81.9 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 8.9%, Prefix cache hit rate: 86.3%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:51:17 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 83.7 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 8.9%, Prefix cache hit rate: 86.3%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:51:27 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 83.7 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 8.9%, Prefix cache hit rate: 86.3%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:51:37 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 83.6 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 9.4%, Prefix cache hit rate: 86.3%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:51:47 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 83.3 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 9.4%, Prefix cache hit rate: 86.3%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:51:57 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 83.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 9.9%, Prefix cache hit rate: 86.3%, MM cache hit rate: 0.0%
(APIServer pid=1) INFO 04-17 11:52:07 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 83.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 9.9%, Prefix cache hit rate: 86.3%, MM cache hit rate: 0.0%
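To watch the generation rate the title quotes without tailing the whole log, the engine stats lines can be filtered (assuming the container name from the run command above):

podman logs -f vllm-qwen36-35b-awq 2>&1 | grep "Avg generation throughput"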
Fri Apr 17 04:59:06 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 595.58.03 Driver Version: 595.58.03 CUDA Version: 13.2 |
+-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 4060 Ti Off | 00000000:04:00.0 Off | N/A |
| 46% 66C P2 117W / 165W | 15592MiB / 16380MiB | 98% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 1 NVIDIA GeForce RTX 4060 Ti Off | 00000000:0A:00.0 Off | N/A |
| 68% 74C P2 121W / 165W | 15592MiB / 16380MiB | 99% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 500547 C VLLM::Worker_TP0 15582MiB |
| 1 N/A N/A 500548 C VLLM::Worker_TP1 15582MiB |
+-----------------------------------------------------------------------------------------+
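For a lighter-weight view of the same load picture during a run, nvidia-smi's query mode skips the full table (refreshing every 5 s):

nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,power.draw --format=csv -l 5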
What tool was used to quantize this AWQ model?