Large MoE Architecture Search (1B-2B)
Collection
Systematic search for 1B-2B MoE models. Best: bs=1, ctx=2048 achieves 0.32 loss. Top-8 routing beats top-2. • 20 items • Updated
Mobile-optimized MoE model configuration from architecture search.
# Usage example: load the searched MoE model (config + safetensors weights)
# and its matching tokenizer.
#
# Fix vs. original snippet: `config = json.load(f)` was not indented under
# the `with` block, making the code a SyntaxError; imports are also grouped
# at the top per PEP 8. Runtime behavior is otherwise unchanged.
import json

from safetensors.torch import load_file
from transformers import AutoTokenizer

from architecture.model import Qwen3Model  # Your custom model class

# Load the architecture config that defines the model shape.
with open("config.json") as f:
    config = json.load(f)

# Initialize the model from the config, then load the trained weights.
model = Qwen3Model(config)
state_dict = load_file("model.safetensors")
model.load_state_dict(state_dict)

# Load the tokenizer paired with this checkpoint from the Hub.
tokenizer = AutoTokenizer.from_pretrained(
    "kshitijthakkar/moe-1083m-781m-16x8-8L-large-moe-1.3b-lr5e-06"
)
{
"vocab_size": 151936,
"emb_dim": 2048,
"n_heads": 32,
"n_layers": 8,
"n_kv_groups": 8,
"num_experts": 16,
"num_experts_per_tok": 8,
"moe_hidden_dim": 768,
"head_dim": 128,
"max_position_embeddings": 262144,
"rope_base": 1000000.0,
"qk_norm": true
}
{
"model_config": {
"vocab_size": 151936,
"emb_dim": 2048,
"n_heads": 32,
"n_layers": 8,
"n_kv_groups": 8,
"num_experts": 16,
"num_experts_per_tok": 8,
"moe_hidden_dim": 768,
"head_dim": 128,
"max_position_embeddings": 262144,
"rope_base": 1000000.0,
"qk_norm": true
},
"learning_rate": 5e-06,
"batch_size": 4,
"context_length": 1024,
"warmup_ratio": 0.1,
"warmup_steps": null,
"weight_decay": 0.1,
"gradient_clip": 1.0,
"gradient_accumulation_steps": 1,
"scheduler_type": "cosine",
"wsd_decay_ratio": 0.1,
"max_steps": 2000,
"eval_steps": 500,
"eval_batches": 20,
"log_steps": 100,
"early_stopping": true,
"early_stopping_patience": 500,
"early_stopping_min_delta": 0.01,
"early_stopping_min_steps": 200,
"track_expert_balance": true,
"expert_balance_log_steps": 100,
"use_wandb": true,
"wandb_project": "moe-large-search",
"wandb_entity": null,
"wandb_tags": [
"large-moe-1.3b_lr5e-06",
"architecture-search"
],
"train_data_path": null,
"val_data_path": null,
"output_dir": null,
"experiment_name": "large-moe-1.3b_lr5e-06",
"device": "cuda",
"dtype": "bfloat16",
"gradient_checkpointing": true,
"architecture_name": "large-moe-1.3b_lr5e-06",
"performance_estimate": {
"total_params": 1083213824,
"active_params": 781223936,
"scale": "large",
"platform": "Datacenter GPU (A100 80GB)",
"throughput_tps": 1011.4705045314754,
"ttft_ms": 16.24820736,
"memory_gb": 2.0779457092285156,
"meets_throughput_target": true,
"meets_ttft_target": true,
"meets_memory_target": true,
"gpu_a100_tps": 1011.4705045314754,
"gpu_h100_tps": 1820.6469081566559,
"gpu_4090_tps": 404.58820181259017,
"mobile_q8_tps": null
}
}