---
tags:
- moe
- mobile
- architecture-search
- qwen3
license: apache-2.0
---

# xlarge_450m_16exp

Mobile-optimized MoE model configuration from architecture search.

## Metrics

- Best Training Loss: 2.7313
- Best Eval Loss: 5.4750
- Total Parameters: 415.9M
- Active Parameters: 147.5M
- Steps Completed: 1875

## Usage

```python
# Load the model
from safetensors.torch import load_file
from architecture.model import Qwen3Model  # Your custom model class

# Load config
import json
with open("config.json") as f:
    config = json.load(f)

# Initialize model
model = Qwen3Model(config)

# Load weights
state_dict = load_file("model.safetensors")
model.load_state_dict(state_dict)

# Load tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("kshitijthakkar/moe-415m-147m-16x2-12L-xlarge-450m-16exp")
```

## Model Configuration

```json
{
  "vocab_size": 151936,
  "emb_dim": 640,
  "n_heads": 10,
  "n_layers": 12,
  "n_kv_groups": 2,
  "num_experts": 16,
  "num_experts_per_tok": 2,
  "moe_hidden_dim": 832,
  "head_dim": 64,
  "max_position_embeddings": 4096,
  "rope_base": 1000000.0,
  "qk_norm": true
}
```

## Training Configuration

```json
{
  "model_config": {
    "vocab_size": 151936,
    "emb_dim": 640,
    "n_heads": 10,
    "n_layers": 12,
    "n_kv_groups": 2,
    "num_experts": 16,
    "num_experts_per_tok": 2,
    "moe_hidden_dim": 832,
    "head_dim": 64,
    "max_position_embeddings": 4096,
    "rope_base": 1000000.0,
    "qk_norm": true
  },
  "learning_rate": 0.0001,
  "batch_size": 4,
  "context_length": 1024,
  "warmup_ratio": 0.1,
  "warmup_steps": null,
  "weight_decay": 0.1,
  "gradient_clip": 1.0,
  "gradient_accumulation_steps": 1,
  "scheduler_type": "cosine",
  "wsd_decay_ratio": 0.1,
  "max_steps": 2000,
  "eval_steps": 500,
  "eval_batches": 20,
  "log_steps": 100,
  "early_stopping": true,
  "early_stopping_patience": 500,
  "early_stopping_min_delta": 0.01,
  "early_stopping_min_steps": 200,
  "track_expert_balance": true,
  "expert_balance_log_steps": 100,
  "use_wandb": true,
  "wandb_project": "moe-architecture-search",
  "wandb_entity": null,
  "wandb_tags": [
    "xlarge_450m_16exp",
    "architecture-search"
  ],
  "train_data_path": null,
  "val_data_path": null,
  "output_dir": null,
  "experiment_name": "xlarge_450m_16exp",
  "device": "cuda",
  "dtype": "bfloat16",
  "gradient_checkpointing": true,
  "architecture_name": "xlarge_450m_16exp",
  "mobile_estimate": {
    "tok_per_sec_fp16": 32.99718991005406,
    "tok_per_sec_q8": 54.995316516756766,
    "tok_per_sec_q4": 76.99344312345947,
    "ttft_ms_fp16": 151.23013818181818,
    "ttft_ms_q8": 100.82009212121213,
    "ttft_ms_q4": 80.65607369696971,
    "memory_mb_fp16": 852.718017578125,
    "memory_mb_q8": 491.0206909179688,
    "memory_mb_q4": 290.3411865234375,
    "total_params": 415882880,
    "active_params": 147512960,
    "meets_ttft_target": false,
    "meets_throughput_target": true,
    "meets_memory_target": true
  }
}
```