---
tags:
- moe
- mobile
- architecture-search
- qwen3
license: apache-2.0
---

# large-moe-1.3b_lr5e-05

Mobile-optimized MoE model configuration from architecture search.

## Metrics

- Best Training Loss: 2.3534
- Best Eval Loss: 5.0875
- Total Parameters: 1083.2M
- Active Parameters: 781.2M
- Steps Completed: 2000

## Usage

```python
# Load the model
from safetensors.torch import load_file
from architecture.model import Qwen3Model  # Your custom model class

# Load config
import json
with open("config.json") as f:
    config = json.load(f)

# Initialize model
model = Qwen3Model(config)

# Load weights
state_dict = load_file("model.safetensors")
model.load_state_dict(state_dict)

# Load tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("kshitijthakkar/moe-1083m-781m-16x8-8L-large-moe-1.3b-lr5e-05")
```

## Model Configuration

```json
{
  "vocab_size": 151936,
  "emb_dim": 2048,
  "n_heads": 32,
  "n_layers": 8,
  "n_kv_groups": 8,
  "num_experts": 16,
  "num_experts_per_tok": 8,
  "moe_hidden_dim": 768,
  "head_dim": 128,
  "max_position_embeddings": 262144,
  "rope_base": 1000000.0,
  "qk_norm": true
}
```

## Training Configuration

```json
{
  "model_config": {
    "vocab_size": 151936,
    "emb_dim": 2048,
    "n_heads": 32,
    "n_layers": 8,
    "n_kv_groups": 8,
    "num_experts": 16,
    "num_experts_per_tok": 8,
    "moe_hidden_dim": 768,
    "head_dim": 128,
    "max_position_embeddings": 262144,
    "rope_base": 1000000.0,
    "qk_norm": true
  },
  "learning_rate": 5e-05,
  "batch_size": 4,
  "context_length": 1024,
  "warmup_ratio": 0.1,
  "warmup_steps": null,
  "weight_decay": 0.1,
  "gradient_clip": 1.0,
  "gradient_accumulation_steps": 1,
  "scheduler_type": "cosine",
  "wsd_decay_ratio": 0.1,
  "max_steps": 2000,
  "eval_steps": 500,
  "eval_batches": 20,
  "log_steps": 100,
  "early_stopping": true,
  "early_stopping_patience": 500,
  "early_stopping_min_delta": 0.01,
  "early_stopping_min_steps": 200,
  "track_expert_balance": true,
  "expert_balance_log_steps": 100,
  "use_wandb": true,
  "wandb_project": "moe-large-search",
  "wandb_entity": null,
  "wandb_tags": [
    "large-moe-1.3b_lr5e-05",
    "architecture-search"
  ],
  "train_data_path": null,
  "val_data_path": null,
  "output_dir": null,
  "experiment_name": "large-moe-1.3b_lr5e-05",
  "device": "cuda",
  "dtype": "bfloat16",
  "gradient_checkpointing": true,
  "architecture_name": "large-moe-1.3b_lr5e-05",
  "performance_estimate": {
    "total_params": 1083213824,
    "active_params": 781223936,
    "scale": "large",
    "platform": "Datacenter GPU (A100 80GB)",
    "throughput_tps": 1011.4705045314754,
    "ttft_ms": 16.24820736,
    "memory_gb": 2.0779457092285156,
    "meets_throughput_target": true,
    "meets_ttft_target": true,
    "meets_memory_target": true,
    "gpu_a100_tps": 1011.4705045314754,
    "gpu_h100_tps": 1820.6469081566559,
    "gpu_4090_tps": 404.58820181259017,
    "mobile_q8_tps": null
  }
}
```