---
tags:
- moe
- mobile
- architecture-search
- qwen3
license: apache-2.0
---

# full_attention_no_gqa_bs4_ctx512

Mobile-optimized MoE model configuration from architecture search.

## Metrics

- Best Training Loss: 1.8680
- Best Eval Loss: 5.7672
- Total Parameters: 255.6M
- Active Parameters: 114.1M
- Steps Completed: 2000

## Usage

```python
# Load the model
from safetensors.torch import load_file
from architecture.model import Qwen3Model  # Your custom model class

# Load config
import json
with open("config.json") as f:
    config = json.load(f)

# Initialize model
model = Qwen3Model(config)

# Load weights
state_dict = load_file("model.safetensors")
model.load_state_dict(state_dict)

# Load tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("kshitijthakkar/moe-255m-114m-12x2-12L-full-attention-no-gqa-bs4-ctx512")
```

## Model Configuration

```json
{
  "vocab_size": 151936,
  "emb_dim": 512,
  "n_heads": 8,
  "n_layers": 12,
  "n_kv_groups": 2,
  "num_experts": 12,
  "num_experts_per_tok": 2,
  "moe_hidden_dim": 768,
  "head_dim": 64,
  "max_position_embeddings": 4096,
  "rope_base": 1000000.0,
  "qk_norm": true
}
```

## Training Configuration

```json
{
  "model_config": {
    "vocab_size": 151936,
    "emb_dim": 512,
    "n_heads": 8,
    "n_layers": 12,
    "n_kv_groups": 2,
    "num_experts": 12,
    "num_experts_per_tok": 2,
    "moe_hidden_dim": 768,
    "head_dim": 64,
    "max_position_embeddings": 4096,
    "rope_base": 1000000.0,
    "qk_norm": true
  },
  "learning_rate": 0.0001,
  "batch_size": 4,
  "context_length": 512,
  "warmup_ratio": 0.1,
  "warmup_steps": null,
  "weight_decay": 0.1,
  "gradient_clip": 1.0,
  "gradient_accumulation_steps": 1,
  "scheduler_type": "cosine",
  "wsd_decay_ratio": 0.1,
  "max_steps": 2000,
  "eval_steps": 500,
  "eval_batches": 20,
  "log_steps": 100,
  "early_stopping": true,
  "early_stopping_patience": 500,
  "early_stopping_min_delta": 0.01,
  "early_stopping_min_steps": 200,
  "track_expert_balance": true,
  "expert_balance_log_steps": 100,
  "use_wandb": true,
  "wandb_project": "moe-architecture-search",
  "wandb_entity": null,
  "wandb_tags": [
    "full_attention_no_gqa_bs4_ctx512",
    "architecture-search"
  ],
  "train_data_path": null,
  "val_data_path": null,
  "output_dir": null,
  "experiment_name": "full_attention_no_gqa_bs4_ctx512",
  "device": "cuda",
  "dtype": "bfloat16",
  "gradient_checkpointing": true,
  "architecture_name": "full_attention_no_gqa_bs4_ctx512",
  "mobile_estimate": {
    "tok_per_sec_fp16": 41.061977007248444,
    "tok_per_sec_q8": 68.43662834541408,
    "tok_per_sec_q4": 95.8112796835797,
    "ttft_ms_fp16": 92.9495970909091,
    "ttft_ms_q8": 61.96639806060606,
    "ttft_ms_q4": 49.57311844848485,
    "memory_mb_fp16": 545.8759765625,
    "memory_mb_q8": 322.31499023437505,
    "memory_mb_q4": 198.34599609375,
    "total_params": 255611392,
    "active_params": 114053632,
    "meets_ttft_target": true,
    "meets_throughput_target": true,
    "meets_memory_target": true
  }
}
```