kshitijthakkar's picture
Upload README.md with huggingface_hub
5b96c0c verified
metadata
tags:
  - moe
  - mobile
  - architecture-search
  - qwen3
license: apache-2.0

xlarge_450m_16exp

Mobile-optimized MoE model configuration from architecture search.

Metrics

  • Best Training Loss: 2.7313
  • Best Eval Loss: 5.4750
  • Total Parameters: 415.9M
  • Active Parameters: 147.5M
  • Steps Completed: 1875

Usage

# Load the model
from safetensors.torch import load_file
from architecture.model import Qwen3Model  # Your custom model class

# Read the architecture configuration from disk
import json
with open("config.json") as f:
    cfg = json.load(f)

# Build the model skeleton from the configuration
model = Qwen3Model(cfg)

# Restore the trained weights into the model
weights = load_file("model.safetensors")
model.load_state_dict(weights)

# Fetch the matching tokenizer from the Hub
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("kshitijthakkar/moe-415m-147m-16x2-12L-xlarge-450m-16exp")

Model Configuration

{
  "vocab_size": 151936,
  "emb_dim": 640,
  "n_heads": 10,
  "n_layers": 12,
  "n_kv_groups": 2,
  "num_experts": 16,
  "num_experts_per_tok": 2,
  "moe_hidden_dim": 832,
  "head_dim": 64,
  "max_position_embeddings": 4096,
  "rope_base": 1000000.0,
  "qk_norm": true
}

Training Configuration

{
  "model_config": {
    "vocab_size": 151936,
    "emb_dim": 640,
    "n_heads": 10,
    "n_layers": 12,
    "n_kv_groups": 2,
    "num_experts": 16,
    "num_experts_per_tok": 2,
    "moe_hidden_dim": 832,
    "head_dim": 64,
    "max_position_embeddings": 4096,
    "rope_base": 1000000.0,
    "qk_norm": true
  },
  "learning_rate": 0.0001,
  "batch_size": 4,
  "context_length": 1024,
  "warmup_ratio": 0.1,
  "warmup_steps": null,
  "weight_decay": 0.1,
  "gradient_clip": 1.0,
  "gradient_accumulation_steps": 1,
  "scheduler_type": "cosine",
  "wsd_decay_ratio": 0.1,
  "max_steps": 2000,
  "eval_steps": 500,
  "eval_batches": 20,
  "log_steps": 100,
  "early_stopping": true,
  "early_stopping_patience": 500,
  "early_stopping_min_delta": 0.01,
  "early_stopping_min_steps": 200,
  "track_expert_balance": true,
  "expert_balance_log_steps": 100,
  "use_wandb": true,
  "wandb_project": "moe-architecture-search",
  "wandb_entity": null,
  "wandb_tags": [
    "xlarge_450m_16exp",
    "architecture-search"
  ],
  "train_data_path": null,
  "val_data_path": null,
  "output_dir": null,
  "experiment_name": "xlarge_450m_16exp",
  "device": "cuda",
  "dtype": "bfloat16",
  "gradient_checkpointing": true,
  "architecture_name": "xlarge_450m_16exp",
  "mobile_estimate": {
    "tok_per_sec_fp16": 32.99718991005406,
    "tok_per_sec_q8": 54.995316516756766,
    "tok_per_sec_q4": 76.99344312345947,
    "ttft_ms_fp16": 151.23013818181818,
    "ttft_ms_q8": 100.82009212121213,
    "ttft_ms_q4": 80.65607369696971,
    "memory_mb_fp16": 852.718017578125,
    "memory_mb_q8": 491.0206909179688,
    "memory_mb_q4": 290.3411865234375,
    "total_params": 415882880,
    "active_params": 147512960,
    "meets_ttft_target": false,
    "meets_throughput_target": true,
    "meets_memory_target": true
  }
}