kshitijthakkar's picture
Upload README.md with huggingface_hub
fad50d3 verified
metadata
tags:
  - moe
  - mobile
  - architecture-search
  - qwen3
license: apache-2.0

large-moe-1.3b_lr5e-05

A mobile-optimized Mixture-of-Experts (MoE) model configuration produced by an automated architecture search.

Metrics

  • Best Training Loss: 2.3534
  • Best Eval Loss: 5.0875
  • Total Parameters: 1083.2M
  • Active Parameters: 781.2M
  • Steps Completed: 2000

Usage

# Load the model
from safetensors.torch import load_file
from architecture.model import Qwen3Model  # Your custom model class

# Load config
import json
with open("config.json") as f:
    config = json.load(f)

# Initialize model
model = Qwen3Model(config)

# Load weights
state_dict = load_file("model.safetensors")
model.load_state_dict(state_dict)

# Load tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("kshitijthakkar/moe-1083m-781m-16x8-8L-large-moe-1.3b-lr5e-05")

Model Configuration

{
  "vocab_size": 151936,
  "emb_dim": 2048,
  "n_heads": 32,
  "n_layers": 8,
  "n_kv_groups": 8,
  "num_experts": 16,
  "num_experts_per_tok": 8,
  "moe_hidden_dim": 768,
  "head_dim": 128,
  "max_position_embeddings": 262144,
  "rope_base": 1000000.0,
  "qk_norm": true
}

Training Configuration

{
  "model_config": {
    "vocab_size": 151936,
    "emb_dim": 2048,
    "n_heads": 32,
    "n_layers": 8,
    "n_kv_groups": 8,
    "num_experts": 16,
    "num_experts_per_tok": 8,
    "moe_hidden_dim": 768,
    "head_dim": 128,
    "max_position_embeddings": 262144,
    "rope_base": 1000000.0,
    "qk_norm": true
  },
  "learning_rate": 5e-05,
  "batch_size": 4,
  "context_length": 1024,
  "warmup_ratio": 0.1,
  "warmup_steps": null,
  "weight_decay": 0.1,
  "gradient_clip": 1.0,
  "gradient_accumulation_steps": 1,
  "scheduler_type": "cosine",
  "wsd_decay_ratio": 0.1,
  "max_steps": 2000,
  "eval_steps": 500,
  "eval_batches": 20,
  "log_steps": 100,
  "early_stopping": true,
  "early_stopping_patience": 500,
  "early_stopping_min_delta": 0.01,
  "early_stopping_min_steps": 200,
  "track_expert_balance": true,
  "expert_balance_log_steps": 100,
  "use_wandb": true,
  "wandb_project": "moe-large-search",
  "wandb_entity": null,
  "wandb_tags": [
    "large-moe-1.3b_lr5e-05",
    "architecture-search"
  ],
  "train_data_path": null,
  "val_data_path": null,
  "output_dir": null,
  "experiment_name": "large-moe-1.3b_lr5e-05",
  "device": "cuda",
  "dtype": "bfloat16",
  "gradient_checkpointing": true,
  "architecture_name": "large-moe-1.3b_lr5e-05",
  "performance_estimate": {
    "total_params": 1083213824,
    "active_params": 781223936,
    "scale": "large",
    "platform": "Datacenter GPU (A100 80GB)",
    "throughput_tps": 1011.4705045314754,
    "ttft_ms": 16.24820736,
    "memory_gb": 2.0779457092285156,
    "meets_throughput_target": true,
    "meets_ttft_target": true,
    "meets_memory_target": true,
    "gpu_a100_tps": 1011.4705045314754,
    "gpu_h100_tps": 1820.6469081566559,
    "gpu_4090_tps": 404.58820181259017,
    "mobile_q8_tps": null
  }
}