# Eve-2-MoE-IT-272M / configuration_eve.py
"""
Eve-2-MoE Configuration
========================
HuggingFace-compatible configuration for the Eve-2-MoE architecture.
Usage:
from transformers import AutoConfig
config = AutoConfig.from_pretrained("anthonym21/Eve-2-MoE-272M", trust_remote_code=True)
"""
from transformers import PretrainedConfig


class EveConfig(PretrainedConfig):
    """Configuration for the Eve-2-MoE model.

    This is a DeepSeek-V3-style Mixture-of-Experts architecture with a shared
    expert, top-k routed experts, RoPE positional encoding, and SwiGLU
    activations.

    Args:
        vocab_size: Vocabulary size (padded for efficiency). Default: 50304.
        n_layer: Number of transformer blocks. Default: 12.
        n_embd: Hidden dimension / embedding size. Default: 512.
        n_head: Number of attention heads. Default: 8.
        head_dim: Dimension per attention head. Default: 64.
        block_size: Maximum sequence length (context window). Default: 2048.
        num_experts: Number of routed MoE experts. Default: 8.
        top_k: Number of experts activated per token. Default: 2.
        expert_intermediate_size: FFN hidden dim for each routed expert (SwiGLU). Default: 1408.
        shared_expert_intermediate_size: FFN hidden dim for the shared expert. Default: 1408.
        router_aux_loss_coef: Weight of the load-balancing auxiliary loss. Default: 0.01.
        rope_theta: Base frequency for RoPE. Default: 10000.0.
        use_checkpointing: Enable gradient checkpointing to save VRAM. Default: False.
    """

    model_type = "eve-moe"

    def __init__(
        self,
        vocab_size: int = 50304,
        n_layer: int = 12,
        n_embd: int = 512,
        n_head: int = 8,
        head_dim: int = 64,
        block_size: int = 2048,
        num_experts: int = 8,
        top_k: int = 2,
        expert_intermediate_size: int = 1408,
        shared_expert_intermediate_size: int = 1408,
        router_aux_loss_coef: float = 0.01,
        rope_theta: float = 10000.0,
        use_checkpointing: bool = False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_embd = n_embd
        self.n_head = n_head
        self.head_dim = head_dim
        self.block_size = block_size
        self.num_experts = num_experts
        self.top_k = top_k
        self.expert_intermediate_size = expert_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.router_aux_loss_coef = router_aux_loss_coef
        self.rope_theta = rope_theta
        self.use_checkpointing = use_checkpointing

        # Default tie_word_embeddings to True (Eve-2 ties embedding + lm_head)
        kwargs.setdefault("tie_word_embeddings", True)
        super().__init__(**kwargs)
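

# ---------------------------------------------------------------------------
# Minimal usage sketch: instantiate the config directly with a few
# illustrative overrides (values chosen for demonstration, not the released
# model's settings) and optionally register the custom model_type with
# AutoConfig so it resolves locally.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from transformers import AutoConfig

    # Direct instantiation with illustrative overrides.
    config = EveConfig(num_experts=16, top_k=2, block_size=4096)
    print(config.model_type, config.num_experts, config.top_k, config.block_size)

    # Standard transformers API for making a custom model_type resolvable
    # through AutoConfig.
    AutoConfig.register("eve-moe", EveConfig)
    print(type(AutoConfig.for_model("eve-moe")).__name__)  # -> EveConfig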