| """ |
| Eve-2-MoE Configuration |
| ======================== |
| HuggingFace-compatible configuration for the Eve-2-MoE architecture. |
| |
| Usage: |
| from transformers import AutoConfig |
| config = AutoConfig.from_pretrained("anthonym21/Eve-2-MoE-272M", trust_remote_code=True) |
| """ |
|
|
| from transformers import PretrainedConfig |
|
|
|
|
class EveConfig(PretrainedConfig):
    """Configuration for the Eve-2-MoE model.

    This is a DeepSeek-V3 style Mixture of Experts architecture with a shared
    expert, top-k routed experts, RoPE positional encoding, and SwiGLU activations.

    Args:
        vocab_size: Vocabulary size (padded for efficiency). Default: 50304.
        n_layer: Number of transformer blocks. Default: 12.
        n_embd: Hidden dimension / embedding size. Default: 512.
        n_head: Number of attention heads. Default: 8.
        head_dim: Dimension per attention head. Default: 64.
        block_size: Maximum sequence length (context window). Default: 2048.
        num_experts: Number of routed MoE experts. Default: 8.
        top_k: Number of experts activated per token. Default: 2.
        expert_intermediate_size: FFN hidden dim for each expert (SwiGLU). Default: 1408.
        shared_expert_intermediate_size: FFN hidden dim for the shared expert. Default: 1408.
        router_aux_loss_coef: Weight of the load-balancing auxiliary loss. Default: 0.01.
        rope_theta: Base frequency for RoPE. Default: 10000.0.
        use_checkpointing: Enable gradient checkpointing to save VRAM. Default: False.
    """

    model_type = "eve-moe"

    def __init__(
        self,
        vocab_size: int = 50304,
        n_layer: int = 12,
        n_embd: int = 512,
        n_head: int = 8,
        head_dim: int = 64,
        block_size: int = 2048,
        num_experts: int = 8,
        top_k: int = 2,
        expert_intermediate_size: int = 1408,
        shared_expert_intermediate_size: int = 1408,
        router_aux_loss_coef: float = 0.01,
        rope_theta: float = 10000.0,
        use_checkpointing: bool = False,
        **kwargs,
    ):
        # Core transformer dimensions.
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_embd = n_embd
        self.n_head = n_head
        self.head_dim = head_dim
        self.block_size = block_size

        # Mixture-of-Experts routing.
        self.num_experts = num_experts
        self.top_k = top_k
        self.expert_intermediate_size = expert_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.router_aux_loss_coef = router_aux_loss_coef

        # RoPE and training options.
        self.rope_theta = rope_theta
        self.use_checkpointing = use_checkpointing

        # Tie input embeddings to the LM head unless the caller overrides it.
        kwargs.setdefault("tie_word_embeddings", True)

        super().__init__(**kwargs)
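

# A minimal usage sketch (an illustration, not part of the model code): build the
# config directly and round-trip it through PretrainedConfig's save_pretrained /
# from_pretrained JSON serialization. The 16-expert variant below is purely
# hypothetical, not a released checkpoint.
if __name__ == "__main__":
    import tempfile

    # Defaults: with these values n_embd (512) equals n_head * head_dim (8 * 64).
    config = EveConfig()
    print(config.model_type, config.n_embd, config.n_head * config.head_dim)

    # Hypothetical wider MoE variant: 16 routed experts, 4 active per token.
    wide = EveConfig(num_experts=16, top_k=4)

    with tempfile.TemporaryDirectory() as tmp:
        wide.save_pretrained(tmp)  # writes config.json
        reloaded = EveConfig.from_pretrained(tmp)
        assert reloaded.num_experts == 16 and reloaded.top_k == 4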