Upload folder using huggingface_hub

- config.json +1 -0
- make_nb.py +151 -0
- modeling_eve.py +24 -2
- push_to_hub.py +17 -0
config.json CHANGED
@@ -20,6 +20,7 @@
   "router_aux_loss_coef": 0.01,
   "shared_expert_intermediate_size": 1408,
   "top_k": 2,
+  "torch_dtype": "bfloat16",
   "transformers_version": "5.1.0",
   "use_cache": false,
   "use_checkpointing": false,
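
A quick way to sanity-check the new field (a minimal sketch, not part of this commit; it assumes Hub access and that the repo's remote code resolves with trust_remote_code):

# Hypothetical check: confirm the new "torch_dtype" key round-trips through AutoConfig.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("anthonym21/Eve-2-MoE-IT-272M", trust_remote_code=True)
print(cfg.torch_dtype)  # expected: torch.bfloat16 (or the string "bfloat16", depending on transformers version)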
make_nb.py ADDED
@@ -0,0 +1,151 @@
+import json
+from pathlib import Path
+
+def nb_cell_markdown(text: str):
+    return {"cell_type": "markdown", "metadata": {}, "source": [line + "\n" for line in text.splitlines()]}
+
+def nb_cell_code(code: str):
+    return {"cell_type": "code", "metadata": {}, "execution_count": None, "outputs": [], "source": [line + "\n" for line in code.splitlines()]}
+
+nb = {
+    "cells": [],
+    "metadata": {
+        "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
+        "language_info": {"name": "python", "version": "3.11"},
+    },
+    "nbformat": 4,
+    "nbformat_minor": 5,
+}
+
+nb["cells"].append(nb_cell_markdown(
+    """# Eve Swarm Trainer (Fixed)
+
+Plain-text SFT (no chat templates) for training LoRA adapters on top of `anthonym21/Eve-2-MoE-IT-272M`.
+
+Key rule: the dataset must end up with a single `text` column so TRL uses `dataset_text_field="text"` and never calls `apply_chat_template()`.
+"""
+))
+
+nb["cells"].append(nb_cell_code(
+    r"""# 1) Setup
+# Avoid reinstalling torch on GPU images. Install only what you need.
+!python -m pip install -q --upgrade "peft" "trl" "datasets" "huggingface_hub"
+
+import torch
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import LoraConfig, TaskType
+from trl import SFTTrainer, SFTConfig
+from huggingface_hub import notebook_login
+
+notebook_login()
+print("torch:", torch.__version__, "cuda:", torch.cuda.is_available())
+"""
+))
+
+nb["cells"].append(nb_cell_code(
+    r"""# 2) Config
+BASE_MODEL_ID = "anthonym21/Eve-2-MoE-IT-272M"
+HF_USERNAME = "anthonym21"
+
+SFT_ARGS = SFTConfig(
+    output_dir="./outputs",
+    per_device_train_batch_size=64,
+    gradient_accumulation_steps=1,
+    warmup_steps=50,
+    max_steps=500,
+    learning_rate=2e-4,
+    bf16=True,
+    logging_steps=10,
+    save_strategy="no",
+    report_to="none",
+    dataset_text_field="text",  # forces plain-text path
+    max_seq_length=512,
+    packing=False,
+)
+
+LORA_CONFIG = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    lora_dropout=0.05,
+    target_modules=["c_attn", "c_proj", "w1", "w2", "router"],
+    bias="none",
+    task_type=TaskType.CAUSAL_LM,
+)
+
+print("BASE_MODEL_ID:", BASE_MODEL_ID)
+"""
+))
+
+nb["cells"].append(nb_cell_code(
+    r"""# 3) Load tokenizer once
+tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
+tokenizer.pad_token = tokenizer.eos_token
+print("pad_token:", tokenizer.pad_token, "eos_token:", tokenizer.eos_token)
+"""
+))
+
+nb["cells"].append(nb_cell_code(
+    r"""# 4) Dataset formatter: Arun63/sharegpt-structured-output-json
+# Example schema:
+# sample["conversations"] = [{"from":"system","value":...},{"from":"human","value":...},{"from":"gpt","value":...}]
+
+def format_arun63_json(sample):
+    convs = sample.get("conversations") or []
+    human = next((c.get("value", "") for c in convs if c.get("from") == "human"), "")
+    gpt = next((c.get("value", "") for c in convs if c.get("from") == "gpt"), "")
+    if not human or not gpt:
+        return {"text": None}
+
+    # Your model format: User/Assistant (NOT ChatML)
+    text = f"User: {human}\nAssistant: {gpt}{tokenizer.eos_token}"
+    return {"text": text}
+
+def load_json_writer_ds(split="train"):
+    ds = load_dataset("Arun63/sharegpt-structured-output-json", split=split)
+
+    # remove all columns, keep only text
+    ds = ds.map(format_arun63_json, remove_columns=ds.column_names)
+
+    # filter empties
+    ds = ds.filter(lambda x: x["text"] is not None and len(x["text"]) > 0)
+    return ds
+
+ds = load_json_writer_ds()
+print("Training set size:", len(ds))
+print("Sample:\n", ds[0]["text"][:400])
+"""
+))
+
+nb["cells"].append(nb_cell_code(
+    r"""# 5) Train: json_writer -> push as Eve-JSON-272M
+model = AutoModelForCausalLM.from_pretrained(
+    BASE_MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+
+# sanity: ensure target modules exist
+targets = {"c_attn", "c_proj", "w1", "w2", "router"}
+matches = [n for n, _ in model.named_modules() if n.split(".")[-1] in targets]
+print("[Sanity Check] Found", len(matches), "target modules.")
+
+trainer = SFTTrainer(
+    model=model,
+    args=SFT_ARGS,
+    train_dataset=ds,
+    peft_config=LORA_CONFIG,
+)
+
+trainer.train()
+
+repo_id = f"{HF_USERNAME}/Eve-JSON-272M"
+trainer.model.push_to_hub(repo_id)
+tokenizer.push_to_hub(repo_id)
+print("Pushed:", repo_id)
+"""
+))
+
+Path("swarm_trainer_fixed.ipynb").write_text(json.dumps(nb, indent=2), encoding="utf-8")
+print("Wrote swarm_trainer_fixed.ipynb")
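
For a concrete sense of the "single `text` column" rule above, here is a minimal standalone sketch of the same User/Assistant flattening on a fabricated sample (both the sample and the "</s>" EOS string are invented for illustration; in the notebook the EOS comes from tokenizer.eos_token):

# Standalone illustration of the plain-text formatting used in the notebook.
EOS = "</s>"  # placeholder for tokenizer.eos_token

def flatten(sample):
    convs = sample.get("conversations") or []
    human = next((c.get("value", "") for c in convs if c.get("from") == "human"), "")
    gpt = next((c.get("value", "") for c in convs if c.get("from") == "gpt"), "")
    if not human or not gpt:
        return {"text": None}
    return {"text": f"User: {human}\nAssistant: {gpt}{EOS}"}

sample = {"conversations": [
    {"from": "system", "value": "Return JSON."},
    {"from": "human", "value": "Give me a JSON object with one key."},
    {"from": "gpt", "value": "{\"key\": \"value\"}"},
]}
print(flatten(sample)["text"])
# User: Give me a JSON object with one key.
# Assistant: {"key": "value"}</s>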
modeling_eve.py CHANGED
@@ -94,6 +94,8 @@ class SharedMoE(nn.Module):
 
     def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         B, T, C = x.shape
+        if self.top_k < 1 or self.top_k > self.config.num_experts:
+            raise ValueError(f"Invalid MoE top_k={self.top_k}; must be in [1, {self.config.num_experts}]")
 
         shared_out = self.shared_expert(x)
 
@@ -194,6 +196,18 @@ class DeepSeekMoE(PreTrainedModel, GenerationMixin):
         # Initialize weights and apply final processing
         self.post_init()
 
+        # Harden generation_config to avoid invalid configs blocking save_pretrained()
+        if hasattr(self, "generation_config") and self.generation_config is not None:
+            g = self.generation_config
+            # If not sampling, sampling-only knobs must be neutral.
+            if not getattr(g, "do_sample", False):
+                if getattr(g, "top_k", 0):
+                    g.top_k = 0
+                if getattr(g, "top_p", 1.0) != 1.0:
+                    g.top_p = 1.0
+                if getattr(g, "temperature", 1.0) != 1.0:
+                    g.temperature = 1.0
+
     # --- PEFT / HF compatibility hooks ---
     def get_input_embeddings(self) -> nn.Module:
         return self.transformer.wte
@@ -212,6 +226,7 @@ class DeepSeekMoE(PreTrainedModel, GenerationMixin):
         self,
         input_ids: Optional[torch.LongTensor] = None,
         idx: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,  # accept + ignore
         labels: Optional[torch.LongTensor] = None,
         targets: Optional[torch.LongTensor] = None,
         **kwargs: Any,
@@ -252,11 +267,13 @@ class DeepSeekMoE(PreTrainedModel, GenerationMixin):
         shift_labels = targets[:, 1:].contiguous()
 
         loss = F.cross_entropy(
-            shift_logits.view(-1, shift_logits.size(-1)),
+            shift_logits.view(-1, shift_logits.size(-1)).to(torch.float32),
             shift_labels.view(-1),
             ignore_index=-100,
         )
 
+
+
         if total_aux_loss is not None and self.config.router_aux_loss_coef:
             loss = loss + (self.config.router_aux_loss_coef * total_aux_loss)
 
@@ -269,4 +286,9 @@ class DeepSeekMoE(PreTrainedModel, GenerationMixin):
     # --- Generation ---
     def prepare_inputs_for_generation(self, input_ids: torch.LongTensor, **kwargs: Any) -> Dict[str, Any]:
         # No kv-cache support; always feed full sequence.
-        return {"input_ids": input_ids}
+        out = {"input_ids": input_ids}
+        # HF generate() may pass attention_mask; accept it even if we don't apply it.
+        if "attention_mask" in kwargs and kwargs["attention_mask"] is not None:
+            out["attention_mask"] = kwargs["attention_mask"]
+        return out
+
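
To see the generation_config hardening in isolation, here is a minimal sketch applying the same neutralization to a synthetic GenerationConfig (the starting values are invented; how strictly validate()/save_pretrained() treat these knobs varies by transformers version):

# Standalone sketch of the hardening logic from __init__ above, on a synthetic config.
from transformers import GenerationConfig

g = GenerationConfig(do_sample=False, top_k=40, top_p=0.9, temperature=0.7)

# Same neutralization as the commit: greedy decoding should not carry
# sampling-only knobs, which can otherwise trip validation at save time.
if not getattr(g, "do_sample", False):
    if getattr(g, "top_k", 0):
        g.top_k = 0
    if getattr(g, "top_p", 1.0) != 1.0:
        g.top_p = 1.0
    if getattr(g, "temperature", 1.0) != 1.0:
        g.temperature = 1.0

print(g.top_k, g.top_p, g.temperature)  # -> 0 1.0 1.0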
push_to_hub.py ADDED
@@ -0,0 +1,17 @@
+from huggingface_hub import HfApi
+
+api = HfApi()
+
+repo_id = "anthonym21/Eve-2-MoE-IT-272M"
+folder_path = "."
+
+print(f"Uploading {folder_path} to {repo_id}...")
+
+api.upload_folder(
+    folder_path=folder_path,
+    repo_id=repo_id,
+    repo_type="model",
+    ignore_patterns=[".git", ".cache", "__pycache__", "*.ipynb", "*.lock", ".DS_Store"],
+)
+
+print("Upload complete! You can now reload the model in your notebook.")
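
As the closing print suggests, a quick reload check after the upload (a minimal sketch, not part of this commit; it assumes the custom modeling files were included in the uploaded folder):

# Hypothetical follow-up: reload the pushed repo to confirm config and remote code resolve.
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("anthonym21/Eve-2-MoE-IT-272M", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("anthonym21/Eve-2-MoE-IT-272M", trust_remote_code=True)
print(type(model).__name__)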