anthonym21 committed on
Commit
07f331b
·
verified ·
1 Parent(s): 0dfcaa3

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. config.json +1 -0
  2. make_nb.py +151 -0
  3. modeling_eve.py +24 -2
  4. push_to_hub.py +17 -0
config.json CHANGED
@@ -20,6 +20,7 @@
20
  "router_aux_loss_coef": 0.01,
21
  "shared_expert_intermediate_size": 1408,
22
  "top_k": 2,
 
23
  "transformers_version": "5.1.0",
24
  "use_cache": false,
25
  "use_checkpointing": false,
 
20
  "router_aux_loss_coef": 0.01,
21
  "shared_expert_intermediate_size": 1408,
22
  "top_k": 2,
23
+ "torch_dtype": "bfloat16",
24
  "transformers_version": "5.1.0",
25
  "use_cache": false,
26
  "use_checkpointing": false,
make_nb.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+
4
+ def nb_cell_markdown(text: str):
5
+ return {"cell_type": "markdown", "metadata": {}, "source": [line + "\n" for line in text.splitlines()]}
6
+
7
+ def nb_cell_code(code: str):
8
+ return {"cell_type": "code", "metadata": {}, "execution_count": None, "outputs": [], "source": [line + "\n" for line in code.splitlines()]}
9
+
10
+ nb = {
11
+ "cells": [],
12
+ "metadata": {
13
+ "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
14
+ "language_info": {"name": "python", "version": "3.11"},
15
+ },
16
+ "nbformat": 4,
17
+ "nbformat_minor": 5,
18
+ }
19
+
20
+ nb["cells"].append(nb_cell_markdown(
21
+ """# Eve Swarm Trainer (Fixed)
22
+
23
+ Plain-text SFT (no chat templates) for training LoRA adapters on top of `anthonym21/Eve-2-MoE-IT-272M`.
24
+
25
+ Key rule: dataset must end up with a single `text` column so TRL uses `dataset_text_field="text"` and never calls `apply_chat_template()`.
26
+ """
27
+ ))
28
+
29
+ nb["cells"].append(nb_cell_code(
30
+ r"""# 1) Setup
31
+ # Avoid reinstalling torch on GPU images. Install only what you need.
32
+ !python -m pip install -q --upgrade "peft" "trl" "datasets" "huggingface_hub"
33
+
34
+ import torch
35
+ from datasets import load_dataset
36
+ from transformers import AutoModelForCausalLM, AutoTokenizer
37
+ from peft import LoraConfig, TaskType
38
+ from trl import SFTTrainer, SFTConfig
39
+ from huggingface_hub import notebook_login
40
+
41
+ notebook_login()
42
+ print("torch:", torch.__version__, "cuda:", torch.cuda.is_available())
43
+ """
44
+ ))
45
+
46
+ nb["cells"].append(nb_cell_code(
47
+ r"""# 2) Config
48
+ BASE_MODEL_ID = "anthonym21/Eve-2-MoE-IT-272M"
49
+ HF_USERNAME = "anthonym21"
50
+
51
+ SFT_ARGS = SFTConfig(
52
+ output_dir="./outputs",
53
+ per_device_train_batch_size=64,
54
+ gradient_accumulation_steps=1,
55
+ warmup_steps=50,
56
+ max_steps=500,
57
+ learning_rate=2e-4,
58
+ bf16=True,
59
+ logging_steps=10,
60
+ save_strategy="no",
61
+ report_to="none",
62
+ dataset_text_field="text", # forces plain-text path
63
+ max_seq_length=512,
64
+ packing=False,
65
+ )
66
+
67
+ LORA_CONFIG = LoraConfig(
68
+ r=16,
69
+ lora_alpha=32,
70
+ lora_dropout=0.05,
71
+ target_modules=["c_attn", "c_proj", "w1", "w2", "router"],
72
+ bias="none",
73
+ task_type=TaskType.CAUSAL_LM,
74
+ )
75
+
76
+ print("BASE_MODEL_ID:", BASE_MODEL_ID)
77
+ """
78
+ ))
79
+
80
+ nb["cells"].append(nb_cell_code(
81
+ r"""# 3) Load tokenizer once
82
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
83
+ tokenizer.pad_token = tokenizer.eos_token
84
+ print("pad_token:", tokenizer.pad_token, "eos_token:", tokenizer.eos_token)
85
+ """
86
+ ))
87
+
88
+ nb["cells"].append(nb_cell_code(
89
+ r"""# 4) Dataset formatter: Arun63/sharegpt-structured-output-json
90
+ # Example schema:
91
+ # sample["conversations"] = [{"from":"system","value":...},{"from":"human","value":...},{"from":"gpt","value":...}]
92
+
93
+ def format_arun63_json(sample):
94
+ convs = sample.get("conversations") or []
95
+ human = next((c.get("value", "") for c in convs if c.get("from") == "human"), "")
96
+ gpt = next((c.get("value", "") for c in convs if c.get("from") == "gpt"), "")
97
+ if not human or not gpt:
98
+ return {"text": None}
99
+
100
+ # Your model format: User/Assistant (NOT ChatML)
101
+ text = f"User: {human}\nAssistant: {gpt}{tokenizer.eos_token}"
102
+ return {"text": text}
103
+
104
+ def load_json_writer_ds(split="train"):
105
+ ds = load_dataset("Arun63/sharegpt-structured-output-json", split=split)
106
+
107
+ # remove all columns, keep only text
108
+ ds = ds.map(format_arun63_json, remove_columns=ds.column_names)
109
+
110
+ # filter empties
111
+ ds = ds.filter(lambda x: x["text"] is not None and len(x["text"]) > 0)
112
+ return ds
113
+
114
+ ds = load_json_writer_ds()
115
+ print("Training set size:", len(ds))
116
+ print("Sample:\n", ds[0]["text"][:400])
117
+ """
118
+ ))
119
+
120
+ nb["cells"].append(nb_cell_code(
121
+ r"""# 5) Train: json_writer -> push as Eve-JSON-272M
122
+ model = AutoModelForCausalLM.from_pretrained(
123
+ BASE_MODEL_ID,
124
+ trust_remote_code=True,
125
+ torch_dtype=torch.bfloat16,
126
+ device_map="auto",
127
+ )
128
+
129
+ # sanity: ensure target modules exist
130
+ targets = {"c_attn","c_proj","w1","w2","router"}
131
+ matches = [n for n,_ in model.named_modules() if n.split(".")[-1] in targets]
132
+ print("[Sanity Check] Found", len(matches), "target modules.")
133
+
134
+ trainer = SFTTrainer(
135
+ model=model,
136
+ args=SFT_ARGS,
137
+ train_dataset=ds,
138
+ peft_config=LORA_CONFIG,
139
+ )
140
+
141
+ trainer.train()
142
+
143
+ repo_id = f"{HF_USERNAME}/Eve-JSON-272M"
144
+ trainer.model.push_to_hub(repo_id)
145
+ tokenizer.push_to_hub(repo_id)
146
+ print("Pushed:", repo_id)
147
+ """
148
+ ))
149
+
150
+ Path("swarm_trainer_fixed.ipynb").write_text(json.dumps(nb, indent=2), encoding="utf-8")
151
+ print("Wrote swarm_trainer_fixed.ipynb")
modeling_eve.py CHANGED
@@ -94,6 +94,8 @@ class SharedMoE(nn.Module):
94
 
95
  def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
96
  B, T, C = x.shape
 
 
97
 
98
  shared_out = self.shared_expert(x)
99
 
@@ -194,6 +196,18 @@ class DeepSeekMoE(PreTrainedModel, GenerationMixin):
194
  # Initialize weights and apply final processing
195
  self.post_init()
196
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  # --- PEFT / HF compatibility hooks ---
198
  def get_input_embeddings(self) -> nn.Module:
199
  return self.transformer.wte
@@ -212,6 +226,7 @@ class DeepSeekMoE(PreTrainedModel, GenerationMixin):
212
  self,
213
  input_ids: Optional[torch.LongTensor] = None,
214
  idx: Optional[torch.LongTensor] = None,
 
215
  labels: Optional[torch.LongTensor] = None,
216
  targets: Optional[torch.LongTensor] = None,
217
  **kwargs: Any,
@@ -252,11 +267,13 @@ class DeepSeekMoE(PreTrainedModel, GenerationMixin):
252
  shift_labels = targets[:, 1:].contiguous()
253
 
254
  loss = F.cross_entropy(
255
- shift_logits.float().view(-1, shift_logits.size(-1)),
256
  shift_labels.view(-1),
257
  ignore_index=-100,
258
  )
259
 
 
 
260
  if total_aux_loss is not None and self.config.router_aux_loss_coef:
261
  loss = loss + (self.config.router_aux_loss_coef * total_aux_loss)
262
 
@@ -269,4 +286,9 @@ class DeepSeekMoE(PreTrainedModel, GenerationMixin):
269
  # --- Generation ---
270
  def prepare_inputs_for_generation(self, input_ids: torch.LongTensor, **kwargs: Any) -> Dict[str, Any]:
271
  # No kv-cache support; always feed full sequence.
272
- return {"input_ids": input_ids}
 
 
 
 
 
 
94
 
95
  def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
96
  B, T, C = x.shape
97
+ if self.top_k < 1 or self.top_k > self.config.num_experts:
98
+ raise ValueError(f"Invalid MoE top_k={self.top_k}; must be in [1, {self.config.num_experts}]")
99
 
100
  shared_out = self.shared_expert(x)
101
 
 
196
  # Initialize weights and apply final processing
197
  self.post_init()
198
 
199
+ # Harden generation_config to avoid invalid configs blocking save_pretrained()
200
+ if hasattr(self, "generation_config") and self.generation_config is not None:
201
+ g = self.generation_config
202
+ # If not sampling, sampling-only knobs must be neutral.
203
+ if not getattr(g, "do_sample", False):
204
+ if getattr(g, "top_k", 0):
205
+ g.top_k = 0
206
+ if getattr(g, "top_p", 1.0) != 1.0:
207
+ g.top_p = 1.0
208
+ if getattr(g, "temperature", 1.0) != 1.0:
209
+ g.temperature = 1.0
210
+
211
  # --- PEFT / HF compatibility hooks ---
212
  def get_input_embeddings(self) -> nn.Module:
213
  return self.transformer.wte
 
226
  self,
227
  input_ids: Optional[torch.LongTensor] = None,
228
  idx: Optional[torch.LongTensor] = None,
229
+ attention_mask: Optional[torch.Tensor] = None, # accept + ignore
230
  labels: Optional[torch.LongTensor] = None,
231
  targets: Optional[torch.LongTensor] = None,
232
  **kwargs: Any,
 
267
  shift_labels = targets[:, 1:].contiguous()
268
 
269
  loss = F.cross_entropy(
270
+ shift_logits.view(-1, shift_logits.size(-1)).to(torch.float32),
271
  shift_labels.view(-1),
272
  ignore_index=-100,
273
  )
274
 
275
+
276
+
277
  if total_aux_loss is not None and self.config.router_aux_loss_coef:
278
  loss = loss + (self.config.router_aux_loss_coef * total_aux_loss)
279
 
 
286
  # --- Generation ---
287
  def prepare_inputs_for_generation(self, input_ids: torch.LongTensor, **kwargs: Any) -> Dict[str, Any]:
288
  # No kv-cache support; always feed full sequence.
289
+ out = {"input_ids": input_ids}
290
+ # HF generate() may pass attention_mask; accept it even if we don't apply it.
291
+ if "attention_mask" in kwargs and kwargs["attention_mask"] is not None:
292
+ out["attention_mask"] = kwargs["attention_mask"]
293
+ return out
294
+
push_to_hub.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import HfApi
2
+
3
+ api = HfApi()
4
+
5
+ repo_id = "anthonym21/Eve-2-MoE-IT-272M"
6
+ folder_path = "."
7
+
8
+ print(f"Uploading {folder_path} to {repo_id}...")
9
+
10
+ api.upload_folder(
11
+ folder_path=folder_path,
12
+ repo_id=repo_id,
13
+ repo_type="model",
14
+ ignore_patterns=[".git", ".cache", "__pycache__", "*.ipynb", "*.lock", ".DS_Store"],
15
+ )
16
+
17
+ print("Upload complete! You can now reload the model in your notebook.")