c4_owt_owt50k_2026-06-04_15-00-22_019643-owt
Trained with nanochat. Checkpoint at step 17,196.
W&B run: https://wandb.ai/alexksternteam/nca-repro/runs/xsgly3mc
Outcome
| metric | value |
|---|---|
| step | 17196 |
| smooth_train_loss | 2.8100271224975586 |
| min_objective | 0.9576489225100749 |
| flops_used | 2.774395604019708e+19 |
| flops_per_token | 3077308480.0 |
| total_training_time | 19525.979648590088 |
Training config
{
"model_pt": {
"sequence_len": 1024,
"vocab_size": 50304,
"n_layer": 24,
"n_head": 16,
"n_embd": 1024,
"intermediate_size": 4096,
"rope_theta": 10000.0,
"attention_dropout": 0.1,
"weight_tying": false,
"head_bias": false,
"initializer_range": 0.02,
"attn_implementation": "sdpa"
},
"model_ppt": null,
"optim": {
"matrix_lr": 0.0005,
"embedding_lr": 0.0005,
"unembedding_lr": 0.0005,
"weight_decay": 0.0001
},
"train": {
"num_iterations": 1000,
"eval_every_n_steps": 2000,
"save_every_n_steps": null,
"log_every_n_steps": 10,
"eval_at_end": true,
"save_at_end": true,
"grad_clip": 1.0,
"ema_beta": 0.0
},
"eval": {
"eval_steps": 4,
"eval_tokens": 2016000
},
"hardware": {
"device_batch_size": 128,
"grad_accum_steps": 1,
"peak_tflops": 2250.0
},
"wandb": {
"enabled": true,
"notes": "",
"group": "repro_c4_owt",
"tags": [
"hf_llama",
"vocab_64000",
"pt_owt-gpt2bpe-9B",
"seed_0",
"ppt_c4-gpt2bpe-10B",
"ppt_lr_cosine",
"reinit"
],
"entity": null
},
"data_pt": "owt-gpt2bpe-9B",
"data_ppt": "c4-gpt2bpe-10B",
"extra_eval": [
"fineweb-gpt2bpe-20B",
"nca-paper-1024"
],
"train_split": "train",
"pad_vocab": 50304,
"lr_kind": "cosine",
"lr_warmup_ratio": 0.1,
"lr_warmdown_ratio": 0.0,
"lr_final_frac": 0.0,
"ppt_lr": 0.0001,
"ppt_weight_decay": 0.0,
"ppt_grad_clip": 0.0,
"ppt_device_batch_size": 8,
"ppt_grad_accum_steps": 1,
"ppt_lr_kind": "cosine",
"ppt_lr_warmup_ratio": 0.1,
"ppt_lr_warmdown_ratio": 0.0,
"ppt_tokens": 164160000,
"pt_tokens": null,
"eval_tokens": 2016000,
"reinit_embed_at_transition": true,
"reset_optimizer_at_transition": true,
"depth": null,
"compile_model": true,
"model_project": "nca-repro",
"run_name": "c4_owt_owt50k",
"target_flops": 0.0,
"alpha_ppt": 0.0,
"use_measured_flops": true,
"seed": 0,
"push_to_hf": true,
"hf_repo_org": "alexkstern"
}
Files
- model_017196.pt — model weights (state_dict).
- meta_017196.json — training metadata.
- config_017196.json — run config snapshot.
- rng_017196.pt — RNG state (when present).
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support