c4_owt_2026-05-27_11-25-16_133037-owt

Trained with nanochat. Checkpoint at step 17,196.

W&B run: https://wandb.ai/alexksternteam/nca-repro/runs/a4mqam5g

Outcome

metric value
step 17196
smooth_train_loss 2.862068612192211
min_objective 0.9605072648678921
flops_used 2.8502607518490427e+19
flops_per_token 3161456704.0
total_training_time 29892.98519206047

Training config

{
  "model_pt": {
    "sequence_len": 1024,
    "vocab_size": 64000,
    "n_layer": 24,
    "n_head": 16,
    "n_embd": 1024,
    "intermediate_size": 4096,
    "rope_theta": 10000.0,
    "attention_dropout": 0.1,
    "weight_tying": true,
    "head_bias": true,
    "initializer_range": 0.02
  },
  "model_ppt": null,
  "optim": {
    "matrix_lr": 0.0005,
    "embedding_lr": 0.0005,
    "unembedding_lr": 0.0005,
    "weight_decay": 0.0001
  },
  "train": {
    "num_iterations": 1000,
    "eval_every_n_steps": 1000,
    "save_every_n_steps": null,
    "log_every_n_steps": 1,
    "eval_at_end": true,
    "save_at_end": true,
    "grad_clip": 1.0
  },
  "eval": {
    "eval_steps": 8,
    "eval_tokens": 2016000
  },
  "hardware": {
    "device_batch_size": 64,
    "grad_accum_steps": 2,
    "peak_tflops": 2250.0
  },
  "wandb": {
    "enabled": true,
    "notes": "",
    "group": "repro_c4_owt",
    "tags": [
      "hf_llama",
      "vocab_64000",
      "pt_owt-gpt2bpe-9B",
      "seed_42",
      "ppt_c4-gpt2bpe-10B",
      "ppt_lr_cosine",
      "reinit"
    ],
    "entity": null
  },
  "data_pt": "owt-gpt2bpe-9B",
  "data_ppt": "c4-gpt2bpe-10B",
  "extra_eval": [
    "fineweb-gpt2bpe-20B",
    "nca-paper-1024"
  ],
  "train_split": "train",
  "pad_vocab": 64000,
  "lr_kind": "cosine",
  "lr_warmup_ratio": 0.1,
  "lr_warmdown_ratio": 0.0,
  "lr_final_frac": 0.0,
  "ppt_lr": 0.0001,
  "ppt_weight_decay": 0.0,
  "ppt_grad_clip": 0.0,
  "ppt_device_batch_size": 8,
  "ppt_grad_accum_steps": 1,
  "ppt_lr_kind": "cosine",
  "ppt_lr_warmup_ratio": 0.1,
  "ppt_lr_warmdown_ratio": 0.0,
  "ppt_tokens": 164000000,
  "pt_tokens": null,
  "eval_tokens": 2016000,
  "reinit_embed_at_transition": true,
  "reset_optimizer_at_transition": true,
  "depth": null,
  "compile_model": false,
  "model_project": "nca-repro",
  "run_name": "c4_owt",
  "target_flops": 0.0,
  "alpha_ppt": 0.0,
  "use_measured_flops": true,
  "seed": 42,
  "push_to_hf": true,
  "hf_repo_org": "alexkstern"
}

Files

  • model_017196.pt — model weights (state_dict).
  • meta_017196.json โ€” training metadata.
  • config_017196.json โ€” run config snapshot.
  • rng_017196.pt โ€” RNG state (when present).
Downloads last month

-

Downloads are not tracked for this model. How to track
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support