c4_owt_owt50k_2026-06-04_15-00-22_019643-owt

Trained with nanochat. Checkpoint at step 17,196.

W&B run: https://wandb.ai/alexksternteam/nca-repro/runs/xsgly3mc

Outcome

| metric | value |
| --- | --- |
| `step` | 17196 |
| `smooth_train_loss` | 2.8100271224975586 |
| `min_objective` | 0.9576489225100749 |
| `flops_used` | 2.774395604019708e+19 |
| `flops_per_token` | 3077308480.0 |
| `total_training_time` | 19525.979648590088 |

Training config

{
  "model_pt": {
    "sequence_len": 1024,
    "vocab_size": 50304,
    "n_layer": 24,
    "n_head": 16,
    "n_embd": 1024,
    "intermediate_size": 4096,
    "rope_theta": 10000.0,
    "attention_dropout": 0.1,
    "weight_tying": false,
    "head_bias": false,
    "initializer_range": 0.02,
    "attn_implementation": "sdpa"
  },
  "model_ppt": null,
  "optim": {
    "matrix_lr": 0.0005,
    "embedding_lr": 0.0005,
    "unembedding_lr": 0.0005,
    "weight_decay": 0.0001
  },
  "train": {
    "num_iterations": 1000,
    "eval_every_n_steps": 2000,
    "save_every_n_steps": null,
    "log_every_n_steps": 10,
    "eval_at_end": true,
    "save_at_end": true,
    "grad_clip": 1.0,
    "ema_beta": 0.0
  },
  "eval": {
    "eval_steps": 4,
    "eval_tokens": 2016000
  },
  "hardware": {
    "device_batch_size": 128,
    "grad_accum_steps": 1,
    "peak_tflops": 2250.0
  },
  "wandb": {
    "enabled": true,
    "notes": "",
    "group": "repro_c4_owt",
    "tags": [
      "hf_llama",
      "vocab_64000",
      "pt_owt-gpt2bpe-9B",
      "seed_0",
      "ppt_c4-gpt2bpe-10B",
      "ppt_lr_cosine",
      "reinit"
    ],
    "entity": null
  },
  "data_pt": "owt-gpt2bpe-9B",
  "data_ppt": "c4-gpt2bpe-10B",
  "extra_eval": [
    "fineweb-gpt2bpe-20B",
    "nca-paper-1024"
  ],
  "train_split": "train",
  "pad_vocab": 50304,
  "lr_kind": "cosine",
  "lr_warmup_ratio": 0.1,
  "lr_warmdown_ratio": 0.0,
  "lr_final_frac": 0.0,
  "ppt_lr": 0.0001,
  "ppt_weight_decay": 0.0,
  "ppt_grad_clip": 0.0,
  "ppt_device_batch_size": 8,
  "ppt_grad_accum_steps": 1,
  "ppt_lr_kind": "cosine",
  "ppt_lr_warmup_ratio": 0.1,
  "ppt_lr_warmdown_ratio": 0.0,
  "ppt_tokens": 164160000,
  "pt_tokens": null,
  "eval_tokens": 2016000,
  "reinit_embed_at_transition": true,
  "reset_optimizer_at_transition": true,
  "depth": null,
  "compile_model": true,
  "model_project": "nca-repro",
  "run_name": "c4_owt_owt50k",
  "target_flops": 0.0,
  "alpha_ppt": 0.0,
  "use_measured_flops": true,
  "seed": 0,
  "push_to_hf": true,
  "hf_repo_org": "alexkstern"
}

Files

- `model_017196.pt` — model weights (state_dict).
- `meta_017196.json` — training metadata.
- `config_017196.json` — run config snapshot.
- `rng_017196.pt` — RNG state (when present).

Downloads last month: -; Downloads are not tracked for this model. How to track

Inference Providers NEW

This model isn't deployed by any Inference Provider. 🙋 Ask for provider support