Upload bs7 best checkpoint step_9000

Browse files

Files changed (13) hide show

README.md +79 -0
best_validation.json +8 -0
eval_metrics.jsonl +10 -0
eval_summary.json +27 -0
metrics.jsonl +0 -0
probe_generations.jsonl +0 -0
probe_step9000_summary.json +38 -0
step_9000.pt +3 -0
step_9000.safetensors +3 -0
step_9000.safetensors.json +287 -0
tokenizer.json +0 -0
tokenizer_meta.json +10 -0
training_config.yaml +48 -0

README.md ADDED Viewed

	@@ -0,0 +1,79 @@

+---
+language:
+- en
+- it
+license: other
+library_name: custom
+pipeline_tag: text-generation
+tags:
+- nanochat
+- gpt2-small
+- bilingual
+- english
+- italian
+- pretraining
+---
+# gpt2small-en-it-nanochat-lr2e4-batchmaxpossible-bs7-step9000
+This repo stages the best saved checkpoint from the local NanoChat EN/IT GPT-2-small-like run `stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7`.
+## What this is
+- model family: GPT-2-small-like decoder-only LM
+- parameters: ~136M
+- languages: English + Italian
+- context length: 2500
+- selected checkpoint: `step_9000.pt`
+- selection reason: lowest validation loss among the checkpoints evaluated during the run (every evaluation is listed in `eval_metrics.jsonl`; the winning entry is recorded in `best_validation.json`)
+## Best validation
+- step: 9000
+- validation loss: 4.0797094479
+- validation perplexity: 59.1282875069
+- validation batches: 128
+## Important caveat
+A later checkpoint `step_10000.pt` exists, but it is worse on validation than `step_9000.pt` (validation loss 4.1235 vs. 4.0797), so this release intentionally publishes `step_9000.pt` instead of the latest saved checkpoint.
+## Training/data provenance
+- training config: `training_config.yaml`
+- tokenizer: `tokenizer.json` + `tokenizer_meta.json`
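+- packed dataset root used by the run (path on the training host; not included in this repo): `/mnt/apps/llm-nanochat/datasets/202605011052_fresh_50_50_score100_2500_sourcebalanced`
+- tokenizer root used by the run (path on the training host; the tokenizer files themselves are included here): `/mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch`
+- known metadata discrepancy: `step_9000.safetensors.json` records `batch_size: 6` and a tokenizer directory of `/mnt/apps/llm-nanochat/tokenizers/tok_20260515_fresh_50_50_score100_500m_32k_fromscratch`, whereas `training_config.yaml` specifies `batch_size: 7` and the tokenizer root listed above; the two records have not been reconciled, so treat the export metadata with caution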
+## Included files
+- `step_9000.pt`
+- `step_9000.safetensors`
+- `step_9000.safetensors.json`
+- `training_config.yaml`
+- `tokenizer.json`
+- `tokenizer_meta.json`
+- `best_validation.json`
+- `eval_summary.json`
+- `probe_step9000_summary.json`
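+- full run telemetry snapshots: `eval_metrics.jsonl`, `metrics.jsonl`, `probe_generations.jsonl` (note: `elapsed_sec` in `eval_metrics.jsonl` drops between step 4000 and step 5000, so it is not cumulative wall-clock time for the whole run)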
+## Probe reading at step 9000
+- EN factual prompt `The capital of Italy is -> Rome`: weak (`rank=248`)
+- EN simple continuation `A small language model should -> be`: strong (`rank=1`)
+- IT factual prompt `La capitale d'Italia è -> Roma`: weak (`rank=1103`)
+- IT simple continuation `Un piccolo modello linguistico dovrebbe -> essere`: strong (`rank=1`)
+So this checkpoint is useful as a real intermediate bilingual pretraining artifact, but it is not a polished factual model.
+## Usage
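+This project uses a custom NanoChat inference/training stack, and the checkpoints in this repo are in that project's own format (`library_name: custom`) rather than a standard library layout. The easiest local UI is the Chainlit checkpoint tester, which is documented in the README of the NanoChat source repo (not in this model repo).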
+## Limitations
+- factual recall is still weak
+- generations can become repetitive
+- the model was selected by validation loss inside this run family, not by broad downstream benchmark performance
+- dataset redistribution for the full training corpus may have separate licensing constraints; this repo contains model artifacts, not the raw/prepared training corpus

best_validation.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "step": 9000,
+  "validation_loss": 4.079709447920322,
+  "validation_perplexity": 59.128287506917495,
+  "validation_num_batches": 128,
+  "elapsed_sec": 33191.382900476456,
+  "checkpoint_path": "/mnt/apps/llm-nanochat/checkpoints/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7/step_9000.pt"
+}

eval_metrics.jsonl ADDED Viewed

	@@ -0,0 +1,10 @@

+{"step": 1000, "validation_loss": 5.71584378182888, "validation_perplexity": 303.6403014175998, "validation_num_batches": 128, "elapsed_sec": 9672.132137775421}
+{"step": 2000, "validation_loss": 4.976522132754326, "validation_perplexity": 144.96931984461716, "validation_num_batches": 128, "elapsed_sec": 19343.931071043015}
+{"step": 3000, "validation_loss": 4.552704691886902, "validation_perplexity": 94.88870626918761, "validation_num_batches": 128, "elapsed_sec": 29024.834416627884}
+{"step": 4000, "validation_loss": 4.3076410908252, "validation_perplexity": 74.26509753553422, "validation_num_batches": 128, "elapsed_sec": 38696.663430929184}
+{"step": 5000, "validation_loss": 4.182185200974345, "validation_perplexity": 65.50884691823217, "validation_num_batches": 128, "elapsed_sec": 4152.805989980698}
+{"step": 6000, "validation_loss": 4.291273836046457, "validation_perplexity": 73.05947504235718, "validation_num_batches": 128, "elapsed_sec": 8294.450510978699}
+{"step": 7000, "validation_loss": 4.166212774813175, "validation_perplexity": 64.47082364114317, "validation_num_batches": 128, "elapsed_sec": 16590.2826256752}
+{"step": 8000, "validation_loss": 4.110390676185489, "validation_perplexity": 60.97053265245032, "validation_num_batches": 128, "elapsed_sec": 24895.48659992218}
+{"step": 9000, "validation_loss": 4.079709447920322, "validation_perplexity": 59.128287506917495, "validation_num_batches": 128, "elapsed_sec": 33191.382900476456}
+{"step": 10000, "validation_loss": 4.123492615297437, "validation_perplexity": 61.774620914104325, "validation_num_batches": 128, "elapsed_sec": 41492.283163785934}

eval_summary.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "model_name": "gpt2small-en-it-nanochat-lr2e4-batchmaxpossible-bs7-step9000",
+  "selected_checkpoint": "step_9000.pt",
+  "selection_reason": "best_validation.json minimum validation loss for this run",
+  "best_validation": {
+    "step": 9000,
+    "validation_loss": 4.079709447920322,
+    "validation_perplexity": 59.128287506917495,
+    "validation_num_batches": 128,
+    "elapsed_sec": 33191.382900476456,
+    "checkpoint_path": "/mnt/apps/llm-nanochat/checkpoints/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7/step_9000.pt"
+  },
+  "final_validation_step_10000": {
+    "step": 10000,
+    "validation_loss": 4.123492615297437,
+    "validation_perplexity": 61.774620914104325,
+    "validation_num_batches": 128,
+    "elapsed_sec": 41492.283163785934
+  },
+  "notes": [
+    "This is the best saved checkpoint of the stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7 run.",
+    "The later checkpoint step_10000.pt exists but is worse on validation than step_9000.pt.",
+    "Probe quality remains mixed: simple continuations are strong, factual recall remains weak and repetitive."
+  ],
+  "tokenizer_dir": "/mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch",
+  "dataset_dir": "/mnt/apps/llm-nanochat/datasets/202605011052_fresh_50_50_score100_2500_sourcebalanced"
+}

metrics.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

probe_generations.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

probe_step9000_summary.json ADDED Viewed

	@@ -0,0 +1,38 @@

+[
+  {
+    "language": "en",
+    "prompt": "The capital of Italy is",
+    "expected_next_text": " Rome",
+    "completion": " the capital of Italy. The capital of Italy is the capital of Italy. The capital of Italy is the capital of Italy. The capital of Italy is the capital",
+    "correct_token_rank": 248,
+    "correct_token_probability": 0.0003604888916015625,
+    "entropy": 5.0625
+  },
+  {
+    "language": "en",
+    "prompt": "A small language model should",
+    "expected_next_text": " be",
+    "completion": " be used to be used to be used to be used to be used to be used to be used to be used to be used to be used to be used",
+    "correct_token_rank": 1,
+    "correct_token_probability": 0.45703125,
+    "entropy": 3.703125
+  },
+  {
+    "language": "it",
+    "prompt": "La capitale d'Italia è",
+    "expected_next_text": " Roma",
+    "completion": " stata occupata da un'altra parte, e la sua posizione è stata di fatto. La sua posizione è stata di fatto, e la sua posizione è stata di",
+    "correct_token_rank": 1103,
+    "correct_token_probability": 4.6253204345703125e-05,
+    "entropy": 5.125
+  },
+  {
+    "language": "it",
+    "prompt": "Un piccolo modello linguistico dovrebbe",
+    "expected_next_text": " essere",
+    "completion": " essere un'opera di un'opera di un'opera di un'opera di un'opera di un'opera di un'opera di un'opera",
+    "correct_token_rank": 1,
+    "correct_token_probability": 0.3984375,
+    "entropy": 4.34375
+  }
+]

step_9000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0712992c4bfa86045f9a612d327c4624f83c6ae9058efe24f36a195e7766efb5
+size 1633717975

step_9000.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5c93c3bf22b8102a97d074a5ca6394282b67a0f0c82bf745191d7e0f942f2ef
+size 544531056

step_9000.safetensors.json ADDED Viewed

	@@ -0,0 +1,287 @@

+{
+  "checkpoint_config": {
+    "actual_precision": "bf16",
+    "adamw_betas": [
+      0.9,
+      0.95
+    ],
+    "adamw_eps": 1e-08,
+    "attention_kernel_policy": "auto",
+    "batch_size": 6,
+    "benchmark": {
+      "enable_central_tensorboard": true,
+      "enable_local_tensorboard": true,
+      "enabled": false,
+      "output_path": "/mnt/apps/llm-nanochat/artifacts/runs/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7/throughput_benchmark.json",
+      "warmup_steps": 0
+    },
+    "checkpoint_dir": "/mnt/apps/llm-nanochat/checkpoints/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7",
+    "clip_grad_norm": 1.0,
+    "compile": {
+      "backend": null,
+      "compile_setup_sec": 0.0,
+      "diagnostic": null,
+      "dynamic": false,
+      "enabled": false,
+      "error_policy": "raise",
+      "fullgraph": false,
+      "mode": null,
+      "requested": false,
+      "status": "disabled"
+    },
+    "dataset": {
+      "storage_mode": "indexed_jsonl"
+    },
+    "decay_steps": 9850,
+    "deterministic_algorithms": false,
+    "device": "cuda",
+    "dim": 768,
+    "final_lr": 1e-05,
+    "fp8_backend": null,
+    "grad_accum_steps": 16,
+    "learning_rate": 0.0002,
+    "logging": {
+      "enable_central_tensorboard": true,
+      "enable_local_tensorboard": true,
+      "metrics_flush_every_steps": 1,
+      "metrics_writer": "persistent_jsonl_handle"
+    },
+    "lr": 0.0002,
+    "lr_schedule": "linear_warmup_cosine",
+    "max_seq_len": 2500,
+    "max_steps": 10000,
+    "n_heads": 12,
+    "n_layers": 12,
+    "optimizer": {
+      "backend": "torch",
+      "betas": [
+        0.9,
+        0.95
+      ],
+      "eps": 1e-08,
+      "implementation": "torch.optim.AdamW",
+      "learning_rate": 0.0002,
+      "state_precision": "full_precision",
+      "type": "adamw",
+      "weight_decay": 0.1
+    },
+    "optimizer_backend": "torch",
+    "optimizer_implementation": "torch.optim.AdamW",
+    "optimizer_state_precision": "full_precision",
+    "optimizer_type": "adamw",
+    "peak_lr": 0.0002,
+    "repro": {
+      "attention_kernel_policy": "auto",
+      "cublas_workspace_config": null,
+      "cudnn_benchmark": true,
+      "cudnn_deterministic": false,
+      "deterministic_algorithms": false,
+      "flash_sdp_enabled": true,
+      "math_sdp_enabled": true,
+      "mem_efficient_sdp_enabled": true,
+      "pythonhashseed": "1337",
+      "seed": 1337
+    },
+    "requested_precision": "bf16",
+    "save_every_steps": 500,
+    "scheduler": {
+      "decay_steps": 9850,
+      "final_lr": 1e-05,
+      "peak_lr": 0.0002,
+      "schedule_type": "linear_warmup_cosine",
+      "stable_steps": 0,
+      "total_steps": 10000,
+      "warmup_steps": 150
+    },
+    "seed": 1337,
+    "stable_steps": 0,
+    "train_cache_ram_bytes": 1073741824,
+    "train_cache_ram_mb": 1024,
+    "vocab_size": 32000,
+    "warmup_steps": 150,
+    "weight_decay": 0.1
+  },
+  "checkpoint_path": "/mnt/apps/llm-nanochat/checkpoints/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7/step_9000.pt",
+  "exported_at": "2026-05-15T09:49:45.812469+00:00",
+  "format": "llm-nanochat-safetensors-export",
+  "global_step": 9000,
+  "metadata_path": "/mnt/apps/llm-nanochat/hf_exports/gpt2small-en-it-nanochat-lr2e4-batchmaxpossible-bs7-step9000/step_9000.safetensors.json",
+  "model_config": {
+    "dim": 768,
+    "max_seq_len": 2500,
+    "n_heads": 12,
+    "n_layers": 12,
+    "vocab_size": 32000
+  },
+  "num_parameters": 136128000,
+  "num_tensors": 149,
+  "provenance": {
+    "checkpoint_dir": "/mnt/apps/llm-nanochat/checkpoints/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7",
+    "checkpoint_name": "step_9000.pt",
+    "checkpoint_path": "/mnt/apps/llm-nanochat/checkpoints/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7/step_9000.pt",
+    "global_step": 9000,
+    "packed_dataset_config_path": null,
+    "run_dir": "/mnt/apps/llm-nanochat/checkpoints",
+    "tokenizer_dir": "/mnt/apps/llm-nanochat/tokenizers/tok_20260515_fresh_50_50_score100_500m_32k_fromscratch",
+    "training_config_path": null
+  },
+  "safetensors_path": "/mnt/apps/llm-nanochat/hf_exports/gpt2small-en-it-nanochat-lr2e4-batchmaxpossible-bs7-step9000/step_9000.safetensors",
+  "source_checkpoint_path": "/mnt/apps/llm-nanochat/checkpoints/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7/step_9000.pt",
+  "source_global_step": 9000,
+  "tensor_names": [
+    "token_emb.weight",
+    "pos_emb.weight",
+    "blocks.layers.0.self_attn.in_proj_weight",
+    "blocks.layers.0.self_attn.in_proj_bias",
+    "blocks.layers.0.self_attn.out_proj.weight",
+    "blocks.layers.0.self_attn.out_proj.bias",
+    "blocks.layers.0.linear1.weight",
+    "blocks.layers.0.linear1.bias",
+    "blocks.layers.0.linear2.weight",
+    "blocks.layers.0.linear2.bias",
+    "blocks.layers.0.norm1.weight",
+    "blocks.layers.0.norm1.bias",
+    "blocks.layers.0.norm2.weight",
+    "blocks.layers.0.norm2.bias",
+    "blocks.layers.1.self_attn.in_proj_weight",
+    "blocks.layers.1.self_attn.in_proj_bias",
+    "blocks.layers.1.self_attn.out_proj.weight",
+    "blocks.layers.1.self_attn.out_proj.bias",
+    "blocks.layers.1.linear1.weight",
+    "blocks.layers.1.linear1.bias",
+    "blocks.layers.1.linear2.weight",
+    "blocks.layers.1.linear2.bias",
+    "blocks.layers.1.norm1.weight",
+    "blocks.layers.1.norm1.bias",
+    "blocks.layers.1.norm2.weight",
+    "blocks.layers.1.norm2.bias",
+    "blocks.layers.2.self_attn.in_proj_weight",
+    "blocks.layers.2.self_attn.in_proj_bias",
+    "blocks.layers.2.self_attn.out_proj.weight",
+    "blocks.layers.2.self_attn.out_proj.bias",
+    "blocks.layers.2.linear1.weight",
+    "blocks.layers.2.linear1.bias",
+    "blocks.layers.2.linear2.weight",
+    "blocks.layers.2.linear2.bias",
+    "blocks.layers.2.norm1.weight",
+    "blocks.layers.2.norm1.bias",
+    "blocks.layers.2.norm2.weight",
+    "blocks.layers.2.norm2.bias",
+    "blocks.layers.3.self_attn.in_proj_weight",
+    "blocks.layers.3.self_attn.in_proj_bias",
+    "blocks.layers.3.self_attn.out_proj.weight",
+    "blocks.layers.3.self_attn.out_proj.bias",
+    "blocks.layers.3.linear1.weight",
+    "blocks.layers.3.linear1.bias",
+    "blocks.layers.3.linear2.weight",
+    "blocks.layers.3.linear2.bias",
+    "blocks.layers.3.norm1.weight",
+    "blocks.layers.3.norm1.bias",
+    "blocks.layers.3.norm2.weight",
+    "blocks.layers.3.norm2.bias",
+    "blocks.layers.4.self_attn.in_proj_weight",
+    "blocks.layers.4.self_attn.in_proj_bias",
+    "blocks.layers.4.self_attn.out_proj.weight",
+    "blocks.layers.4.self_attn.out_proj.bias",
+    "blocks.layers.4.linear1.weight",
+    "blocks.layers.4.linear1.bias",
+    "blocks.layers.4.linear2.weight",
+    "blocks.layers.4.linear2.bias",
+    "blocks.layers.4.norm1.weight",
+    "blocks.layers.4.norm1.bias",
+    "blocks.layers.4.norm2.weight",
+    "blocks.layers.4.norm2.bias",
+    "blocks.layers.5.self_attn.in_proj_weight",
+    "blocks.layers.5.self_attn.in_proj_bias",
+    "blocks.layers.5.self_attn.out_proj.weight",
+    "blocks.layers.5.self_attn.out_proj.bias",
+    "blocks.layers.5.linear1.weight",
+    "blocks.layers.5.linear1.bias",
+    "blocks.layers.5.linear2.weight",
+    "blocks.layers.5.linear2.bias",
+    "blocks.layers.5.norm1.weight",
+    "blocks.layers.5.norm1.bias",
+    "blocks.layers.5.norm2.weight",
+    "blocks.layers.5.norm2.bias",
+    "blocks.layers.6.self_attn.in_proj_weight",
+    "blocks.layers.6.self_attn.in_proj_bias",
+    "blocks.layers.6.self_attn.out_proj.weight",
+    "blocks.layers.6.self_attn.out_proj.bias",
+    "blocks.layers.6.linear1.weight",
+    "blocks.layers.6.linear1.bias",
+    "blocks.layers.6.linear2.weight",
+    "blocks.layers.6.linear2.bias",
+    "blocks.layers.6.norm1.weight",
+    "blocks.layers.6.norm1.bias",
+    "blocks.layers.6.norm2.weight",
+    "blocks.layers.6.norm2.bias",
+    "blocks.layers.7.self_attn.in_proj_weight",
+    "blocks.layers.7.self_attn.in_proj_bias",
+    "blocks.layers.7.self_attn.out_proj.weight",
+    "blocks.layers.7.self_attn.out_proj.bias",
+    "blocks.layers.7.linear1.weight",
+    "blocks.layers.7.linear1.bias",
+    "blocks.layers.7.linear2.weight",
+    "blocks.layers.7.linear2.bias",
+    "blocks.layers.7.norm1.weight",
+    "blocks.layers.7.norm1.bias",
+    "blocks.layers.7.norm2.weight",
+    "blocks.layers.7.norm2.bias",
+    "blocks.layers.8.self_attn.in_proj_weight",
+    "blocks.layers.8.self_attn.in_proj_bias",
+    "blocks.layers.8.self_attn.out_proj.weight",
+    "blocks.layers.8.self_attn.out_proj.bias",
+    "blocks.layers.8.linear1.weight",
+    "blocks.layers.8.linear1.bias",
+    "blocks.layers.8.linear2.weight",
+    "blocks.layers.8.linear2.bias",
+    "blocks.layers.8.norm1.weight",
+    "blocks.layers.8.norm1.bias",
+    "blocks.layers.8.norm2.weight",
+    "blocks.layers.8.norm2.bias",
+    "blocks.layers.9.self_attn.in_proj_weight",
+    "blocks.layers.9.self_attn.in_proj_bias",
+    "blocks.layers.9.self_attn.out_proj.weight",
+    "blocks.layers.9.self_attn.out_proj.bias",
+    "blocks.layers.9.linear1.weight",
+    "blocks.layers.9.linear1.bias",
+    "blocks.layers.9.linear2.weight",
+    "blocks.layers.9.linear2.bias",
+    "blocks.layers.9.norm1.weight",
+    "blocks.layers.9.norm1.bias",
+    "blocks.layers.9.norm2.weight",
+    "blocks.layers.9.norm2.bias",
+    "blocks.layers.10.self_attn.in_proj_weight",
+    "blocks.layers.10.self_attn.in_proj_bias",
+    "blocks.layers.10.self_attn.out_proj.weight",
+    "blocks.layers.10.self_attn.out_proj.bias",
+    "blocks.layers.10.linear1.weight",
+    "blocks.layers.10.linear1.bias",
+    "blocks.layers.10.linear2.weight",
+    "blocks.layers.10.linear2.bias",
+    "blocks.layers.10.norm1.weight",
+    "blocks.layers.10.norm1.bias",
+    "blocks.layers.10.norm2.weight",
+    "blocks.layers.10.norm2.bias",
+    "blocks.layers.11.self_attn.in_proj_weight",
+    "blocks.layers.11.self_attn.in_proj_bias",
+    "blocks.layers.11.self_attn.out_proj.weight",
+    "blocks.layers.11.self_attn.out_proj.bias",
+    "blocks.layers.11.linear1.weight",
+    "blocks.layers.11.linear1.bias",
+    "blocks.layers.11.linear2.weight",
+    "blocks.layers.11.linear2.bias",
+    "blocks.layers.11.norm1.weight",
+    "blocks.layers.11.norm1.bias",
+    "blocks.layers.11.norm2.weight",
+    "blocks.layers.11.norm2.bias",
+    "ln_f.weight",
+    "ln_f.bias",
+    "head.weight"
+  ],
+  "tokenizer_reference": {
+    "packed_dataset_config_path": null,
+    "tokenizer_dir": "/mnt/apps/llm-nanochat/tokenizers/tok_20260515_fresh_50_50_score100_500m_32k_fromscratch",
+    "training_config_path": null
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_meta.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "vocab_size_requested": 32000,
+  "vocab_size_actual": 32000,
+  "special_tokens": [
+    "<pad>",
+    "<bos>",
+    "<eos>",
+    "<unk>"
+  ]
+}

training_config.yaml ADDED Viewed

	@@ -0,0 +1,48 @@

+# Derived from configs/stable-config-recipe-v2-gpt2small.yaml
+# Purpose: GPT-2-small stable-recipe variant for the run `stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7`: lr=2e-4, batch_size=7, and final_lr at 5% of peak.
+dataset_dir: /mnt/apps/llm-nanochat/datasets/202605011052_fresh_50_50_score100_2500_sourcebalanced
+output_dir: /mnt/apps/llm-nanochat/artifacts/runs/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7
+tokenizer_dir: /mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch
+seed: 1337
+model:
+  vocab_size: 32000
+  dim: 768
+  n_layers: 12
+  n_heads: 12
+training:
+  sequence_length: 2500
+  max_steps: 10000
+  batch_size: 7
+  grad_accum_steps: 16
+  learning_rate: 0.0002
+  peak_lr: 0.0002
+  lr_schedule: linear_warmup_cosine
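+  warmup_steps: -1  # NOTE(review): the exported checkpoint config (step_9000.safetensors.json) records warmup_steps: 150, so -1 presumably requests an automatically derived warmup — confirm against the trainer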
+  final_lr: 1.0e-05
+  adamw_betas:
+  - 0.9
+  - 0.95
+  adamw_eps: 1.0e-08
+  weight_decay: 0.1
+  clip_grad_norm: 1.0
+  save_every_steps: 500
+  checkpoint_dir: /mnt/apps/llm-nanochat/checkpoints/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7
+  precision: bf16
+evaluation:
+  validation_every_steps: 1000
+  validation_max_batches: 128
+  probe_every_steps: 1000
+  probe_tokenizer_dir: /mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch
+  probe_max_new_tokens: 32
+  probe_prompts:
+    en:
+      - prompt: "The capital of Italy is"
+        expected_next_text: " Rome"
+      - prompt: "A small language model should"
+        expected_next_text: " be"
+    it:
+      - prompt: "La capitale d'Italia è"
+        expected_next_text: " Roma"
+      - prompt: "Un piccolo modello linguistico dovrebbe"
+        expected_next_text: " essere"