---
# Derived from configs/stable-config-recipe-v2-gpt2small.yaml
# Purpose: GPT-2-small stable v2 variant with lr=2e-4 and final_lr at 5% of peak.
# NOTE(review): the header says "v2 variant" but output_dir/checkpoint_dir are
# named "...-v3-...-batchmaxpossible-bs7" -- confirm which label is current.
# NOTE(review): nesting below was reconstructed from a flattened copy of this
# file (indentation was lost); confirm section membership against the loader's
# schema, in particular that save_every_steps / checkpoint_dir / precision
# belong under `training`.

# Input data, run artifacts and tokenizer locations.
dataset_dir: /mnt/apps/llm-nanochat/datasets/202605011052_fresh_50_50_score100_2500_sourcebalanced
output_dir: /mnt/apps/llm-nanochat/artifacts/runs/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7
tokenizer_dir: /mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch
seed: 1337

# Model architecture.
model:
  vocab_size: 32000
  dim: 768
  n_layers: 12
  n_heads: 12

# Optimisation and checkpointing.
training:
  sequence_length: 2500
  max_steps: 10000
  batch_size: 7
  grad_accum_steps: 16
  # NOTE(review): learning_rate and peak_lr carry the same value; presumably
  # one is a legacy alias -- keep them in sync or verify which one is read.
  learning_rate: 0.0002
  peak_lr: 0.0002
  lr_schedule: linear_warmup_cosine
  # NOTE(review): -1 is presumably a sentinel (auto/disabled warmup) --
  # confirm how the trainer interprets a negative value.
  warmup_steps: -1
  # 1.0e-05 / 0.0002 = 5% of peak_lr, matching the header comment.
  final_lr: 1.0e-05
  adamw_betas:
    - 0.9
    - 0.95
  adamw_eps: 1.0e-08
  weight_decay: 0.1
  clip_grad_norm: 1.0
  save_every_steps: 500
  checkpoint_dir: /mnt/apps/llm-nanochat/checkpoints/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7
  precision: bf16

# Periodic validation and text-completion probes.
evaluation:
  validation_every_steps: 1000
  validation_max_batches: 128
  probe_every_steps: 1000
  # Same path as the top-level tokenizer_dir.
  probe_tokenizer_dir: /mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch
  probe_max_new_tokens: 32
  # Probes grouped by language code; expected_next_text values keep their
  # leading space, so they must stay quoted.
  probe_prompts:
    en:
      - prompt: "The capital of Italy is"
        expected_next_text: " Rome"
      - prompt: "A small language model should"
        expected_next_text: " be"
    it:
      - prompt: "La capitale d'Italia è"
        expected_next_text: " Roma"
      - prompt: "Un piccolo modello linguistico dovrebbe"
        expected_next_text: " essere"