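# NOTE: the next three lines are upload/commit metadata captured with this file; they are not config keys.
# nazdef's picture
# Upload bs7 best checkpoint step_9000
# b162fdc verified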
# Derived from configs/stable-config-recipe-v2-gpt2small.yaml
# Purpose: GPT-2-small stable v2 variant with peak lr=2e-4 and final_lr=1e-5 (5% of peak).
# Run configuration: data/tokenizer/output locations, model shape, optimizer
# schedule, checkpointing, and periodic evaluation probes.
#
# NOTE(review): nesting below is reconstructed from the section keys (model,
# training, evaluation). Confirm against the consumer's schema, in particular
# that checkpoint_dir and precision belong under `training`.

# Absolute paths for the dataset, run artifacts, and tokenizer.
dataset_dir: /mnt/apps/llm-nanochat/datasets/202605011052_fresh_50_50_score100_2500_sourcebalanced
output_dir: /mnt/apps/llm-nanochat/artifacts/runs/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7
tokenizer_dir: /mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch
seed: 1337

# Model architecture hyperparameters.
model:
  # NOTE(review): presumably matches the 32k tokenizer in tokenizer_dir - confirm.
  vocab_size: 32000
  dim: 768
  n_layers: 12
  n_heads: 12

# Optimization, schedule, and checkpointing settings.
training:
  sequence_length: 2500
  max_steps: 10000
  batch_size: 7
  grad_accum_steps: 16
  # NOTE(review): learning_rate and peak_lr carry the same value; verify which
  # one the trainer actually reads and keep them in sync.
  learning_rate: 0.0002
  peak_lr: 0.0002
  lr_schedule: linear_warmup_cosine
  # NOTE(review): -1 looks like a sentinel (e.g. "derive automatically") rather
  # than a literal step count - confirm the trainer handles it with this schedule.
  warmup_steps: -1
  # 1.0e-05 is 5% of peak_lr (0.0002).
  final_lr: 1.0e-05
  adamw_betas:
    - 0.9
    - 0.95
  adamw_eps: 1.0e-08
  weight_decay: 0.1
  clip_grad_norm: 1.0
  save_every_steps: 500
  checkpoint_dir: /mnt/apps/llm-nanochat/checkpoints/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7
  precision: bf16

# Periodic validation and generation probes.
evaluation:
  validation_every_steps: 1000
  validation_max_batches: 128
  probe_every_steps: 1000
  # Same path as the top-level tokenizer_dir; keep the two in sync.
  probe_tokenizer_dir: /mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch
  probe_max_new_tokens: 32
  # Probe prompts grouped by language code. The leading space in each
  # expected_next_text is part of the value, so these strings must stay quoted.
  probe_prompts:
    en:
      - prompt: "The capital of Italy is"
        expected_next_text: " Rome"
      - prompt: "A small language model should"
        expected_next_text: " be"
    it:
      - prompt: "La capitale d'Italia è"
        expected_next_text: " Roma"
      - prompt: "Un piccolo modello linguistico dovrebbe"
        expected_next_text: " essere"