nazdef
/

gpt2small-en-it-nanochat-lr2e4-bs7-step9000

Text Generation

Model card Files Files and versions

gpt2small-en-it-nanochat-lr2e4-bs7-step9000 / training_config.yaml

nazdef's picture

Upload bs7 best checkpoint step_9000

b162fdc verified about 1 month ago

history blame contribute delete

1.6 kB

	# Derived from configs/stable-config-recipe-v2-gpt2small.yaml
	# Purpose: GPT-2-small stable v2 variant with lr=2e-4 and final_lr at 5% of peak.

	# Filesystem locations for this run: input dataset, run artifacts, tokenizer.
	dataset_dir: '/mnt/apps/llm-nanochat/datasets/202605011052_fresh_50_50_score100_2500_sourcebalanced'
	output_dir: '/mnt/apps/llm-nanochat/artifacts/runs/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7'
	tokenizer_dir: '/mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch'
	# Presumably the global RNG seed -- confirm against the trainer.
	seed: 1337
	# Model architecture. The child keys must be nested under `model:`;
	# left flat, `model` parses as null and these become top-level keys.
	model:
	  # Presumably matches the "32k" tokenizer referenced by tokenizer_dir -- confirm.
	  vocab_size: 32000
	  # 768 / 12 heads = 64 dimensions per head.
	  dim: 768
	  n_layers: 12
	  n_heads: 12
	# Optimisation and checkpointing settings. The child keys must be nested
	# under `training:`; left flat, `training` parses as null.
	training:
	  sequence_length: 2500
	  max_steps: 10000
	  # NOTE(review): presumably 7 x 16 = 112 sequences per optimizer step --
	  # confirm how the trainer combines these two values.
	  batch_size: 7
	  grad_accum_steps: 16
	  # NOTE(review): learning_rate and peak_lr carry the same value; which one
	  # the trainer reads is not visible here -- keep them in sync.
	  learning_rate: 0.0002
	  peak_lr: 0.0002
	  lr_schedule: linear_warmup_cosine
	  # NOTE(review): -1 looks like a sentinel (auto/disabled?) -- confirm
	  # against the trainer before changing.
	  warmup_steps: -1
	  # 5% of peak_lr (0.0002 * 0.05 = 1.0e-05).
	  final_lr: 1.0e-05
	  adamw_betas:
	  - 0.9
	  - 0.95
	  adamw_eps: 1.0e-08
	  weight_decay: 0.1
	  clip_grad_norm: 1.0
	  save_every_steps: 500
	  checkpoint_dir: /mnt/apps/llm-nanochat/checkpoints/stable-config-recipe-v3-gpt2small-lr2e4-batchmaxpossible-bs7
	  precision: bf16
	# Validation and generation-probe settings. The child keys must be nested
	# under `evaluation:`; left flat, `evaluation` parses as null and the
	# per-prompt `expected_next_text` keys collide as top-level siblings.
	evaluation:
	  validation_every_steps: 1000
	  validation_max_batches: 128
	  probe_every_steps: 1000
	  # Same path as the top-level tokenizer_dir.
	  probe_tokenizer_dir: /mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch
	  probe_max_new_tokens: 32
	  # Probe prompts grouped by language code; each entry pairs a prompt with
	  # the text expected to follow it (note the leading space in each value).
	  probe_prompts:
	    en:
	    - prompt: "The capital of Italy is"
	      expected_next_text: " Rome"
	    - prompt: "A small language model should"
	      expected_next_text: " be"
	    it:
	    - prompt: "La capitale d'Italia è"
	      expected_next_text: " Roma"
	    - prompt: "Un piccolo modello linguistico dovrebbe"
	      expected_next_text: " essere"