nicholasKluge committed on
Commit 1645e8f · verified · 1 Parent(s): 9fe2d03

Upload config_stage_1.yaml with huggingface_hub

Files changed (1)
  1. config_stage_1.yaml +94 -0
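The commit message states the file was uploaded with huggingface_hub. A minimal sketch of such an upload via HfApi.upload_file is shown below; the repo_id and local path are illustrative placeholders, not values taken from this commit.

from huggingface_hub import HfApi

api = HfApi()  # authentication is read from the locally cached Hugging Face token
api.upload_file(
    path_or_fileobj="config_stage_1.yaml",   # local file to push
    path_in_repo="config_stage_1.yaml",      # destination path inside the repo
    repo_id="nicholasKluge/example-repo",    # placeholder repo_id, not the actual target repo
    commit_message="Upload config_stage_1.yaml with huggingface_hub",
)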
config_stage_1.yaml ADDED
@@ -0,0 +1,94 @@
+ # Directory settings
+ checkpoint_dir: "/lustre/scratch/data/polyglot_datasets/portuguese/checkpoints/models/tucano_v2"
+ train_dataset_dir:
+ # Total: ~292B
+ # Portuguese Text (~180B, 61%) | English Text (~112B, 39%)
+ # Web Text (~268B, 91%)
+ - /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/gigaverbo_v2/3 # 90B (PT)
+ - /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/gigaverbo_v2/3 # 90B (PT)
+ - /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/fineweb/3 # 88B (EN)
+ # Math Text (~24B, 9%)
+ - /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/finemath/3 # 24B (EN)
+ val_dataset_dir: "/lustre/scratch/data/polyglot_datasets/portuguese/tokenized/validation"
+ dataset_type: "parquet"
+ cache_dir: "/lustre/mlnvme/data/polyglot/.cache"
+
+ # Data loading settings
+ pin_memory: true
+ num_workers_for_dataloader: 32
+ shuffle_dataset: true
+ mask_eos_token: false
+ mask_pad_token: false
+
+ # Model architecture settings
+ vocab_size: 49152
+ num_hidden_layers: 28
+ num_attention_heads: 16
+ num_key_value_heads: 8
+ head_dim: null
+ hidden_size: 1536
+ intermediate_size: 3072
+ max_position_embeddings: 4096
+ tie_word_embeddings: true
+ hidden_act: "silu"
+ output_hidden_states: false
+ attn_implementation: "flash_attention_2"
+ use_cache: false
+ no_rope_layer_interval: null
+ rope_theta: 50000.0
+ rope_scale_factor: null
+ rms_norm_eps: 0.000001
+
+ # Training settings
+ total_batch_size: 2097152
+ micro_batch_size: 16
+ eval_micro_batch_size: 8
+ num_train_epochs: 1
+ warmup_steps: 2000
+ max_learning_rate: 0.0007
+ min_learning_rate: 0.0
+ muon_learning_rate: 0.007
+ weight_decay: 0.1
+ beta1: 0.9
+ beta2: 0.95
+ eps: 0.00000001
+ lr_decay_type: "wsd"
+ use_neg_sqrt: false
+ lr_decay_iters_coef: 0.0
+ seed: 1337
+ max_steps: 100000
+ max_grad_norm: 1.0
+
+ # Precision and optimization settings
+ torch_compile: false
+ mat_mul_precision: "highest"
+ tf32: true
+ bf16: true
+ gradient_checkpointing: false
+ use_liger_kernel: true
+ static_graph: false
+
+ # Hub settings
+ push_to_hub: false
+ hub_token: null
+ hub_model_id: null
+
+ # Tokenizer and Reference model
+ tokenizer_name_or_path: "/lustre/scratch/data/polyglot_datasets/portuguese/checkpoints/tokenizers/sentencepiece"
+ chat_template_path: null
+ reference_model: "HuggingFaceTB/SmolLM2-360M"
+ continual_pretraining: false
+
+ # Checkpoint settings
+ resume_from_checkpoint: null
+ checkpointing_steps: 5000
+ begin_new_stage: false
+ stage_name: "stage1_ws"
+
+ # Miscellaneous settings
+ sanity_check: false
+ sanity_check_num_samples: 100000
+ wandb_token: null
+ wandb_id: "tucano-v2"
+ wandb_project: "Polyglot"
+ wandb_desc: "Developing LLMs for low-resource languages"
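The training settings above pair a large total_batch_size with a small per-device micro_batch_size. Assuming total_batch_size is measured in tokens and each micro batch uses the full max_position_embeddings context (both assumptions, not stated in the file), a consuming script might derive a gradient-accumulation factor roughly as in the sketch below; world_size and the variable names are hypothetical.

import yaml

# Sketch: load the config and derive a gradient-accumulation factor.
# The token-based reading of total_batch_size is an assumption for illustration.
with open("config_stage_1.yaml") as f:
    cfg = yaml.safe_load(f)

world_size = 8  # hypothetical GPU count; not part of the config file
tokens_per_micro_step = cfg["micro_batch_size"] * cfg["max_position_embeddings"]  # 16 * 4096 = 65,536
grad_accum_steps = cfg["total_batch_size"] // (tokens_per_micro_step * world_size)
print(grad_accum_steps)  # 2,097,152 // (65,536 * 8) = 4 under these assumptions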