agrv committed on
Commit
7498f9f
·
verified ·
1 Parent(s): 71c05f6

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ wandb/run-20251128_151948-j8dmy8fe/run-j8dmy8fe.wandb filter=lfs diff=lfs merge=lfs -text
.hydra/config.yaml ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compile: true
2
+ device: cuda
3
+ from_checkpoint: null
4
+ load_mtp_head_from_model: null
5
+ name: nanogpt
6
+ training:
7
+ random_seed: 13
8
+ batch_size: 256
9
+ device_batch_size: 1
10
+ sequence_length: 8192
11
+ num_iterations: 900
12
+ learning_rate: 0.0003
13
+ use_scheduler: false
14
+ save_model: true
15
+ save_optimizer: true
16
+ save_model_every: 100
17
+ val_loss_every: 100
18
+ val_tokens: 4194304
19
+ expname: llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1
20
+ model:
21
+ name: mtp
22
+ beta: 0.0
23
+ gamma: 1
24
+ kl_algorithm: full
25
+ kl_type: forward
26
+ model:
27
+ _target_: mtp.models.mtp.MultiTokenLM
28
+ lm: ${lm.model}
29
+ circuit: ${circuit.model}
30
+ mt_head_kwargs: ${mt_head.hyperparameters}
31
+ init_from_lm_head: true
32
+ kl_type: ${model.kl_type}
33
+ kl_algorithm: ${model.kl_algorithm}
34
+ beta: 0
35
+ gamma: 0.9
36
+ circuit:
37
+ name: btree
38
+ n_token: 8
39
+ n_component: 32
40
+ n_repetition: 1
41
+ model:
42
+ _target_: mtp.models.circuits.CircuitModel
43
+ vocab_size: ${data.vocab_size}
44
+ n_token: ${circuit.n_token}
45
+ n_component: ${circuit.n_component}
46
+ n_repetition: ${circuit.n_repetition}
47
+ kind: btree
48
+ mt_head:
49
+ name: linear-evabyte
50
+ hyperparameters:
51
+ type: evabyte
52
+ n_embd: ${lm.n_embd}
53
+ transformer_n_head: ${lm.n_head}
54
+ transformer_n_layer: 0
55
+ expander_type: linear
56
+ expander_n_layer: 1
57
+ freeze_vocab_unembedding: false
58
+ share_sum_weights: false
59
+ contextual_hmm_weights: true
60
+ init_hmm_identity: true
61
+ adaptor:
62
+ name: none
63
+ hyperparameters: null
64
+ lm:
65
+ name: llama3-2-3b-byte
66
+ n_embd: 3072
67
+ n_head: 24
68
+ model:
69
+ _target_: mtp.models.lm.LM
70
+ lm: null
71
+ encoder_only: true
72
+ from_checkpoint: null
73
+ from_huggingface: benjamin/Llama3-2-3B-IT-Byte
74
+ adaptor_kwargs: ${adaptor.hyperparameters}
75
+ ref_enc: model
76
+ ref_head: lm_head
77
+ freeze: true
78
+ data:
79
+ name: tulu3-llama3
80
+ train_bin: agrv/tulu-v3-sft-llama3-packed-seq-len-8192
81
+ val_bin: null
82
+ vocab_size: 268
83
+ generate:
84
+ speculative: false
.hydra/hydra.yaml ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - data=tulu3-llama3-packed
116
+ - training=tulu3-evabyte-1epoch
117
+ - lm=llama3-2-3b-byte
118
+ - model=mtp
119
+ - adaptor=none
120
+ - mt_head=linear-evabyte
121
+ - circuit=btree
122
+ - circuit.n_token=8
123
+ - circuit.n_component=32
124
+ - circuit.n_repetition=1
125
+ - training.device_batch_size=1
126
+ - model.model.beta=0
127
+ - model.model.gamma=0.9
128
+ - data.val_bin=null
129
+ - training.learning_rate=0.0003
130
+ - training.expname=llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1
131
+ job:
132
+ name: ${name}
133
+ chdir: true
134
+ override_dirname: adaptor=none,circuit.n_component=32,circuit.n_repetition=1,circuit.n_token=8,circuit=btree,data.val_bin=null,data=tulu3-llama3-packed,lm=llama3-2-3b-byte,model.model.beta=0,model.model.gamma=0.9,model=mtp,mt_head=linear-evabyte,training.device_batch_size=1,training.expname=llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1,training.learning_rate=0.0003,training=tulu3-evabyte-1epoch
135
+ id: ???
136
+ num: ???
137
+ config_name: config
138
+ env_set: {}
139
+ env_copy: []
140
+ config:
141
+ override_dirname:
142
+ kv_sep: '='
143
+ item_sep: ','
144
+ exclude_keys: []
145
+ runtime:
146
+ version: 1.3.2
147
+ version_base: '1.3'
148
+ cwd: /disk/scratch/agrivas/nanoGPT
149
+ config_sources:
150
+ - path: hydra.conf
151
+ schema: pkg
152
+ provider: hydra
153
+ - path: /disk/scratch/agrivas/nanoGPT/configs
154
+ schema: file
155
+ provider: main
156
+ - path: ''
157
+ schema: structured
158
+ provider: schema
159
+ output_dir: /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34
160
+ choices:
161
+ generate: default
162
+ data: tulu3-llama3-packed
163
+ lm: llama3-2-3b-byte
164
+ adaptor: none
165
+ mt_head: linear-evabyte
166
+ circuit: btree
167
+ model: mtp
168
+ training: tulu3-evabyte-1epoch
169
+ hydra/env: default
170
+ hydra/callbacks: null
171
+ hydra/job_logging: default
172
+ hydra/hydra_logging: default
173
+ hydra/hydra_help: default
174
+ hydra/help: default
175
+ hydra/sweeper: basic
176
+ hydra/launcher: basic
177
+ hydra/output: default
178
+ verbose: false
.hydra/overrides.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - data=tulu3-llama3-packed
2
+ - training=tulu3-evabyte-1epoch
3
+ - lm=llama3-2-3b-byte
4
+ - model=mtp
5
+ - adaptor=none
6
+ - mt_head=linear-evabyte
7
+ - circuit=btree
8
+ - circuit.n_token=8
9
+ - circuit.n_component=32
10
+ - circuit.n_repetition=1
11
+ - training.device_batch_size=1
12
+ - model.model.beta=0
13
+ - model.model.gamma=0.9
14
+ - data.val_bin=null
15
+ - training.learning_rate=0.0003
16
+ - training.expname=llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1
config.yaml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compile: true
2
+ device: cuda
3
+ from_checkpoint: null
4
+ load_mtp_head_from_model: null
5
+ name: nanogpt
6
+ training:
7
+ random_seed: 13
8
+ batch_size: 256
9
+ device_batch_size: 1
10
+ sequence_length: 8192
11
+ num_iterations: 900
12
+ learning_rate: 0.0003
13
+ use_scheduler: false
14
+ save_model: true
15
+ save_optimizer: true
16
+ save_model_every: 100
17
+ val_loss_every: 100
18
+ val_tokens: 4194304
19
+ expname: llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1
20
+ model:
21
+ name: mtp
22
+ beta: 0.0
23
+ gamma: 1
24
+ kl_algorithm: full
25
+ kl_type: forward
26
+ model:
27
+ _target_: mtp.models.mtp.MultiTokenLM
28
+ lm: ${lm.model}
29
+ circuit: ${circuit.model}
30
+ mt_head_kwargs: ${mt_head.hyperparameters}
31
+ init_from_lm_head: true
32
+ kl_type: ${model.kl_type}
33
+ kl_algorithm: ${model.kl_algorithm}
34
+ beta: 0
35
+ gamma: 0.9
36
+ circuit:
37
+ name: btree
38
+ n_token: 8
39
+ n_component: 32
40
+ n_repetition: 1
41
+ model:
42
+ _target_: mtp.models.circuits.CircuitModel
43
+ vocab_size: 268
44
+ n_token: 8
45
+ n_component: 32
46
+ n_repetition: 1
47
+ kind: btree
48
+ mt_head:
49
+ name: linear-evabyte
50
+ hyperparameters:
51
+ type: evabyte
52
+ n_embd: 3072
53
+ transformer_n_head: 24
54
+ transformer_n_layer: 0
55
+ expander_type: linear
56
+ expander_n_layer: 1
57
+ freeze_vocab_unembedding: false
58
+ share_sum_weights: false
59
+ contextual_hmm_weights: true
60
+ init_hmm_identity: true
61
+ adaptor:
62
+ name: none
63
+ hyperparameters: null
64
+ lm:
65
+ name: llama3-2-3b-byte
66
+ n_embd: 3072
67
+ n_head: 24
68
+ model:
69
+ _target_: mtp.models.lm.LM
70
+ lm: null
71
+ encoder_only: true
72
+ from_checkpoint: null
73
+ from_huggingface: benjamin/Llama3-2-3B-IT-Byte
74
+ adaptor_kwargs: null
75
+ ref_enc: model
76
+ ref_head: lm_head
77
+ freeze: true
78
+ data:
79
+ name: tulu3-llama3
80
+ train_bin: agrv/tulu-v3-sft-llama3-packed-seq-len-8192
81
+ val_bin: null
82
+ vocab_size: 268
83
+ generate:
84
+ speculative: false
85
+ expname: llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1
86
+ wandb_run_id: j8dmy8fe
model@0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3e5b206d0e1714b7a7de472381e46aa3857906c345e8b28588543d9cbb52222
3
+ size 459529775
model@100.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39cc119c551ec1d20a4121d1bb4f5d69254964a902e857c705755c858a5c34e6
3
+ size 1378505919
model@200.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83c17e7e509c748c8746f695c00ca2cdca18b970afbfd141f361224d22058b08
3
+ size 1378505919
model@300.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7234dc3f26fa576a96d80fc432ffd04d292163521be1ccdec18d79bbec8cfdeb
3
+ size 1378505919
model@400.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c32c516042adb0c08905ce13e25da28a2eb2d00ad351e1e87776d003d0057390
3
+ size 1378505919
model@500.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ad295cb91cd532f1dccef90022f38c8836eb0093bb15996b02066d99f39503f
3
+ size 1378505919
model@600.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fec1ac6807b92be707d1751ba7c69e4bdb164bdef960fafebd27cfe7a16476cb
3
+ size 1378505919
model@700.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f11675d16350c4a3e7f5df171057f935fb4fae4e9ae5dcf2583557c2ad2a4857
3
+ size 1378505919
model@800.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfd994882e26b15914176e7ae26a8870c383765c7249c55c6ae387226297048e
3
+ size 1378505919
model@900.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:182fc1b689cf28a6b695f15e729f7e60c55544117836a7c365f61f9925a506a7
3
+ size 1378505919
nanogpt.log ADDED
@@ -0,0 +1,916 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-11-28 15:19:40,898] - Setting up model... compile=True...
2
+ [2025-11-28 15:19:49,942] - Saving config and checkpoints to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34...
3
+ [2025-11-28 15:19:49,942] - Save model: True...
4
+ [2025-11-28 15:19:49,943] - Save optimizer: True...
5
+ [2025-11-28 15:19:49,950] - Training on agrv/tulu-v3-sft-llama3-packed-seq-len-8192...
6
+ [2025-11-28 15:20:29,738] - Setting num_proc from 20 back to 1 for the valid split to disable multiprocessing as it only contains one shard.
7
+ [2025-11-28 15:20:31,881] - step:0/900 Saving model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@0.pt...
8
+ [2025-11-28 15:24:47,442] - step:1/900 train_loss:4.6532 lr:0.0003000000 time/step:254.93s
9
+ [2025-11-28 15:27:51,454] - step:2/900 train_loss:4.2966 lr:0.0003000000 time/step:184.01s
10
+ [2025-11-28 15:30:57,835] - step:3/900 train_loss:3.9828 lr:0.0003000000 time/step:186.37s
11
+ [2025-11-28 15:34:08,017] - step:4/900 train_loss:3.6910 lr:0.0003000000 time/step:190.16s
12
+ [2025-11-28 15:37:15,451] - step:5/900 train_loss:3.4752 lr:0.0003000000 time/step:187.40s
13
+ [2025-11-28 15:40:23,150] - step:6/900 train_loss:3.3047 lr:0.0003000000 time/step:187.69s
14
+ [2025-11-28 15:43:30,606] - step:7/900 train_loss:3.1140 lr:0.0003000000 time/step:187.45s
15
+ [2025-11-28 15:46:34,854] - step:8/900 train_loss:2.9731 lr:0.0003000000 time/step:184.24s
16
+ [2025-11-28 15:49:37,899] - step:9/900 train_loss:2.8709 lr:0.0003000000 time/step:183.04s
17
+ [2025-11-28 15:52:42,161] - step:10/900 train_loss:2.7582 lr:0.0003000000 time/step:184.25s
18
+ [2025-11-28 15:55:49,659] - step:11/900 train_loss:2.6474 lr:0.0003000000 time/step:187.49s
19
+ [2025-11-28 15:58:56,730] - step:12/900 train_loss:2.5890 lr:0.0003000000 time/step:187.06s
20
+ [2025-11-28 16:02:04,893] - step:13/900 train_loss:2.5418 lr:0.0003000000 time/step:188.16s
21
+ [2025-11-28 16:05:11,642] - step:14/900 train_loss:2.4586 lr:0.0003000000 time/step:186.74s
22
+ [2025-11-28 16:08:18,599] - step:15/900 train_loss:2.3908 lr:0.0003000000 time/step:186.94s
23
+ [2025-11-28 16:11:24,210] - step:16/900 train_loss:2.3323 lr:0.0003000000 time/step:185.60s
24
+ [2025-11-28 16:14:28,421] - step:17/900 train_loss:2.2802 lr:0.0003000000 time/step:184.20s
25
+ [2025-11-28 16:17:31,315] - step:18/900 train_loss:2.2268 lr:0.0003000000 time/step:182.88s
26
+ [2025-11-28 16:20:33,212] - step:19/900 train_loss:2.2212 lr:0.0003000000 time/step:181.88s
27
+ [2025-11-28 16:23:39,339] - step:20/900 train_loss:2.1965 lr:0.0003000000 time/step:186.12s
28
+ [2025-11-28 16:26:46,366] - step:21/900 train_loss:2.1549 lr:0.0003000000 time/step:187.01s
29
+ [2025-11-28 16:29:53,452] - step:22/900 train_loss:2.0844 lr:0.0003000000 time/step:187.07s
30
+ [2025-11-28 16:33:01,636] - step:23/900 train_loss:2.0673 lr:0.0003000000 time/step:188.18s
31
+ [2025-11-28 16:36:09,091] - step:24/900 train_loss:2.0375 lr:0.0003000000 time/step:187.44s
32
+ [2025-11-28 16:39:16,657] - step:25/900 train_loss:2.0299 lr:0.0003000000 time/step:187.55s
33
+ [2025-11-28 16:42:25,561] - step:26/900 train_loss:1.9910 lr:0.0003000000 time/step:188.90s
34
+ [2025-11-28 16:45:33,283] - step:27/900 train_loss:1.9708 lr:0.0003000000 time/step:187.68s
35
+ [2025-11-28 16:48:40,284] - step:28/900 train_loss:1.9105 lr:0.0003000000 time/step:186.98s
36
+ [2025-11-28 16:51:47,205] - step:29/900 train_loss:1.9014 lr:0.0003000000 time/step:186.92s
37
+ [2025-11-28 16:54:56,592] - step:30/900 train_loss:1.8643 lr:0.0003000000 time/step:189.38s
38
+ [2025-11-28 16:58:04,452] - step:31/900 train_loss:1.8593 lr:0.0003000000 time/step:187.84s
39
+ [2025-11-28 17:01:11,681] - step:32/900 train_loss:1.8733 lr:0.0003000000 time/step:187.21s
40
+ [2025-11-28 17:04:19,862] - step:33/900 train_loss:1.7975 lr:0.0003000000 time/step:188.17s
41
+ [2025-11-28 17:07:27,610] - step:34/900 train_loss:1.8307 lr:0.0003000000 time/step:187.74s
42
+ [2025-11-28 17:10:35,249] - step:35/900 train_loss:1.8018 lr:0.0003000000 time/step:187.63s
43
+ [2025-11-28 17:13:46,950] - step:36/900 train_loss:1.8066 lr:0.0003000000 time/step:191.69s
44
+ [2025-11-28 17:16:53,853] - step:37/900 train_loss:1.7636 lr:0.0003000000 time/step:186.82s
45
+ [2025-11-28 17:20:00,571] - step:38/900 train_loss:1.7714 lr:0.0003000000 time/step:186.54s
46
+ [2025-11-28 17:23:13,248] - step:39/900 train_loss:1.7096 lr:0.0003000000 time/step:192.65s
47
+ [2025-11-28 17:26:19,575] - step:40/900 train_loss:1.7411 lr:0.0003000000 time/step:186.29s
48
+ [2025-11-28 17:29:25,904] - step:41/900 train_loss:1.6913 lr:0.0003000000 time/step:186.27s
49
+ [2025-11-28 17:32:44,977] - step:42/900 train_loss:1.7001 lr:0.0003000000 time/step:199.05s
50
+ [2025-11-28 17:35:51,243] - step:43/900 train_loss:1.6629 lr:0.0003000000 time/step:186.21s
51
+ [2025-11-28 17:38:57,339] - step:44/900 train_loss:1.6610 lr:0.0003000000 time/step:185.79s
52
+ [2025-11-28 17:42:05,062] - step:45/900 train_loss:1.6524 lr:0.0003000000 time/step:187.68s
53
+ [2025-11-28 17:45:20,648] - step:46/900 train_loss:1.6555 lr:0.0003000000 time/step:195.50s
54
+ [2025-11-28 17:48:26,366] - step:47/900 train_loss:1.6223 lr:0.0003000000 time/step:185.70s
55
+ [2025-11-28 17:51:34,666] - step:48/900 train_loss:1.6481 lr:0.0003000000 time/step:188.12s
56
+ [2025-11-28 17:54:51,245] - step:49/900 train_loss:1.6112 lr:0.0003000000 time/step:196.52s
57
+ [2025-11-28 17:57:57,507] - step:50/900 train_loss:1.6013 lr:0.0003000000 time/step:186.19s
58
+ [2025-11-28 18:01:05,674] - step:51/900 train_loss:1.5772 lr:0.0003000000 time/step:187.99s
59
+ [2025-11-28 18:04:21,278] - step:52/900 train_loss:1.5660 lr:0.0003000000 time/step:195.58s
60
+ [2025-11-28 18:07:27,447] - step:53/900 train_loss:1.5702 lr:0.0003000000 time/step:186.11s
61
+ [2025-11-28 18:10:33,793] - step:54/900 train_loss:1.5665 lr:0.0003000000 time/step:186.26s
62
+ [2025-11-28 18:13:53,962] - step:55/900 train_loss:1.5804 lr:0.0003000000 time/step:200.15s
63
+ [2025-11-28 18:17:00,673] - step:56/900 train_loss:1.5645 lr:0.0003000000 time/step:186.66s
64
+ [2025-11-28 18:20:06,961] - step:57/900 train_loss:1.5609 lr:0.0003000000 time/step:186.23s
65
+ [2025-11-28 18:23:24,919] - step:58/900 train_loss:1.5356 lr:0.0003000000 time/step:197.90s
66
+ [2025-11-28 18:26:31,137] - step:59/900 train_loss:1.5277 lr:0.0003000000 time/step:186.18s
67
+ [2025-11-28 18:29:37,442] - step:60/900 train_loss:1.5330 lr:0.0003000000 time/step:186.22s
68
+ [2025-11-28 18:32:45,572] - step:61/900 train_loss:1.5127 lr:0.0003000000 time/step:188.07s
69
+ [2025-11-28 18:36:01,349] - step:62/900 train_loss:1.5127 lr:0.0003000000 time/step:195.75s
70
+ [2025-11-28 18:39:08,044] - step:63/900 train_loss:1.5255 lr:0.0003000000 time/step:186.63s
71
+ [2025-11-28 18:42:16,514] - step:64/900 train_loss:1.4881 lr:0.0003000000 time/step:188.39s
72
+ [2025-11-28 18:45:32,575] - step:65/900 train_loss:1.4746 lr:0.0003000000 time/step:196.00s
73
+ [2025-11-28 18:48:39,543] - step:66/900 train_loss:1.5017 lr:0.0003000000 time/step:186.89s
74
+ [2025-11-28 18:51:47,768] - step:67/900 train_loss:1.4805 lr:0.0003000000 time/step:188.07s
75
+ [2025-11-28 18:55:03,564] - step:68/900 train_loss:1.4929 lr:0.0003000000 time/step:195.75s
76
+ [2025-11-28 18:58:10,293] - step:69/900 train_loss:1.4550 lr:0.0003000000 time/step:186.67s
77
+ [2025-11-28 19:01:16,800] - step:70/900 train_loss:1.4532 lr:0.0003000000 time/step:186.44s
78
+ [2025-11-28 19:04:26,999] - step:71/900 train_loss:1.4520 lr:0.0003000000 time/step:190.18s
79
+ [2025-11-28 19:07:33,259] - step:72/900 train_loss:1.4301 lr:0.0003000000 time/step:186.22s
80
+ [2025-11-28 19:10:39,471] - step:73/900 train_loss:1.4337 lr:0.0003000000 time/step:186.20s
81
+ [2025-11-28 19:13:47,822] - step:74/900 train_loss:1.4296 lr:0.0003000000 time/step:188.33s
82
+ [2025-11-28 19:16:53,884] - step:75/900 train_loss:1.4294 lr:0.0003000000 time/step:186.04s
83
+ [2025-11-28 19:19:59,845] - step:76/900 train_loss:1.4367 lr:0.0003000000 time/step:185.94s
84
+ [2025-11-28 19:23:05,617] - step:77/900 train_loss:1.4359 lr:0.0003000000 time/step:185.76s
85
+ [2025-11-28 19:26:13,471] - step:78/900 train_loss:1.3907 lr:0.0003000000 time/step:187.84s
86
+ [2025-11-28 19:29:19,325] - step:79/900 train_loss:1.4074 lr:0.0003000000 time/step:185.83s
87
+ [2025-11-28 19:32:24,915] - step:80/900 train_loss:1.3818 lr:0.0003000000 time/step:185.57s
88
+ [2025-11-28 19:35:32,821] - step:81/900 train_loss:1.3966 lr:0.0003000000 time/step:187.89s
89
+ [2025-11-28 19:38:38,468] - step:82/900 train_loss:1.3767 lr:0.0003000000 time/step:185.62s
90
+ [2025-11-28 19:41:44,296] - step:83/900 train_loss:1.3772 lr:0.0003000000 time/step:185.82s
91
+ [2025-11-28 19:44:52,361] - step:84/900 train_loss:1.3639 lr:0.0003000000 time/step:188.06s
92
+ [2025-11-28 19:47:59,370] - step:85/900 train_loss:1.3910 lr:0.0003000000 time/step:186.99s
93
+ [2025-11-28 19:51:05,447] - step:86/900 train_loss:1.4013 lr:0.0003000000 time/step:186.07s
94
+ [2025-11-28 19:54:13,032] - step:87/900 train_loss:1.3883 lr:0.0003000000 time/step:187.58s
95
+ [2025-11-28 19:57:19,138] - step:88/900 train_loss:1.3712 lr:0.0003000000 time/step:186.09s
96
+ [2025-11-28 20:00:25,142] - step:89/900 train_loss:1.3749 lr:0.0003000000 time/step:185.98s
97
+ [2025-11-28 20:03:30,825] - step:90/900 train_loss:1.3630 lr:0.0003000000 time/step:185.67s
98
+ [2025-11-28 20:06:38,585] - step:91/900 train_loss:1.3713 lr:0.0003000000 time/step:187.75s
99
+ [2025-11-28 20:09:44,867] - step:92/900 train_loss:1.3503 lr:0.0003000000 time/step:186.27s
100
+ [2025-11-28 20:12:50,830] - step:93/900 train_loss:1.3537 lr:0.0003000000 time/step:185.94s
101
+ [2025-11-28 20:15:58,624] - step:94/900 train_loss:1.3468 lr:0.0003000000 time/step:187.79s
102
+ [2025-11-28 20:19:04,543] - step:95/900 train_loss:1.3603 lr:0.0003000000 time/step:185.91s
103
+ [2025-11-28 20:22:10,848] - step:96/900 train_loss:1.3216 lr:0.0003000000 time/step:186.29s
104
+ [2025-11-28 20:25:17,756] - step:97/900 train_loss:1.3276 lr:0.0003000000 time/step:186.90s
105
+ [2025-11-28 20:28:22,895] - step:98/900 train_loss:1.3128 lr:0.0003000000 time/step:185.09s
106
+ [2025-11-28 20:31:28,093] - step:99/900 train_loss:1.3014 lr:0.0003000000 time/step:185.13s
107
+ [2025-11-28 20:34:37,788] - step:100/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@100.pt...
108
+ [2025-11-28 20:34:37,813] - step:100/900 train_loss:1.3411 lr:0.0003000000 time/step:187.79s
109
+ [2025-11-28 20:37:43,371] - step:101/900 train_loss:1.3414 lr:0.0003000000 time/step:185.55s
110
+ [2025-11-28 20:40:49,246] - step:102/900 train_loss:1.3098 lr:0.0003000000 time/step:185.84s
111
+ [2025-11-28 20:43:55,147] - step:103/900 train_loss:1.3077 lr:0.0003000000 time/step:185.90s
112
+ [2025-11-28 20:47:03,589] - step:104/900 train_loss:1.3283 lr:0.0003000000 time/step:188.43s
113
+ [2025-11-28 20:50:09,456] - step:105/900 train_loss:1.3107 lr:0.0003000000 time/step:185.85s
114
+ [2025-11-28 20:53:15,133] - step:106/900 train_loss:1.3116 lr:0.0003000000 time/step:185.65s
115
+ [2025-11-28 20:56:23,079] - step:107/900 train_loss:1.3076 lr:0.0003000000 time/step:187.94s
116
+ [2025-11-28 20:59:29,309] - step:108/900 train_loss:1.2576 lr:0.0003000000 time/step:186.19s
117
+ [2025-11-28 21:02:34,096] - step:109/900 train_loss:1.3163 lr:0.0003000000 time/step:184.77s
118
+ [2025-11-28 21:05:38,534] - step:110/900 train_loss:1.2836 lr:0.0003000000 time/step:184.43s
119
+ [2025-11-28 21:08:41,909] - step:111/900 train_loss:1.2887 lr:0.0003000000 time/step:183.34s
120
+ [2025-11-28 21:11:44,652] - step:112/900 train_loss:1.2900 lr:0.0003000000 time/step:182.72s
121
+ [2025-11-28 21:14:49,050] - step:113/900 train_loss:1.3032 lr:0.0003000000 time/step:184.39s
122
+ [2025-11-28 21:17:51,714] - step:114/900 train_loss:1.2715 lr:0.0003000000 time/step:182.65s
123
+ [2025-11-28 21:20:54,366] - step:115/900 train_loss:1.2553 lr:0.0003000000 time/step:182.64s
124
+ [2025-11-28 21:23:58,585] - step:116/900 train_loss:1.2608 lr:0.0003000000 time/step:184.21s
125
+ [2025-11-28 21:27:05,711] - step:117/900 train_loss:1.2750 lr:0.0003000000 time/step:187.12s
126
+ [2025-11-28 21:30:10,632] - step:118/900 train_loss:1.2610 lr:0.0003000000 time/step:184.91s
127
+ [2025-11-28 21:33:15,980] - step:119/900 train_loss:1.2728 lr:0.0003000000 time/step:185.32s
128
+ [2025-11-28 21:36:22,993] - step:120/900 train_loss:1.2367 lr:0.0003000000 time/step:187.01s
129
+ [2025-11-28 21:39:27,798] - step:121/900 train_loss:1.2436 lr:0.0003000000 time/step:184.79s
130
+ [2025-11-28 21:42:32,716] - step:122/900 train_loss:1.2680 lr:0.0003000000 time/step:184.90s
131
+ [2025-11-28 21:45:39,837] - step:123/900 train_loss:1.2459 lr:0.0003000000 time/step:187.11s
132
+ [2025-11-28 21:48:44,604] - step:124/900 train_loss:1.2356 lr:0.0003000000 time/step:184.76s
133
+ [2025-11-28 21:51:49,462] - step:125/900 train_loss:1.2116 lr:0.0003000000 time/step:184.84s
134
+ [2025-11-28 21:54:56,152] - step:126/900 train_loss:1.2271 lr:0.0003000000 time/step:186.68s
135
+ [2025-11-28 21:58:02,698] - step:127/900 train_loss:1.2747 lr:0.0003000000 time/step:186.53s
136
+ [2025-11-28 22:01:07,919] - step:128/900 train_loss:1.2662 lr:0.0003000000 time/step:185.21s
137
+ [2025-11-28 22:04:13,473] - step:129/900 train_loss:1.2508 lr:0.0003000000 time/step:185.54s
138
+ [2025-11-28 22:07:19,897] - step:130/900 train_loss:1.2417 lr:0.0003000000 time/step:186.41s
139
+ [2025-11-28 22:10:24,163] - step:131/900 train_loss:1.2469 lr:0.0003000000 time/step:184.26s
140
+ [2025-11-28 22:13:29,588] - step:132/900 train_loss:1.2212 lr:0.0003000000 time/step:185.42s
141
+ [2025-11-28 22:16:36,724] - step:133/900 train_loss:1.2154 lr:0.0003000000 time/step:187.11s
142
+ [2025-11-28 22:19:41,361] - step:134/900 train_loss:1.1905 lr:0.0003000000 time/step:184.62s
143
+ [2025-11-28 22:22:46,426] - step:135/900 train_loss:1.2090 lr:0.0003000000 time/step:185.04s
144
+ [2025-11-28 22:25:53,482] - step:136/900 train_loss:1.2180 lr:0.0003000000 time/step:187.04s
145
+ [2025-11-28 22:28:58,396] - step:137/900 train_loss:1.2309 lr:0.0003000000 time/step:184.90s
146
+ [2025-11-28 22:32:02,953] - step:138/900 train_loss:1.2127 lr:0.0003000000 time/step:184.53s
147
+ [2025-11-28 22:35:08,685] - step:139/900 train_loss:1.2126 lr:0.0003000000 time/step:185.71s
148
+ [2025-11-28 22:38:15,825] - step:140/900 train_loss:1.2117 lr:0.0003000000 time/step:187.09s
149
+ [2025-11-28 22:41:20,366] - step:141/900 train_loss:1.2301 lr:0.0003000000 time/step:184.53s
150
+ [2025-11-28 22:44:24,896] - step:142/900 train_loss:1.2388 lr:0.0003000000 time/step:184.52s
151
+ [2025-11-28 22:47:31,624] - step:143/900 train_loss:1.1987 lr:0.0003000000 time/step:186.71s
152
+ [2025-11-28 22:50:37,358] - step:144/900 train_loss:1.2210 lr:0.0003000000 time/step:185.73s
153
+ [2025-11-28 22:53:43,613] - step:145/900 train_loss:1.2170 lr:0.0003000000 time/step:186.22s
154
+ [2025-11-28 22:57:06,629] - step:146/900 train_loss:1.2236 lr:0.0003000000 time/step:203.01s
155
+ [2025-11-28 23:00:09,814] - step:147/900 train_loss:1.2255 lr:0.0003000000 time/step:183.18s
156
+ [2025-11-28 23:03:14,149] - step:148/900 train_loss:1.1806 lr:0.0003000000 time/step:184.31s
157
+ [2025-11-28 23:06:23,397] - step:149/900 train_loss:1.2233 lr:0.0003000000 time/step:189.23s
158
+ [2025-11-28 23:09:30,162] - step:150/900 train_loss:1.1677 lr:0.0003000000 time/step:186.75s
159
+ [2025-11-28 23:12:34,786] - step:151/900 train_loss:1.2155 lr:0.0003000000 time/step:184.59s
160
+ [2025-11-28 23:15:41,431] - step:152/900 train_loss:1.1948 lr:0.0003000000 time/step:186.63s
161
+ [2025-11-28 23:18:47,806] - step:153/900 train_loss:1.1950 lr:0.0003000000 time/step:186.35s
162
+ [2025-11-28 23:21:52,115] - step:154/900 train_loss:1.2133 lr:0.0003000000 time/step:184.28s
163
+ [2025-11-28 23:24:56,981] - step:155/900 train_loss:1.1862 lr:0.0003000000 time/step:184.85s
164
+ [2025-11-28 23:28:03,290] - step:156/900 train_loss:1.1699 lr:0.0003000000 time/step:186.29s
165
+ [2025-11-28 23:31:07,306] - step:157/900 train_loss:1.1773 lr:0.0003000000 time/step:184.00s
166
+ [2025-11-28 23:34:12,414] - step:158/900 train_loss:1.1680 lr:0.0003000000 time/step:185.10s
167
+ [2025-11-28 23:37:19,900] - step:159/900 train_loss:1.1806 lr:0.0003000000 time/step:187.45s
168
+ [2025-11-28 23:40:24,615] - step:160/900 train_loss:1.1865 lr:0.0003000000 time/step:184.70s
169
+ [2025-11-28 23:43:29,245] - step:161/900 train_loss:1.1872 lr:0.0003000000 time/step:184.61s
170
+ [2025-11-28 23:46:36,811] - step:162/900 train_loss:1.1806 lr:0.0003000000 time/step:187.56s
171
+ [2025-11-28 23:49:41,637] - step:163/900 train_loss:1.1750 lr:0.0003000000 time/step:184.79s
172
+ [2025-11-28 23:52:45,829] - step:164/900 train_loss:1.1828 lr:0.0003000000 time/step:184.16s
173
+ [2025-11-28 23:55:50,721] - step:165/900 train_loss:1.1742 lr:0.0003000000 time/step:184.88s
174
+ [2025-11-28 23:58:57,667] - step:166/900 train_loss:1.1655 lr:0.0003000000 time/step:186.93s
175
+ [2025-11-29 00:02:02,656] - step:167/900 train_loss:1.1631 lr:0.0003000000 time/step:184.97s
176
+ [2025-11-29 00:05:08,306] - step:168/900 train_loss:1.1614 lr:0.0003000000 time/step:185.63s
177
+ [2025-11-29 00:08:15,208] - step:169/900 train_loss:1.1613 lr:0.0003000000 time/step:186.89s
178
+ [2025-11-29 00:11:19,829] - step:170/900 train_loss:1.1623 lr:0.0003000000 time/step:184.60s
179
+ [2025-11-29 00:14:25,137] - step:171/900 train_loss:1.1538 lr:0.0003000000 time/step:185.30s
180
+ [2025-11-29 00:17:32,364] - step:172/900 train_loss:1.1782 lr:0.0003000000 time/step:187.22s
181
+ [2025-11-29 00:20:37,216] - step:173/900 train_loss:1.1596 lr:0.0003000000 time/step:184.84s
182
+ [2025-11-29 00:23:42,361] - step:174/900 train_loss:1.1381 lr:0.0003000000 time/step:185.12s
183
+ [2025-11-29 00:26:49,327] - step:175/900 train_loss:1.1305 lr:0.0003000000 time/step:186.96s
184
+ [2025-11-29 00:29:54,460] - step:176/900 train_loss:1.1603 lr:0.0003000000 time/step:185.12s
185
+ [2025-11-29 00:32:59,491] - step:177/900 train_loss:1.1435 lr:0.0003000000 time/step:185.01s
186
+ [2025-11-29 00:36:04,756] - step:178/900 train_loss:1.1653 lr:0.0003000000 time/step:185.25s
187
+ [2025-11-29 00:39:11,804] - step:179/900 train_loss:1.1443 lr:0.0003000000 time/step:187.04s
188
+ [2025-11-29 00:42:16,834] - step:180/900 train_loss:1.1554 lr:0.0003000000 time/step:185.01s
189
+ [2025-11-29 00:45:22,795] - step:181/900 train_loss:1.1495 lr:0.0003000000 time/step:185.95s
190
+ [2025-11-29 00:48:30,739] - step:182/900 train_loss:1.1251 lr:0.0003000000 time/step:187.94s
191
+ [2025-11-29 00:51:34,795] - step:183/900 train_loss:1.1323 lr:0.0003000000 time/step:184.04s
192
+ [2025-11-29 00:54:39,599] - step:184/900 train_loss:1.1293 lr:0.0003000000 time/step:184.80s
193
+ [2025-11-29 00:57:45,600] - step:185/900 train_loss:1.1500 lr:0.0003000000 time/step:185.99s
194
+ [2025-11-29 01:00:49,413] - step:186/900 train_loss:1.1429 lr:0.0003000000 time/step:183.79s
195
+ [2025-11-29 01:03:54,362] - step:187/900 train_loss:1.1384 lr:0.0003000000 time/step:184.93s
196
+ [2025-11-29 01:07:01,673] - step:188/900 train_loss:1.1665 lr:0.0003000000 time/step:187.31s
197
+ [2025-11-29 01:10:06,793] - step:189/900 train_loss:1.1470 lr:0.0003000000 time/step:185.10s
198
+ [2025-11-29 01:13:11,822] - step:190/900 train_loss:1.1562 lr:0.0003000000 time/step:185.00s
199
+ [2025-11-29 01:16:16,209] - step:191/900 train_loss:1.1811 lr:0.0003000000 time/step:184.37s
200
+ [2025-11-29 01:19:22,340] - step:192/900 train_loss:1.1471 lr:0.0003000000 time/step:186.13s
201
+ [2025-11-29 01:22:26,519] - step:193/900 train_loss:1.1428 lr:0.0003000000 time/step:184.15s
202
+ [2025-11-29 01:25:31,429] - step:194/900 train_loss:1.1208 lr:0.0003000000 time/step:184.89s
203
+ [2025-11-29 01:28:36,974] - step:195/900 train_loss:1.1308 lr:0.0003000000 time/step:185.54s
204
+ [2025-11-29 01:31:40,544] - step:196/900 train_loss:1.1228 lr:0.0003000000 time/step:183.54s
205
+ [2025-11-29 01:34:45,938] - step:197/900 train_loss:1.1161 lr:0.0003000000 time/step:185.38s
206
+ [2025-11-29 01:37:53,156] - step:198/900 train_loss:1.1478 lr:0.0003000000 time/step:187.21s
207
+ [2025-11-29 01:40:58,171] - step:199/900 train_loss:1.1103 lr:0.0003000000 time/step:184.99s
208
+ [2025-11-29 01:44:05,489] - step:200/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@200.pt...
209
+ [2025-11-29 01:44:05,525] - step:200/900 train_loss:1.1274 lr:0.0003000000 time/step:185.55s
210
+ [2025-11-29 01:47:14,488] - step:201/900 train_loss:1.1234 lr:0.0003000000 time/step:188.94s
211
+ [2025-11-29 01:50:21,411] - step:202/900 train_loss:1.1199 lr:0.0003000000 time/step:186.91s
212
+ [2025-11-29 01:53:26,628] - step:203/900 train_loss:1.0972 lr:0.0003000000 time/step:185.20s
213
+ [2025-11-29 01:56:32,570] - step:204/900 train_loss:1.1371 lr:0.0003000000 time/step:185.92s
214
+ [2025-11-29 01:59:41,140] - step:205/900 train_loss:1.1408 lr:0.0003000000 time/step:188.56s
215
+ [2025-11-29 02:02:45,379] - step:206/900 train_loss:1.0997 lr:0.0003000000 time/step:184.22s
216
+ [2025-11-29 02:05:50,066] - step:207/900 train_loss:1.1332 lr:0.0003000000 time/step:184.67s
217
+ [2025-11-29 02:08:56,311] - step:208/900 train_loss:1.1209 lr:0.0003000000 time/step:186.24s
218
+ [2025-11-29 02:12:00,567] - step:209/900 train_loss:1.0919 lr:0.0003000000 time/step:184.22s
219
+ [2025-11-29 02:15:04,792] - step:210/900 train_loss:1.1005 lr:0.0003000000 time/step:184.22s
220
+ [2025-11-29 02:18:10,654] - step:211/900 train_loss:1.1036 lr:0.0003000000 time/step:185.86s
221
+ [2025-11-29 02:21:14,585] - step:212/900 train_loss:1.1229 lr:0.0003000000 time/step:183.92s
222
+ [2025-11-29 02:24:19,368] - step:213/900 train_loss:1.1051 lr:0.0003000000 time/step:184.77s
223
+ [2025-11-29 02:27:26,145] - step:214/900 train_loss:1.1085 lr:0.0003000000 time/step:186.77s
224
+ [2025-11-29 02:30:30,712] - step:215/900 train_loss:1.0930 lr:0.0003000000 time/step:184.56s
225
+ [2025-11-29 02:33:34,774] - step:216/900 train_loss:1.0977 lr:0.0003000000 time/step:184.05s
226
+ [2025-11-29 02:36:40,292] - step:217/900 train_loss:1.1187 lr:0.0003000000 time/step:185.51s
227
+ [2025-11-29 02:39:49,043] - step:218/900 train_loss:1.0909 lr:0.0003000000 time/step:188.73s
228
+ [2025-11-29 02:42:54,991] - step:219/900 train_loss:1.1056 lr:0.0003000000 time/step:185.90s
229
+ [2025-11-29 02:46:00,394] - step:220/900 train_loss:1.1048 lr:0.0003000000 time/step:185.40s
230
+ [2025-11-29 02:49:07,579] - step:221/900 train_loss:1.1078 lr:0.0003000000 time/step:187.17s
231
+ [2025-11-29 02:52:12,146] - step:222/900 train_loss:1.1114 lr:0.0003000000 time/step:184.54s
232
+ [2025-11-29 02:55:16,480] - step:223/900 train_loss:1.1062 lr:0.0003000000 time/step:184.32s
233
+ [2025-11-29 02:58:22,768] - step:224/900 train_loss:1.1142 lr:0.0003000000 time/step:186.28s
234
+ [2025-11-29 03:01:26,953] - step:225/900 train_loss:1.0961 lr:0.0003000000 time/step:184.17s
235
+ [2025-11-29 03:04:31,749] - step:226/900 train_loss:1.0917 lr:0.0003000000 time/step:184.78s
236
+ [2025-11-29 03:07:38,529] - step:227/900 train_loss:1.0934 lr:0.0003000000 time/step:186.77s
237
+ [2025-11-29 03:10:43,271] - step:228/900 train_loss:1.1069 lr:0.0003000000 time/step:184.70s
238
+ [2025-11-29 03:13:48,167] - step:229/900 train_loss:1.0734 lr:0.0003000000 time/step:184.88s
239
+ [2025-11-29 03:16:52,812] - step:230/900 train_loss:1.0957 lr:0.0003000000 time/step:184.63s
240
+ [2025-11-29 03:19:58,801] - step:231/900 train_loss:1.0775 lr:0.0003000000 time/step:185.98s
241
+ [2025-11-29 03:23:02,987] - step:232/900 train_loss:1.0926 lr:0.0003000000 time/step:184.16s
242
+ [2025-11-29 03:26:08,180] - step:233/900 train_loss:1.1314 lr:0.0003000000 time/step:185.19s
243
+ [2025-11-29 03:29:14,462] - step:234/900 train_loss:1.0868 lr:0.0003000000 time/step:186.28s
244
+ [2025-11-29 03:32:19,081] - step:235/900 train_loss:1.0808 lr:0.0003000000 time/step:184.59s
245
+ [2025-11-29 03:35:24,243] - step:236/900 train_loss:1.0749 lr:0.0003000000 time/step:185.16s
246
+ [2025-11-29 03:38:31,254] - step:237/900 train_loss:1.1269 lr:0.0003000000 time/step:187.01s
247
+ [2025-11-29 03:41:35,966] - step:238/900 train_loss:1.0924 lr:0.0003000000 time/step:184.69s
248
+ [2025-11-29 03:44:41,260] - step:239/900 train_loss:1.0906 lr:0.0003000000 time/step:185.27s
249
+ [2025-11-29 03:47:49,206] - step:240/900 train_loss:1.0918 lr:0.0003000000 time/step:187.94s
250
+ [2025-11-29 03:50:54,694] - step:241/900 train_loss:1.0946 lr:0.0003000000 time/step:185.46s
251
+ [2025-11-29 03:53:59,535] - step:242/900 train_loss:1.1074 lr:0.0003000000 time/step:184.80s
252
+ [2025-11-29 03:57:04,220] - step:243/900 train_loss:1.0943 lr:0.0003000000 time/step:184.67s
253
+ [2025-11-29 04:00:10,432] - step:244/900 train_loss:1.0711 lr:0.0003000000 time/step:186.21s
254
+ [2025-11-29 04:03:15,729] - step:245/900 train_loss:1.1061 lr:0.0003000000 time/step:185.26s
255
+ [2025-11-29 04:06:20,984] - step:246/900 train_loss:1.0789 lr:0.0003000000 time/step:185.24s
256
+ [2025-11-29 04:09:27,749] - step:247/900 train_loss:1.0778 lr:0.0003000000 time/step:186.76s
257
+ [2025-11-29 04:12:33,149] - step:248/900 train_loss:1.0830 lr:0.0003000000 time/step:185.36s
258
+ [2025-11-29 04:15:37,995] - step:249/900 train_loss:1.0921 lr:0.0003000000 time/step:184.84s
259
+ [2025-11-29 04:18:44,391] - step:250/900 train_loss:1.0980 lr:0.0003000000 time/step:186.39s
260
+ [2025-11-29 04:21:49,000] - step:251/900 train_loss:1.0761 lr:0.0003000000 time/step:184.59s
261
+ [2025-11-29 04:24:54,274] - step:252/900 train_loss:1.0901 lr:0.0003000000 time/step:185.25s
262
+ [2025-11-29 04:28:02,058] - step:253/900 train_loss:1.0735 lr:0.0003000000 time/step:187.78s
263
+ [2025-11-29 04:31:09,878] - step:254/900 train_loss:1.0600 lr:0.0003000000 time/step:187.80s
264
+ [2025-11-29 04:34:17,142] - step:255/900 train_loss:1.0544 lr:0.0003000000 time/step:187.23s
265
+ [2025-11-29 04:37:23,181] - step:256/900 train_loss:1.0961 lr:0.0003000000 time/step:186.03s
266
+ [2025-11-29 04:40:31,175] - step:257/900 train_loss:1.0838 lr:0.0003000000 time/step:187.99s
267
+ [2025-11-29 04:43:37,155] - step:258/900 train_loss:1.1142 lr:0.0003000000 time/step:185.74s
268
+ [2025-11-29 04:46:41,531] - step:259/900 train_loss:1.0784 lr:0.0003000000 time/step:184.36s
269
+ [2025-11-29 04:49:47,139] - step:260/900 train_loss:1.0548 lr:0.0003000000 time/step:185.61s
270
+ [2025-11-29 04:52:51,373] - step:261/900 train_loss:1.0670 lr:0.0003000000 time/step:184.18s
271
+ [2025-11-29 04:55:56,540] - step:262/900 train_loss:1.0790 lr:0.0003000000 time/step:185.16s
272
+ [2025-11-29 04:59:03,662] - step:263/900 train_loss:1.0758 lr:0.0003000000 time/step:187.12s
273
+ [2025-11-29 05:02:08,811] - step:264/900 train_loss:1.0945 lr:0.0003000000 time/step:185.14s
274
+ [2025-11-29 05:05:13,852] - step:265/900 train_loss:1.0733 lr:0.0003000000 time/step:185.03s
275
+ [2025-11-29 05:08:20,825] - step:266/900 train_loss:1.0854 lr:0.0003000000 time/step:186.97s
276
+ [2025-11-29 05:11:25,639] - step:267/900 train_loss:1.0816 lr:0.0003000000 time/step:184.80s
277
+ [2025-11-29 05:14:31,022] - step:268/900 train_loss:1.0670 lr:0.0003000000 time/step:185.35s
278
+ [2025-11-29 05:17:35,585] - step:269/900 train_loss:1.0892 lr:0.0003000000 time/step:184.33s
279
+ [2025-11-29 05:20:42,015] - step:270/900 train_loss:1.0245 lr:0.0003000000 time/step:186.43s
280
+ [2025-11-29 05:23:46,422] - step:271/900 train_loss:1.0735 lr:0.0003000000 time/step:184.37s
281
+ [2025-11-29 05:26:50,452] - step:272/900 train_loss:1.0714 lr:0.0003000000 time/step:184.01s
282
+ [2025-11-29 05:29:56,149] - step:273/900 train_loss:1.0769 lr:0.0003000000 time/step:185.68s
283
+ [2025-11-29 05:32:59,582] - step:274/900 train_loss:1.0265 lr:0.0003000000 time/step:183.40s
284
+ [2025-11-29 05:36:04,909] - step:275/900 train_loss:1.0510 lr:0.0003000000 time/step:185.31s
285
+ [2025-11-29 05:39:12,005] - step:276/900 train_loss:1.0753 lr:0.0003000000 time/step:187.07s
286
+ [2025-11-29 05:42:16,993] - step:277/900 train_loss:1.0582 lr:0.0003000000 time/step:184.93s
287
+ [2025-11-29 05:45:22,003] - step:278/900 train_loss:1.0717 lr:0.0003000000 time/step:185.00s
288
+ [2025-11-29 05:48:28,179] - step:279/900 train_loss:1.0676 lr:0.0003000000 time/step:186.16s
289
+ [2025-11-29 05:51:33,621] - step:280/900 train_loss:1.0595 lr:0.0003000000 time/step:185.43s
290
+ [2025-11-29 05:54:38,325] - step:281/900 train_loss:1.0585 lr:0.0003000000 time/step:184.68s
291
+ [2025-11-29 05:57:43,757] - step:282/900 train_loss:1.0949 lr:0.0003000000 time/step:185.43s
292
+ [2025-11-29 06:00:50,769] - step:283/900 train_loss:1.0682 lr:0.0003000000 time/step:187.01s
293
+ [2025-11-29 06:03:55,483] - step:284/900 train_loss:1.0756 lr:0.0003000000 time/step:184.69s
294
+ [2025-11-29 06:07:00,263] - step:285/900 train_loss:1.0693 lr:0.0003000000 time/step:184.77s
295
+ [2025-11-29 06:10:07,073] - step:286/900 train_loss:1.0734 lr:0.0003000000 time/step:186.81s
296
+ [2025-11-29 06:13:12,527] - step:287/900 train_loss:1.0729 lr:0.0003000000 time/step:185.42s
297
+ [2025-11-29 06:16:17,678] - step:288/900 train_loss:1.0483 lr:0.0003000000 time/step:185.12s
298
+ [2025-11-29 06:19:24,289] - step:289/900 train_loss:1.0590 lr:0.0003000000 time/step:186.60s
299
+ [2025-11-29 06:22:30,122] - step:290/900 train_loss:1.0687 lr:0.0003000000 time/step:185.81s
300
+ [2025-11-29 06:25:35,642] - step:291/900 train_loss:1.0612 lr:0.0003000000 time/step:185.50s
301
+ [2025-11-29 06:28:42,491] - step:292/900 train_loss:1.0357 lr:0.0003000000 time/step:186.85s
302
+ [2025-11-29 06:31:49,725] - step:293/900 train_loss:1.0708 lr:0.0003000000 time/step:187.22s
303
+ [2025-11-29 06:34:55,796] - step:294/900 train_loss:1.0707 lr:0.0003000000 time/step:186.05s
304
+ [2025-11-29 06:38:00,778] - step:295/900 train_loss:1.0776 lr:0.0003000000 time/step:184.98s
305
+ [2025-11-29 06:41:07,189] - step:296/900 train_loss:1.0576 lr:0.0003000000 time/step:186.41s
306
+ [2025-11-29 06:44:11,733] - step:297/900 train_loss:1.0260 lr:0.0003000000 time/step:184.49s
307
+ [2025-11-29 06:47:15,871] - step:298/900 train_loss:1.0749 lr:0.0003000000 time/step:184.12s
308
+ [2025-11-29 06:50:21,808] - step:299/900 train_loss:1.0567 lr:0.0003000000 time/step:185.93s
309
+ [2025-11-29 06:53:27,979] - step:300/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@300.pt...
310
+ [2025-11-29 06:53:27,992] - step:300/900 train_loss:1.0667 lr:0.0003000000 time/step:184.41s
311
+ [2025-11-29 06:56:33,223] - step:301/900 train_loss:1.0596 lr:0.0003000000 time/step:185.22s
312
+ [2025-11-29 06:59:39,570] - step:302/900 train_loss:1.0485 lr:0.0003000000 time/step:186.34s
313
+ [2025-11-29 07:02:44,495] - step:303/900 train_loss:1.0444 lr:0.0003000000 time/step:184.92s
314
+ [2025-11-29 07:05:48,939] - step:304/900 train_loss:1.0787 lr:0.0003000000 time/step:184.42s
315
+ [2025-11-29 07:08:55,065] - step:305/900 train_loss:1.0548 lr:0.0003000000 time/step:186.12s
316
+ [2025-11-29 07:12:03,934] - step:306/900 train_loss:1.0604 lr:0.0003000000 time/step:188.86s
317
+ [2025-11-29 07:15:10,343] - step:307/900 train_loss:1.0368 lr:0.0003000000 time/step:186.37s
318
+ [2025-11-29 07:18:15,509] - step:308/900 train_loss:1.0500 lr:0.0003000000 time/step:185.15s
319
+ [2025-11-29 07:21:22,280] - step:309/900 train_loss:1.0519 lr:0.0003000000 time/step:186.76s
320
+ [2025-11-29 07:24:27,690] - step:310/900 train_loss:1.0396 lr:0.0003000000 time/step:185.39s
321
+ [2025-11-29 07:27:32,558] - step:311/900 train_loss:1.0199 lr:0.0003000000 time/step:184.86s
322
+ [2025-11-29 07:30:39,398] - step:312/900 train_loss:1.0318 lr:0.0003000000 time/step:186.83s
323
+ [2025-11-29 07:33:43,716] - step:313/900 train_loss:1.0245 lr:0.0003000000 time/step:184.27s
324
+ [2025-11-29 07:36:48,934] - step:314/900 train_loss:1.0550 lr:0.0003000000 time/step:185.21s
325
+ [2025-11-29 07:39:55,818] - step:315/900 train_loss:1.0384 lr:0.0003000000 time/step:186.88s
326
+ [2025-11-29 07:43:00,630] - step:316/900 train_loss:1.0352 lr:0.0003000000 time/step:184.79s
327
+ [2025-11-29 07:46:05,765] - step:317/900 train_loss:1.0406 lr:0.0003000000 time/step:185.12s
328
+ [2025-11-29 07:49:10,421] - step:318/900 train_loss:1.0438 lr:0.0003000000 time/step:184.64s
329
+ [2025-11-29 07:52:16,687] - step:319/900 train_loss:1.0463 lr:0.0003000000 time/step:186.26s
330
+ [2025-11-29 07:55:21,390] - step:320/900 train_loss:1.0608 lr:0.0003000000 time/step:184.68s
331
+ [2025-11-29 07:58:26,121] - step:321/900 train_loss:1.0704 lr:0.0003000000 time/step:184.70s
332
+ [2025-11-29 08:01:33,062] - step:322/900 train_loss:1.0459 lr:0.0003000000 time/step:186.94s
333
+ [2025-11-29 08:04:39,120] - step:323/900 train_loss:1.0463 lr:0.0003000000 time/step:185.86s
334
+ [2025-11-29 08:07:44,864] - step:324/900 train_loss:1.0497 lr:0.0003000000 time/step:185.73s
335
+ [2025-11-29 08:10:51,704] - step:325/900 train_loss:1.0295 lr:0.0003000000 time/step:186.82s
336
+ [2025-11-29 08:13:56,466] - step:326/900 train_loss:1.0555 lr:0.0003000000 time/step:184.73s
337
+ [2025-11-29 08:17:01,957] - step:327/900 train_loss:1.0380 lr:0.0003000000 time/step:185.49s
338
+ [2025-11-29 08:20:08,723] - step:328/900 train_loss:1.0256 lr:0.0003000000 time/step:186.75s
339
+ [2025-11-29 08:23:14,378] - step:329/900 train_loss:1.0418 lr:0.0003000000 time/step:185.64s
340
+ [2025-11-29 08:26:20,417] - step:330/900 train_loss:1.0660 lr:0.0003000000 time/step:186.01s
341
+ [2025-11-29 08:29:26,557] - step:331/900 train_loss:1.0481 lr:0.0003000000 time/step:186.12s
342
+ [2025-11-29 08:32:43,853] - step:332/900 train_loss:1.0370 lr:0.0003000000 time/step:197.25s
343
+ [2025-11-29 08:35:47,902] - step:333/900 train_loss:1.0556 lr:0.0003000000 time/step:184.02s
344
+ [2025-11-29 08:38:52,834] - step:334/900 train_loss:1.0512 lr:0.0003000000 time/step:184.93s
345
+ [2025-11-29 08:41:58,814] - step:335/900 train_loss:1.0432 lr:0.0003000000 time/step:185.95s
346
+ [2025-11-29 08:45:02,340] - step:336/900 train_loss:1.0165 lr:0.0003000000 time/step:183.51s
347
+ [2025-11-29 08:48:06,475] - step:337/900 train_loss:1.0600 lr:0.0003000000 time/step:184.12s
348
+ [2025-11-29 08:51:14,075] - step:338/900 train_loss:1.0304 lr:0.0003000000 time/step:187.60s
349
+ [2025-11-29 08:54:18,478] - step:339/900 train_loss:1.0187 lr:0.0003000000 time/step:184.37s
350
+ [2025-11-29 08:57:23,665] - step:340/900 train_loss:1.0326 lr:0.0003000000 time/step:185.18s
351
+ [2025-11-29 09:00:30,230] - step:341/900 train_loss:1.0415 lr:0.0003000000 time/step:186.56s
352
+ [2025-11-29 09:03:35,007] - step:342/900 train_loss:1.0413 lr:0.0003000000 time/step:184.75s
353
+ [2025-11-29 09:06:39,120] - step:343/900 train_loss:1.0377 lr:0.0003000000 time/step:184.10s
354
+ [2025-11-29 09:09:43,682] - step:344/900 train_loss:1.0266 lr:0.0003000000 time/step:184.56s
355
+ [2025-11-29 09:12:50,738] - step:345/900 train_loss:1.0305 lr:0.0003000000 time/step:187.04s
356
+ [2025-11-29 09:15:54,975] - step:346/900 train_loss:1.0238 lr:0.0003000000 time/step:184.22s
357
+ [2025-11-29 09:18:59,184] - step:347/900 train_loss:1.0470 lr:0.0003000000 time/step:184.20s
358
+ [2025-11-29 09:22:05,583] - step:348/900 train_loss:1.0343 lr:0.0003000000 time/step:186.39s
359
+ [2025-11-29 09:25:09,502] - step:349/900 train_loss:1.0429 lr:0.0003000000 time/step:183.90s
360
+ [2025-11-29 09:28:14,785] - step:350/900 train_loss:1.0173 lr:0.0003000000 time/step:185.28s
361
+ [2025-11-29 09:31:22,664] - step:351/900 train_loss:1.0260 lr:0.0003000000 time/step:187.87s
362
+ [2025-11-29 09:34:27,994] - step:352/900 train_loss:1.0412 lr:0.0003000000 time/step:185.27s
363
+ [2025-11-29 09:37:33,386] - step:353/900 train_loss:1.0051 lr:0.0003000000 time/step:185.37s
364
+ [2025-11-29 09:40:39,936] - step:354/900 train_loss:1.0386 lr:0.0003000000 time/step:186.55s
365
+ [2025-11-29 09:43:45,796] - step:355/900 train_loss:1.0317 lr:0.0003000000 time/step:185.85s
366
+ [2025-11-29 09:46:51,082] - step:356/900 train_loss:1.0060 lr:0.0003000000 time/step:185.26s
367
+ [2025-11-29 09:49:56,919] - step:357/900 train_loss:1.0267 lr:0.0003000000 time/step:185.82s
368
+ [2025-11-29 09:53:05,845] - step:358/900 train_loss:1.0586 lr:0.0003000000 time/step:188.92s
369
+ [2025-11-29 09:56:13,021] - step:359/900 train_loss:1.0340 lr:0.0003000000 time/step:187.15s
370
+ [2025-11-29 09:59:19,033] - step:360/900 train_loss:1.0385 lr:0.0003000000 time/step:186.00s
371
+ [2025-11-29 10:02:25,949] - step:361/900 train_loss:1.0036 lr:0.0003000000 time/step:186.84s
372
+ [2025-11-29 10:05:30,167] - step:362/900 train_loss:1.0181 lr:0.0003000000 time/step:184.18s
373
+ [2025-11-29 10:08:34,860] - step:363/900 train_loss:1.0245 lr:0.0003000000 time/step:184.69s
374
+ [2025-11-29 10:11:40,819] - step:364/900 train_loss:1.0310 lr:0.0003000000 time/step:185.92s
375
+ [2025-11-29 10:14:44,430] - step:365/900 train_loss:1.0431 lr:0.0003000000 time/step:183.59s
376
+ [2025-11-29 10:17:49,210] - step:366/900 train_loss:1.0010 lr:0.0003000000 time/step:184.77s
377
+ [2025-11-29 10:20:56,812] - step:367/900 train_loss:1.0278 lr:0.0003000000 time/step:187.59s
378
+ [2025-11-29 10:24:03,874] - step:368/900 train_loss:1.0450 lr:0.0003000000 time/step:187.04s
379
+ [2025-11-29 10:27:08,644] - step:369/900 train_loss:1.0187 lr:0.0003000000 time/step:184.76s
380
+ [2025-11-29 10:30:12,932] - step:370/900 train_loss:1.0198 lr:0.0003000000 time/step:184.28s
381
+ [2025-11-29 10:33:19,131] - step:371/900 train_loss:1.0267 lr:0.0003000000 time/step:186.19s
382
+ [2025-11-29 10:36:23,611] - step:372/900 train_loss:1.0050 lr:0.0003000000 time/step:184.44s
383
+ [2025-11-29 10:39:27,504] - step:373/900 train_loss:1.0285 lr:0.0003000000 time/step:183.89s
384
+ [2025-11-29 10:42:34,817] - step:374/900 train_loss:1.0273 lr:0.0003000000 time/step:187.31s
385
+ [2025-11-29 10:45:39,564] - step:375/900 train_loss:1.0304 lr:0.0003000000 time/step:184.73s
386
+ [2025-11-29 10:48:44,710] - step:376/900 train_loss:1.0118 lr:0.0003000000 time/step:185.13s
387
+ [2025-11-29 10:51:52,055] - step:377/900 train_loss:1.0109 lr:0.0003000000 time/step:187.34s
388
+ [2025-11-29 10:54:57,418] - step:378/900 train_loss:1.0240 lr:0.0003000000 time/step:185.34s
389
+ [2025-11-29 10:58:02,656] - step:379/900 train_loss:0.9999 lr:0.0003000000 time/step:185.22s
390
+ [2025-11-29 11:01:08,733] - step:380/900 train_loss:1.0321 lr:0.0003000000 time/step:186.07s
391
+ [2025-11-29 11:04:14,600] - step:381/900 train_loss:1.0227 lr:0.0003000000 time/step:185.85s
392
+ [2025-11-29 11:07:19,868] - step:382/900 train_loss:1.0266 lr:0.0003000000 time/step:185.24s
393
+ [2025-11-29 11:10:25,377] - step:383/900 train_loss:1.0351 lr:0.0003000000 time/step:185.51s
394
+ [2025-11-29 11:13:31,325] - step:384/900 train_loss:1.0345 lr:0.0003000000 time/step:185.94s
395
+ [2025-11-29 11:16:37,117] - step:385/900 train_loss:1.0095 lr:0.0003000000 time/step:185.74s
396
+ [2025-11-29 11:19:42,673] - step:386/900 train_loss:1.0084 lr:0.0003000000 time/step:185.53s
397
+ [2025-11-29 11:22:49,798] - step:387/900 train_loss:1.0363 lr:0.0003000000 time/step:187.11s
398
+ [2025-11-29 11:25:54,407] - step:388/900 train_loss:1.0115 lr:0.0003000000 time/step:184.58s
399
+ [2025-11-29 11:29:01,044] - step:389/900 train_loss:1.0391 lr:0.0003000000 time/step:186.63s
400
+ [2025-11-29 11:32:08,861] - step:390/900 train_loss:1.0325 lr:0.0003000000 time/step:187.81s
401
+ [2025-11-29 11:35:13,748] - step:391/900 train_loss:1.0275 lr:0.0003000000 time/step:184.87s
402
+ [2025-11-29 11:38:19,164] - step:392/900 train_loss:1.0071 lr:0.0003000000 time/step:185.41s
403
+ [2025-11-29 11:41:26,071] - step:393/900 train_loss:1.0140 lr:0.0003000000 time/step:186.89s
404
+ [2025-11-29 11:44:30,887] - step:394/900 train_loss:1.0238 lr:0.0003000000 time/step:184.80s
405
+ [2025-11-29 11:47:36,554] - step:395/900 train_loss:1.0223 lr:0.0003000000 time/step:185.63s
406
+ [2025-11-29 11:50:42,929] - step:396/900 train_loss:1.0248 lr:0.0003000000 time/step:186.36s
407
+ [2025-11-29 11:53:49,516] - step:397/900 train_loss:1.0155 lr:0.0003000000 time/step:186.58s
408
+ [2025-11-29 11:56:55,065] - step:398/900 train_loss:1.0266 lr:0.0003000000 time/step:185.52s
409
+ [2025-11-29 12:00:00,180] - step:399/900 train_loss:0.9997 lr:0.0003000000 time/step:185.11s
410
+ [2025-11-29 12:03:08,327] - step:400/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@400.pt...
411
+ [2025-11-29 12:03:08,330] - step:400/900 train_loss:1.0379 lr:0.0003000000 time/step:186.52s
412
+ [2025-11-29 12:06:13,170] - step:401/900 train_loss:1.0278 lr:0.0003000000 time/step:184.80s
413
+ [2025-11-29 12:09:18,331] - step:402/900 train_loss:0.9898 lr:0.0003000000 time/step:185.15s
414
+ [2025-11-29 12:12:24,948] - step:403/900 train_loss:0.9872 lr:0.0003000000 time/step:186.60s
415
+ [2025-11-29 12:15:30,939] - step:404/900 train_loss:1.0125 lr:0.0003000000 time/step:185.98s
416
+ [2025-11-29 12:18:37,715] - step:405/900 train_loss:1.0320 lr:0.0003000000 time/step:186.75s
417
+ [2025-11-29 12:21:43,437] - step:406/900 train_loss:1.0104 lr:0.0003000000 time/step:185.67s
418
+ [2025-11-29 12:24:46,661] - step:407/900 train_loss:1.0245 lr:0.0003000000 time/step:183.20s
419
+ [2025-11-29 12:27:52,506] - step:408/900 train_loss:1.0147 lr:0.0003000000 time/step:185.84s
420
+ [2025-11-29 12:30:57,358] - step:409/900 train_loss:1.0103 lr:0.0003000000 time/step:184.84s
421
+ [2025-11-29 12:34:03,720] - step:410/900 train_loss:0.9781 lr:0.0003000000 time/step:186.36s
422
+ [2025-11-29 12:37:13,118] - step:411/900 train_loss:0.9906 lr:0.0003000000 time/step:189.35s
423
+ [2025-11-29 12:40:19,894] - step:412/900 train_loss:1.0237 lr:0.0003000000 time/step:186.75s
424
+ [2025-11-29 12:43:25,422] - step:413/900 train_loss:1.0114 lr:0.0003000000 time/step:185.52s
425
+ [2025-11-29 12:46:30,579] - step:414/900 train_loss:1.0147 lr:0.0003000000 time/step:184.99s
426
+ [2025-11-29 12:49:38,876] - step:415/900 train_loss:1.0150 lr:0.0003000000 time/step:188.29s
427
+ [2025-11-29 12:52:43,633] - step:416/900 train_loss:1.0239 lr:0.0003000000 time/step:184.73s
428
+ [2025-11-29 12:55:48,060] - step:417/900 train_loss:1.0036 lr:0.0003000000 time/step:184.39s
429
+ [2025-11-29 12:58:55,753] - step:418/900 train_loss:1.0140 lr:0.0003000000 time/step:187.68s
430
+ [2025-11-29 13:02:00,431] - step:419/900 train_loss:1.0039 lr:0.0003000000 time/step:184.66s
431
+ [2025-11-29 13:05:05,089] - step:420/900 train_loss:1.0203 lr:0.0003000000 time/step:184.64s
432
+ [2025-11-29 13:08:12,316] - step:421/900 train_loss:1.0304 lr:0.0003000000 time/step:187.22s
433
+ [2025-11-29 13:11:17,410] - step:422/900 train_loss:1.0034 lr:0.0003000000 time/step:185.08s
434
+ [2025-11-29 13:14:22,416] - step:423/900 train_loss:1.0279 lr:0.0003000000 time/step:185.00s
435
+ [2025-11-29 13:17:27,732] - step:424/900 train_loss:1.0213 lr:0.0003000000 time/step:185.29s
436
+ [2025-11-29 13:20:34,573] - step:425/900 train_loss:0.9987 lr:0.0003000000 time/step:186.70s
437
+ [2025-11-29 13:23:39,421] - step:426/900 train_loss:0.9673 lr:0.0003000000 time/step:184.84s
438
+ [2025-11-29 13:26:44,176] - step:427/900 train_loss:1.0108 lr:0.0003000000 time/step:184.74s
439
+ [2025-11-29 13:29:52,082] - step:428/900 train_loss:1.0243 lr:0.0003000000 time/step:187.87s
440
+ [2025-11-29 13:32:56,984] - step:429/900 train_loss:0.9843 lr:0.0003000000 time/step:184.88s
441
+ [2025-11-29 13:36:01,659] - step:430/900 train_loss:1.0269 lr:0.0003000000 time/step:184.66s
442
+ [2025-11-29 13:39:09,363] - step:431/900 train_loss:1.0047 lr:0.0003000000 time/step:187.70s
443
+ [2025-11-29 13:42:14,007] - step:432/900 train_loss:0.9957 lr:0.0003000000 time/step:184.63s
444
+ [2025-11-29 13:45:17,936] - step:433/900 train_loss:1.0006 lr:0.0003000000 time/step:183.92s
445
+ [2025-11-29 13:48:23,683] - step:434/900 train_loss:1.0080 lr:0.0003000000 time/step:185.74s
446
+ [2025-11-29 13:51:28,718] - step:435/900 train_loss:1.0033 lr:0.0003000000 time/step:185.01s
447
+ [2025-11-29 13:54:33,479] - step:436/900 train_loss:1.0077 lr:0.0003000000 time/step:184.74s
448
+ [2025-11-29 13:57:38,454] - step:437/900 train_loss:0.9913 lr:0.0003000000 time/step:184.96s
449
+ [2025-11-29 14:00:45,973] - step:438/900 train_loss:1.0221 lr:0.0003000000 time/step:187.50s
450
+ [2025-11-29 14:03:50,970] - step:439/900 train_loss:1.0017 lr:0.0003000000 time/step:184.98s
451
+ [2025-11-29 14:06:56,103] - step:440/900 train_loss:0.9966 lr:0.0003000000 time/step:185.11s
452
+ [2025-11-29 14:10:03,916] - step:441/900 train_loss:1.0023 lr:0.0003000000 time/step:187.81s
453
+ [2025-11-29 14:13:09,854] - step:442/900 train_loss:1.0154 lr:0.0003000000 time/step:185.93s
454
+ [2025-11-29 14:16:13,300] - step:443/900 train_loss:0.9993 lr:0.0003000000 time/step:183.43s
455
+ [2025-11-29 14:19:19,989] - step:444/900 train_loss:1.0085 lr:0.0003000000 time/step:186.68s
456
+ [2025-11-29 14:22:23,752] - step:445/900 train_loss:0.9978 lr:0.0003000000 time/step:183.75s
457
+ [2025-11-29 14:25:27,620] - step:446/900 train_loss:1.0148 lr:0.0003000000 time/step:183.84s
458
+ [2025-11-29 14:28:33,765] - step:447/900 train_loss:0.9874 lr:0.0003000000 time/step:186.14s
459
+ [2025-11-29 14:31:37,881] - step:448/900 train_loss:1.0202 lr:0.0003000000 time/step:184.10s
460
+ [2025-11-29 14:34:41,135] - step:449/900 train_loss:0.9902 lr:0.0003000000 time/step:183.23s
461
+ [2025-11-29 14:37:45,361] - step:450/900 train_loss:1.0036 lr:0.0003000000 time/step:184.22s
462
+ [2025-11-29 14:40:53,203] - step:451/900 train_loss:1.0127 lr:0.0003000000 time/step:187.83s
463
+ [2025-11-29 14:43:58,011] - step:452/900 train_loss:1.0339 lr:0.0003000000 time/step:184.77s
464
+ [2025-11-29 14:47:02,348] - step:453/900 train_loss:0.9934 lr:0.0003000000 time/step:184.30s
465
+ [2025-11-29 14:50:10,497] - step:454/900 train_loss:1.0175 lr:0.0003000000 time/step:188.14s
466
+ [2025-11-29 14:53:14,572] - step:455/900 train_loss:1.0011 lr:0.0003000000 time/step:184.06s
467
+ [2025-11-29 14:56:19,257] - step:456/900 train_loss:1.0329 lr:0.0003000000 time/step:184.66s
468
+ [2025-11-29 14:59:26,311] - step:457/900 train_loss:0.9970 lr:0.0003000000 time/step:187.05s
469
+ [2025-11-29 15:02:31,228] - step:458/900 train_loss:0.9849 lr:0.0003000000 time/step:184.91s
470
+ [2025-11-29 15:05:35,912] - step:459/900 train_loss:1.0443 lr:0.0003000000 time/step:184.67s
471
+ [2025-11-29 15:08:44,234] - step:460/900 train_loss:1.0166 lr:0.0003000000 time/step:188.30s
472
+ [2025-11-29 15:11:49,196] - step:461/900 train_loss:0.9857 lr:0.0003000000 time/step:184.94s
473
+ [2025-11-29 15:14:54,073] - step:462/900 train_loss:0.9887 lr:0.0003000000 time/step:184.87s
474
+ [2025-11-29 15:18:01,015] - step:463/900 train_loss:1.0142 lr:0.0003000000 time/step:186.91s
475
+ [2025-11-29 15:21:10,436] - step:464/900 train_loss:1.0084 lr:0.0003000000 time/step:189.42s
476
+ [2025-11-29 15:24:17,825] - step:465/900 train_loss:1.0079 lr:0.0003000000 time/step:187.37s
477
+ [2025-11-29 15:27:21,991] - step:466/900 train_loss:0.9989 lr:0.0003000000 time/step:184.15s
478
+ [2025-11-29 15:30:29,430] - step:467/900 train_loss:1.0027 lr:0.0003000000 time/step:187.42s
479
+ [2025-11-29 15:33:34,038] - step:468/900 train_loss:0.9864 lr:0.0003000000 time/step:184.56s
480
+ [2025-11-29 15:36:38,606] - step:469/900 train_loss:0.9922 lr:0.0003000000 time/step:184.56s
481
+ [2025-11-29 15:39:50,010] - step:470/900 train_loss:1.0046 lr:0.0003000000 time/step:191.39s
482
+ [2025-11-29 15:42:54,426] - step:471/900 train_loss:0.9947 lr:0.0003000000 time/step:184.39s
483
+ [2025-11-29 15:45:58,386] - step:472/900 train_loss:0.9856 lr:0.0003000000 time/step:183.94s
484
+ [2025-11-29 15:49:06,443] - step:473/900 train_loss:1.0102 lr:0.0003000000 time/step:188.03s
485
+ [2025-11-29 15:52:11,651] - step:474/900 train_loss:0.9815 lr:0.0003000000 time/step:185.17s
486
+ [2025-11-29 15:55:16,024] - step:475/900 train_loss:0.9870 lr:0.0003000000 time/step:184.37s
487
+ [2025-11-29 15:58:20,934] - step:476/900 train_loss:0.9902 lr:0.0003000000 time/step:184.90s
488
+ [2025-11-29 16:01:34,770] - step:477/900 train_loss:1.0044 lr:0.0003000000 time/step:193.83s
489
+ [2025-11-29 16:04:40,969] - step:478/900 train_loss:0.9706 lr:0.0003000000 time/step:186.18s
490
+ [2025-11-29 16:07:46,966] - step:479/900 train_loss:0.9861 lr:0.0003000000 time/step:185.98s
491
+ [2025-11-29 16:11:03,013] - step:480/900 train_loss:1.0035 lr:0.0003000000 time/step:196.03s
492
+ [2025-11-29 16:14:06,891] - step:481/900 train_loss:0.9746 lr:0.0003000000 time/step:183.84s
493
+ [2025-11-29 16:17:10,864] - step:482/900 train_loss:0.9883 lr:0.0003000000 time/step:183.95s
494
+ [2025-11-29 16:20:17,244] - step:483/900 train_loss:1.0245 lr:0.0003000000 time/step:186.37s
495
+ [2025-11-29 16:23:21,291] - step:484/900 train_loss:1.0193 lr:0.0003000000 time/step:184.03s
496
+ [2025-11-29 16:26:24,937] - step:485/900 train_loss:0.9953 lr:0.0003000000 time/step:183.63s
497
+ [2025-11-29 16:29:32,575] - step:486/900 train_loss:0.9787 lr:0.0003000000 time/step:187.63s
498
+ [2025-11-29 16:32:37,230] - step:487/900 train_loss:0.9812 lr:0.0003000000 time/step:184.64s
499
+ [2025-11-29 16:35:41,884] - step:488/900 train_loss:0.9911 lr:0.0003000000 time/step:184.65s
500
+ [2025-11-29 16:38:47,753] - step:489/900 train_loss:0.9665 lr:0.0003000000 time/step:185.84s
501
+ [2025-11-29 16:41:53,739] - step:490/900 train_loss:0.9663 lr:0.0003000000 time/step:185.97s
502
+ [2025-11-29 16:44:58,613] - step:491/900 train_loss:1.0147 lr:0.0003000000 time/step:184.87s
503
+ [2025-11-29 16:48:03,370] - step:492/900 train_loss:1.0107 lr:0.0003000000 time/step:184.74s
504
+ [2025-11-29 16:51:11,045] - step:493/900 train_loss:0.9999 lr:0.0003000000 time/step:187.65s
505
+ [2025-11-29 16:54:15,696] - step:494/900 train_loss:0.9875 lr:0.0003000000 time/step:184.64s
506
+ [2025-11-29 16:57:20,229] - step:495/900 train_loss:0.9990 lr:0.0003000000 time/step:184.53s
507
+ [2025-11-29 17:00:26,562] - step:496/900 train_loss:0.9889 lr:0.0003000000 time/step:186.31s
508
+ [2025-11-29 17:03:30,547] - step:497/900 train_loss:0.9835 lr:0.0003000000 time/step:183.97s
509
+ [2025-11-29 17:06:34,456] - step:498/900 train_loss:1.0062 lr:0.0003000000 time/step:183.89s
510
+ [2025-11-29 17:09:40,945] - step:499/900 train_loss:0.9785 lr:0.0003000000 time/step:186.48s
511
+ [2025-11-29 17:12:49,048] - step:500/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@500.pt...
512
+ [2025-11-29 17:12:49,051] - step:500/900 train_loss:1.0054 lr:0.0003000000 time/step:186.42s
513
+ [2025-11-29 17:15:53,725] - step:501/900 train_loss:0.9961 lr:0.0003000000 time/step:184.67s
514
+ [2025-11-29 17:18:59,020] - step:502/900 train_loss:1.0013 lr:0.0003000000 time/step:185.28s
515
+ [2025-11-29 17:22:06,405] - step:503/900 train_loss:0.9746 lr:0.0003000000 time/step:187.35s
516
+ [2025-11-29 17:25:11,190] - step:504/900 train_loss:0.9977 lr:0.0003000000 time/step:184.76s
517
+ [2025-11-29 17:28:16,344] - step:505/900 train_loss:0.9737 lr:0.0003000000 time/step:185.15s
518
+ [2025-11-29 17:31:24,506] - step:506/900 train_loss:1.0010 lr:0.0003000000 time/step:188.14s
519
+ [2025-11-29 17:34:29,492] - step:507/900 train_loss:0.9852 lr:0.0003000000 time/step:184.96s
520
+ [2025-11-29 17:37:34,522] - step:508/900 train_loss:0.9887 lr:0.0003000000 time/step:185.00s
521
+ [2025-11-29 17:40:41,236] - step:509/900 train_loss:0.9830 lr:0.0003000000 time/step:186.70s
522
+ [2025-11-29 17:43:46,307] - step:510/900 train_loss:0.9844 lr:0.0003000000 time/step:185.05s
523
+ [2025-11-29 17:46:49,527] - step:511/900 train_loss:0.9718 lr:0.0003000000 time/step:183.20s
524
+ [2025-11-29 17:49:52,170] - step:512/900 train_loss:0.9866 lr:0.0003000000 time/step:182.64s
525
+ [2025-11-29 17:52:55,148] - step:513/900 train_loss:1.0106 lr:0.0003000000 time/step:182.96s
526
+ [2025-11-29 17:55:59,252] - step:514/900 train_loss:0.9629 lr:0.0003000000 time/step:184.09s
527
+ [2025-11-29 17:59:04,675] - step:515/900 train_loss:1.0048 lr:0.0003000000 time/step:185.41s
528
+ [2025-11-29 18:02:13,006] - step:516/900 train_loss:0.9964 lr:0.0003000000 time/step:188.32s
529
+ [2025-11-29 18:05:16,855] - step:517/900 train_loss:1.0057 lr:0.0003000000 time/step:183.84s
530
+ [2025-11-29 18:08:20,622] - step:518/900 train_loss:0.9859 lr:0.0003000000 time/step:183.75s
531
+ [2025-11-29 18:11:26,793] - step:519/900 train_loss:0.9714 lr:0.0003000000 time/step:186.16s
532
+ [2025-11-29 18:14:29,889] - step:520/900 train_loss:0.9652 lr:0.0003000000 time/step:183.08s
533
+ [2025-11-29 18:17:33,349] - step:521/900 train_loss:0.9786 lr:0.0003000000 time/step:183.43s
534
+ [2025-11-29 18:20:39,275] - step:522/900 train_loss:0.9721 lr:0.0003000000 time/step:185.92s
535
+ [2025-11-29 18:23:43,034] - step:523/900 train_loss:0.9862 lr:0.0003000000 time/step:183.75s
536
+ [2025-11-29 18:26:46,732] - step:524/900 train_loss:0.9942 lr:0.0003000000 time/step:183.66s
537
+ [2025-11-29 18:29:50,749] - step:525/900 train_loss:0.9850 lr:0.0003000000 time/step:184.01s
538
+ [2025-11-29 18:32:55,277] - step:526/900 train_loss:0.9804 lr:0.0003000000 time/step:184.51s
539
+ [2025-11-29 18:36:00,371] - step:527/900 train_loss:0.9845 lr:0.0003000000 time/step:185.08s
540
+ [2025-11-29 18:39:06,966] - step:528/900 train_loss:0.9832 lr:0.0003000000 time/step:186.57s
541
+ [2025-11-29 18:42:15,798] - step:529/900 train_loss:0.9967 lr:0.0003000000 time/step:188.82s
542
+ [2025-11-29 18:45:22,445] - step:530/900 train_loss:0.9910 lr:0.0003000000 time/step:186.63s
543
+ [2025-11-29 18:48:29,542] - step:531/900 train_loss:0.9714 lr:0.0003000000 time/step:187.07s
544
+ [2025-11-29 18:51:38,501] - step:532/900 train_loss:0.9868 lr:0.0003000000 time/step:188.95s
545
+ [2025-11-29 18:54:45,821] - step:533/900 train_loss:0.9929 lr:0.0003000000 time/step:187.30s
546
+ [2025-11-29 18:57:53,314] - step:534/900 train_loss:0.9879 lr:0.0003000000 time/step:187.47s
547
+ [2025-11-29 19:01:01,583] - step:535/900 train_loss:1.0067 lr:0.0003000000 time/step:188.26s
548
+ [2025-11-29 19:04:09,493] - step:536/900 train_loss:0.9836 lr:0.0003000000 time/step:187.89s
549
+ [2025-11-29 19:07:16,734] - step:537/900 train_loss:0.9868 lr:0.0003000000 time/step:187.21s
550
+ [2025-11-29 19:10:24,993] - step:538/900 train_loss:0.9951 lr:0.0003000000 time/step:188.24s
551
+ [2025-11-29 19:13:31,431] - step:539/900 train_loss:0.9761 lr:0.0003000000 time/step:186.41s
552
+ [2025-11-29 19:16:36,819] - step:540/900 train_loss:0.9742 lr:0.0003000000 time/step:185.38s
553
+ [2025-11-29 19:19:43,799] - step:541/900 train_loss:0.9745 lr:0.0003000000 time/step:186.95s
554
+ [2025-11-29 19:22:52,532] - step:542/900 train_loss:0.9817 lr:0.0003000000 time/step:188.73s
555
+ [2025-11-29 19:25:59,216] - step:543/900 train_loss:0.9777 lr:0.0003000000 time/step:186.67s
556
+ [2025-11-29 19:29:05,849] - step:544/900 train_loss:0.9960 lr:0.0003000000 time/step:186.61s
557
+ [2025-11-29 19:32:14,558] - step:545/900 train_loss:0.9811 lr:0.0003000000 time/step:188.70s
558
+ [2025-11-29 19:35:21,607] - step:546/900 train_loss:0.9882 lr:0.0003000000 time/step:187.02s
559
+ [2025-11-29 19:38:28,518] - step:547/900 train_loss:0.9938 lr:0.0003000000 time/step:186.88s
560
+ [2025-11-29 19:41:37,225] - step:548/900 train_loss:0.9407 lr:0.0003000000 time/step:188.70s
561
+ [2025-11-29 19:44:44,063] - step:549/900 train_loss:0.9774 lr:0.0003000000 time/step:186.81s
562
+ [2025-11-29 19:47:50,816] - step:550/900 train_loss:0.9913 lr:0.0003000000 time/step:186.73s
563
+ [2025-11-29 19:50:58,844] - step:551/900 train_loss:0.9948 lr:0.0003000000 time/step:188.02s
564
+ [2025-11-29 19:54:06,212] - step:552/900 train_loss:0.9696 lr:0.0003000000 time/step:187.35s
565
+ [2025-11-29 19:57:12,084] - step:553/900 train_loss:0.9706 lr:0.0003000000 time/step:185.85s
566
+ [2025-11-29 20:00:18,128] - step:554/900 train_loss:0.9871 lr:0.0003000000 time/step:186.03s
567
+ [2025-11-29 20:03:26,623] - step:555/900 train_loss:0.9930 lr:0.0003000000 time/step:188.48s
568
+ [2025-11-29 20:06:33,230] - step:556/900 train_loss:0.9752 lr:0.0003000000 time/step:186.55s
569
+ [2025-11-29 20:09:39,696] - step:557/900 train_loss:0.9850 lr:0.0003000000 time/step:186.45s
570
+ [2025-11-29 20:12:48,229] - step:558/900 train_loss:0.9720 lr:0.0003000000 time/step:188.52s
571
+ [2025-11-29 20:15:53,987] - step:559/900 train_loss:0.9962 lr:0.0003000000 time/step:185.74s
572
+ [2025-11-29 20:19:00,484] - step:560/900 train_loss:0.9922 lr:0.0003000000 time/step:186.48s
573
+ [2025-11-29 20:22:09,247] - step:561/900 train_loss:0.9740 lr:0.0003000000 time/step:188.74s
574
+ [2025-11-29 20:25:16,473] - step:562/900 train_loss:0.9712 lr:0.0003000000 time/step:187.21s
575
+ [2025-11-29 20:28:23,403] - step:563/900 train_loss:0.9612 lr:0.0003000000 time/step:186.92s
576
+ [2025-11-29 20:31:30,909] - step:564/900 train_loss:0.9914 lr:0.0003000000 time/step:187.50s
577
+ [2025-11-29 20:34:38,710] - step:565/900 train_loss:0.9836 lr:0.0003000000 time/step:187.78s
578
+ [2025-11-29 20:37:45,056] - step:566/900 train_loss:0.9814 lr:0.0003000000 time/step:186.33s
579
+ [2025-11-29 20:40:51,873] - step:567/900 train_loss:0.9865 lr:0.0003000000 time/step:186.81s
580
+ [2025-11-29 20:44:00,559] - step:568/900 train_loss:0.9917 lr:0.0003000000 time/step:188.68s
581
+ [2025-11-29 20:47:07,062] - step:569/900 train_loss:0.9644 lr:0.0003000000 time/step:186.48s
582
+ [2025-11-29 20:50:13,303] - step:570/900 train_loss:0.9759 lr:0.0003000000 time/step:186.19s
583
+ [2025-11-29 20:53:21,695] - step:571/900 train_loss:0.9703 lr:0.0003000000 time/step:188.39s
584
+ [2025-11-29 20:56:29,148] - step:572/900 train_loss:0.9713 lr:0.0003000000 time/step:187.43s
585
+ [2025-11-29 20:59:35,993] - step:573/900 train_loss:0.9549 lr:0.0003000000 time/step:186.82s
586
+ [2025-11-29 21:02:44,463] - step:574/900 train_loss:0.9696 lr:0.0003000000 time/step:188.47s
587
+ [2025-11-29 21:05:51,247] - step:575/900 train_loss:0.9648 lr:0.0003000000 time/step:186.77s
588
+ [2025-11-29 21:08:57,001] - step:576/900 train_loss:0.9695 lr:0.0003000000 time/step:185.74s
589
+ [2025-11-29 21:12:03,873] - step:577/900 train_loss:0.9728 lr:0.0003000000 time/step:186.86s
590
+ [2025-11-29 21:15:10,900] - step:578/900 train_loss:0.9767 lr:0.0003000000 time/step:187.02s
591
+ [2025-11-29 21:18:14,501] - step:579/900 train_loss:0.9643 lr:0.0003000000 time/step:183.56s
592
+ [2025-11-29 21:21:16,045] - step:580/900 train_loss:0.9826 lr:0.0003000000 time/step:181.53s
593
+ [2025-11-29 21:24:19,527] - step:581/900 train_loss:0.9792 lr:0.0003000000 time/step:183.48s
594
+ [2025-11-29 21:27:25,340] - step:582/900 train_loss:0.9852 lr:0.0003000000 time/step:185.73s
595
+ [2025-11-29 21:30:32,498] - step:583/900 train_loss:0.9699 lr:0.0003000000 time/step:187.15s
596
+ [2025-11-29 21:33:40,663] - step:584/900 train_loss:0.9709 lr:0.0003000000 time/step:188.14s
597
+ [2025-11-29 21:36:47,891] - step:585/900 train_loss:0.9673 lr:0.0003000000 time/step:187.21s
598
+ [2025-11-29 21:39:54,798] - step:586/900 train_loss:0.9792 lr:0.0003000000 time/step:186.90s
599
+ [2025-11-29 21:43:04,568] - step:587/900 train_loss:0.9784 lr:0.0003000000 time/step:189.77s
600
+ [2025-11-29 21:46:11,882] - step:588/900 train_loss:0.9719 lr:0.0003000000 time/step:187.29s
601
+ [2025-11-29 21:49:18,906] - step:589/900 train_loss:0.9834 lr:0.0003000000 time/step:187.01s
602
+ [2025-11-29 21:52:25,621] - step:590/900 train_loss:0.9659 lr:0.0003000000 time/step:186.70s
603
+ [2025-11-29 21:55:31,655] - step:591/900 train_loss:0.9658 lr:0.0003000000 time/step:185.94s
604
+ [2025-11-29 21:58:38,212] - step:592/900 train_loss:0.9855 lr:0.0003000000 time/step:186.53s
605
+ [2025-11-29 22:01:44,812] - step:593/900 train_loss:0.9691 lr:0.0003000000 time/step:186.59s
606
+ [2025-11-29 22:04:51,951] - step:594/900 train_loss:0.9781 lr:0.0003000000 time/step:187.13s
607
+ [2025-11-29 22:07:57,915] - step:595/900 train_loss:0.9579 lr:0.0003000000 time/step:185.94s
608
+ [2025-11-29 22:11:04,854] - step:596/900 train_loss:0.9731 lr:0.0003000000 time/step:186.91s
609
+ [2025-11-29 22:14:13,434] - step:597/900 train_loss:0.9715 lr:0.0003000000 time/step:188.57s
610
+ [2025-11-29 22:17:20,910] - step:598/900 train_loss:0.9886 lr:0.0003000000 time/step:187.46s
611
+ [2025-11-29 22:20:27,176] - step:599/900 train_loss:0.9657 lr:0.0003000000 time/step:186.24s
612
+ [2025-11-29 22:23:34,717] - step:600/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@600.pt...
613
+ [2025-11-29 22:23:34,723] - step:600/900 train_loss:0.9532 lr:0.0003000000 time/step:185.95s
614
+ [2025-11-29 22:26:38,518] - step:601/900 train_loss:0.9535 lr:0.0003000000 time/step:183.79s
615
+ [2025-11-29 22:29:41,900] - step:602/900 train_loss:0.9374 lr:0.0003000000 time/step:183.35s
616
+ [2025-11-29 22:32:47,468] - step:603/900 train_loss:0.9662 lr:0.0003000000 time/step:185.52s
617
+ [2025-11-29 22:35:53,752] - step:604/900 train_loss:0.9587 lr:0.0003000000 time/step:186.16s
618
+ [2025-11-29 22:38:58,466] - step:605/900 train_loss:0.9739 lr:0.0003000000 time/step:184.70s
619
+ [2025-11-29 22:42:03,657] - step:606/900 train_loss:0.9563 lr:0.0003000000 time/step:185.17s
620
+ [2025-11-29 22:45:12,058] - step:607/900 train_loss:0.9584 lr:0.0003000000 time/step:188.39s
621
+ [2025-11-29 22:48:18,310] - step:608/900 train_loss:0.9694 lr:0.0003000000 time/step:186.23s
622
+ [2025-11-29 22:51:24,367] - step:609/900 train_loss:0.9681 lr:0.0003000000 time/step:186.05s
623
+ [2025-11-29 22:54:31,573] - step:610/900 train_loss:0.9582 lr:0.0003000000 time/step:187.20s
624
+ [2025-11-29 22:57:36,240] - step:611/900 train_loss:0.9781 lr:0.0003000000 time/step:184.66s
625
+ [2025-11-29 23:00:39,793] - step:612/900 train_loss:0.9707 lr:0.0003000000 time/step:183.54s
626
+ [2025-11-29 23:03:48,177] - step:613/900 train_loss:0.9626 lr:0.0003000000 time/step:188.38s
627
+ [2025-11-29 23:06:54,527] - step:614/900 train_loss:0.9525 lr:0.0003000000 time/step:186.34s
628
+ [2025-11-29 23:10:00,576] - step:615/900 train_loss:0.9825 lr:0.0003000000 time/step:186.03s
629
+ [2025-11-29 23:13:06,944] - step:616/900 train_loss:0.9648 lr:0.0003000000 time/step:186.35s
630
+ [2025-11-29 23:16:13,313] - step:617/900 train_loss:0.9833 lr:0.0003000000 time/step:186.36s
631
+ [2025-11-29 23:19:18,008] - step:618/900 train_loss:0.9619 lr:0.0003000000 time/step:184.67s
632
+ [2025-11-29 23:22:23,418] - step:619/900 train_loss:0.9681 lr:0.0003000000 time/step:185.40s
633
+ [2025-11-29 23:25:30,799] - step:620/900 train_loss:0.9705 lr:0.0003000000 time/step:187.36s
634
+ [2025-11-29 23:28:36,096] - step:621/900 train_loss:0.9884 lr:0.0003000000 time/step:185.28s
635
+ [2025-11-29 23:31:40,935] - step:622/900 train_loss:0.9623 lr:0.0003000000 time/step:184.83s
636
+ [2025-11-29 23:34:49,164] - step:623/900 train_loss:0.9781 lr:0.0003000000 time/step:188.22s
637
+ [2025-11-29 23:37:55,808] - step:624/900 train_loss:0.9558 lr:0.0003000000 time/step:186.62s
638
+ [2025-11-29 23:41:02,902] - step:625/900 train_loss:0.9641 lr:0.0003000000 time/step:187.08s
639
+ [2025-11-29 23:44:12,190] - step:626/900 train_loss:0.9631 lr:0.0003000000 time/step:189.26s
640
+ [2025-11-29 23:47:18,211] - step:627/900 train_loss:0.9820 lr:0.0003000000 time/step:185.99s
641
+ [2025-11-29 23:50:22,907] - step:628/900 train_loss:0.9647 lr:0.0003000000 time/step:184.67s
642
+ [2025-11-29 23:53:29,293] - step:629/900 train_loss:0.9504 lr:0.0003000000 time/step:186.38s
643
+ [2025-11-29 23:56:35,007] - step:630/900 train_loss:0.9845 lr:0.0003000000 time/step:185.70s
644
+ [2025-11-29 23:59:41,063] - step:631/900 train_loss:0.9710 lr:0.0003000000 time/step:186.04s
645
+ [2025-11-30 00:02:47,384] - step:632/900 train_loss:0.9673 lr:0.0003000000 time/step:186.31s
646
+ [2025-11-30 00:05:54,675] - step:633/900 train_loss:0.9644 lr:0.0003000000 time/step:187.29s
647
+ [2025-11-30 00:09:00,681] - step:634/900 train_loss:0.9751 lr:0.0003000000 time/step:185.98s
648
+ [2025-11-30 00:12:07,170] - step:635/900 train_loss:0.9427 lr:0.0003000000 time/step:186.47s
649
+ [2025-11-30 00:15:16,394] - step:636/900 train_loss:0.9941 lr:0.0003000000 time/step:189.21s
650
+ [2025-11-30 00:18:21,885] - step:637/900 train_loss:0.9627 lr:0.0003000000 time/step:185.46s
651
+ [2025-11-30 00:21:26,909] - step:638/900 train_loss:0.9713 lr:0.0003000000 time/step:185.01s
652
+ [2025-11-30 00:24:34,518] - step:639/900 train_loss:0.9477 lr:0.0003000000 time/step:187.59s
653
+ [2025-11-30 00:27:39,860] - step:640/900 train_loss:0.9413 lr:0.0003000000 time/step:185.32s
654
+ [2025-11-30 00:30:46,082] - step:641/900 train_loss:0.9583 lr:0.0003000000 time/step:186.18s
655
+ [2025-11-30 00:33:53,085] - step:642/900 train_loss:0.9927 lr:0.0003000000 time/step:186.99s
656
+ [2025-11-30 00:37:00,236] - step:643/900 train_loss:0.9658 lr:0.0003000000 time/step:187.13s
657
+ [2025-11-30 00:40:06,191] - step:644/900 train_loss:0.9532 lr:0.0003000000 time/step:185.92s
658
+ [2025-11-30 00:43:11,626] - step:645/900 train_loss:0.9510 lr:0.0003000000 time/step:185.43s
659
+ [2025-11-30 00:46:16,854] - step:646/900 train_loss:0.9572 lr:0.0003000000 time/step:185.21s
660
+ [2025-11-30 00:49:20,350] - step:647/900 train_loss:0.9524 lr:0.0003000000 time/step:183.47s
661
+ [2025-11-30 00:52:23,936] - step:648/900 train_loss:0.9724 lr:0.0003000000 time/step:183.58s
662
+ [2025-11-30 00:55:32,534] - step:649/900 train_loss:1.0075 lr:0.0003000000 time/step:188.59s
663
+ [2025-11-30 00:58:37,981] - step:650/900 train_loss:0.9637 lr:0.0003000000 time/step:185.43s
664
+ [2025-11-30 01:01:43,633] - step:651/900 train_loss:0.9657 lr:0.0003000000 time/step:185.63s
665
+ [2025-11-30 01:04:53,089] - step:652/900 train_loss:0.9597 lr:0.0003000000 time/step:189.45s
666
+ [2025-11-30 01:08:00,352] - step:653/900 train_loss:0.9692 lr:0.0003000000 time/step:187.22s
667
+ [2025-11-30 01:11:07,645] - step:654/900 train_loss:0.9529 lr:0.0003000000 time/step:187.28s
668
+ [2025-11-30 01:14:14,239] - step:655/900 train_loss:0.9482 lr:0.0003000000 time/step:186.59s
669
+ [2025-11-30 01:17:20,123] - step:656/900 train_loss:0.9579 lr:0.0003000000 time/step:185.88s
670
+ [2025-11-30 01:20:25,496] - step:657/900 train_loss:0.9504 lr:0.0003000000 time/step:185.35s
671
+ [2025-11-30 01:23:31,180] - step:658/900 train_loss:0.9749 lr:0.0003000000 time/step:185.66s
672
+ [2025-11-30 01:26:37,555] - step:659/900 train_loss:0.9706 lr:0.0003000000 time/step:186.35s
673
+ [2025-11-30 01:29:43,411] - step:660/900 train_loss:0.9571 lr:0.0003000000 time/step:185.84s
674
+ [2025-11-30 01:32:49,562] - step:661/900 train_loss:0.9464 lr:0.0003000000 time/step:186.14s
675
+ [2025-11-30 01:35:57,969] - step:662/900 train_loss:0.9430 lr:0.0003000000 time/step:188.40s
676
+ [2025-11-30 01:39:04,057] - step:663/900 train_loss:0.9606 lr:0.0003000000 time/step:186.06s
677
+ [2025-11-30 01:42:08,918] - step:664/900 train_loss:0.9484 lr:0.0003000000 time/step:184.85s
678
+ [2025-11-30 01:45:15,790] - step:665/900 train_loss:0.9660 lr:0.0003000000 time/step:186.86s
679
+ [2025-11-30 01:48:21,042] - step:666/900 train_loss:0.9715 lr:0.0003000000 time/step:185.22s
680
+ [2025-11-30 01:51:25,399] - step:667/900 train_loss:0.9747 lr:0.0003000000 time/step:184.34s
681
+ [2025-11-30 01:54:31,595] - step:668/900 train_loss:0.9405 lr:0.0003000000 time/step:186.18s
682
+ [2025-11-30 01:57:37,951] - step:669/900 train_loss:0.9562 lr:0.0003000000 time/step:186.34s
683
+ [2025-11-30 02:00:44,059] - step:670/900 train_loss:0.9800 lr:0.0003000000 time/step:186.09s
684
+ [2025-11-30 02:03:49,586] - step:671/900 train_loss:0.9646 lr:0.0003000000 time/step:185.52s
685
+ [2025-11-30 02:06:57,124] - step:672/900 train_loss:0.9656 lr:0.0003000000 time/step:187.53s
686
+ [2025-11-30 02:10:03,956] - step:673/900 train_loss:0.9544 lr:0.0003000000 time/step:186.80s
687
+ [2025-11-30 02:13:09,941] - step:674/900 train_loss:0.9604 lr:0.0003000000 time/step:185.98s
688
+ [2025-11-30 02:16:17,892] - step:675/900 train_loss:0.9639 lr:0.0003000000 time/step:187.95s
689
+ [2025-11-30 02:19:23,974] - step:676/900 train_loss:0.9455 lr:0.0003000000 time/step:186.05s
690
+ [2025-11-30 02:22:30,221] - step:677/900 train_loss:0.9509 lr:0.0003000000 time/step:186.20s
691
+ [2025-11-30 02:25:37,961] - step:678/900 train_loss:0.9363 lr:0.0003000000 time/step:187.73s
692
+ [2025-11-30 02:28:44,267] - step:679/900 train_loss:0.9520 lr:0.0003000000 time/step:186.29s
693
+ [2025-11-30 02:31:50,617] - step:680/900 train_loss:0.9565 lr:0.0003000000 time/step:186.34s
694
+ [2025-11-30 02:34:58,672] - step:681/900 train_loss:0.9727 lr:0.0003000000 time/step:188.04s
695
+ [2025-11-30 02:38:05,140] - step:682/900 train_loss:0.9563 lr:0.0003000000 time/step:186.46s
696
+ [2025-11-30 02:41:09,992] - step:683/900 train_loss:0.9809 lr:0.0003000000 time/step:184.79s
697
+ [2025-11-30 02:44:15,338] - step:684/900 train_loss:0.9526 lr:0.0003000000 time/step:185.34s
698
+ [2025-11-30 02:47:21,385] - step:685/900 train_loss:0.9675 lr:0.0003000000 time/step:186.04s
699
+ [2025-11-30 02:50:25,872] - step:686/900 train_loss:0.9466 lr:0.0003000000 time/step:184.44s
700
+ [2025-11-30 02:53:31,333] - step:687/900 train_loss:0.9575 lr:0.0003000000 time/step:185.43s
701
+ [2025-11-30 02:56:38,782] - step:688/900 train_loss:0.9673 lr:0.0003000000 time/step:187.43s
702
+ [2025-11-30 02:59:44,702] - step:689/900 train_loss:0.9582 lr:0.0003000000 time/step:185.90s
703
+ [2025-11-30 03:02:50,929] - step:690/900 train_loss:0.9581 lr:0.0003000000 time/step:186.22s
704
+ [2025-11-30 03:05:57,119] - step:691/900 train_loss:0.9407 lr:0.0003000000 time/step:186.18s
705
+ [2025-11-30 03:09:02,467] - step:692/900 train_loss:0.9567 lr:0.0003000000 time/step:185.33s
706
+ [2025-11-30 03:12:07,335] - step:693/900 train_loss:0.9362 lr:0.0003000000 time/step:184.84s
707
+ [2025-11-30 03:15:14,078] - step:694/900 train_loss:0.9692 lr:0.0003000000 time/step:186.74s
708
+ [2025-11-30 03:18:20,680] - step:695/900 train_loss:0.9288 lr:0.0003000000 time/step:186.58s
709
+ [2025-11-30 03:21:26,753] - step:696/900 train_loss:0.9616 lr:0.0003000000 time/step:186.05s
710
+ [2025-11-30 03:24:32,423] - step:697/900 train_loss:0.9203 lr:0.0003000000 time/step:185.66s
711
+ [2025-11-30 03:27:41,442] - step:698/900 train_loss:0.9552 lr:0.0003000000 time/step:189.01s
712
+ [2025-11-30 03:30:45,196] - step:699/900 train_loss:0.9601 lr:0.0003000000 time/step:183.72s
713
+ [2025-11-30 03:33:51,669] - step:700/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@700.pt...
714
+ [2025-11-30 03:33:51,671] - step:700/900 train_loss:0.9515 lr:0.0003000000 time/step:184.76s
715
+ [2025-11-30 03:36:59,452] - step:701/900 train_loss:0.9587 lr:0.0003000000 time/step:187.77s
716
+ [2025-11-30 03:40:05,996] - step:702/900 train_loss:0.9688 lr:0.0003000000 time/step:186.44s
717
+ [2025-11-30 03:43:12,080] - step:703/900 train_loss:0.9386 lr:0.0003000000 time/step:186.06s
718
+ [2025-11-30 03:46:19,965] - step:704/900 train_loss:0.9925 lr:0.0003000000 time/step:187.88s
719
+ [2025-11-30 03:49:25,758] - step:705/900 train_loss:0.9425 lr:0.0003000000 time/step:185.77s
720
+ [2025-11-30 03:52:30,961] - step:706/900 train_loss:0.9720 lr:0.0003000000 time/step:185.19s
721
+ [2025-11-30 03:55:39,418] - step:707/900 train_loss:0.9434 lr:0.0003000000 time/step:188.44s
722
+ [2025-11-30 03:58:47,467] - step:708/900 train_loss:0.9549 lr:0.0003000000 time/step:188.03s
723
+ [2025-11-30 04:01:53,611] - step:709/900 train_loss:0.9511 lr:0.0003000000 time/step:186.12s
724
+ [2025-11-30 04:04:58,923] - step:710/900 train_loss:0.9714 lr:0.0003000000 time/step:185.31s
725
+ [2025-11-30 04:08:06,914] - step:711/900 train_loss:0.9647 lr:0.0003000000 time/step:187.98s
726
+ [2025-11-30 04:11:12,929] - step:712/900 train_loss:0.9789 lr:0.0003000000 time/step:185.96s
727
+ [2025-11-30 04:14:19,154] - step:713/900 train_loss:0.9418 lr:0.0003000000 time/step:186.22s
728
+ [2025-11-30 04:17:27,669] - step:714/900 train_loss:0.9417 lr:0.0003000000 time/step:188.50s
729
+ [2025-11-30 04:20:32,769] - step:715/900 train_loss:0.9507 lr:0.0003000000 time/step:185.08s
730
+ [2025-11-30 04:23:37,756] - step:716/900 train_loss:0.9567 lr:0.0003000000 time/step:184.98s
731
+ [2025-11-30 04:26:45,091] - step:717/900 train_loss:0.9389 lr:0.0003000000 time/step:187.32s
732
+ [2025-11-30 04:29:50,043] - step:718/900 train_loss:0.9477 lr:0.0003000000 time/step:184.87s
733
+ [2025-11-30 04:32:53,971] - step:719/900 train_loss:0.9619 lr:0.0003000000 time/step:183.92s
734
+ [2025-11-30 04:36:00,320] - step:720/900 train_loss:0.9533 lr:0.0003000000 time/step:186.34s
735
+ [2025-11-30 04:39:07,896] - step:721/900 train_loss:0.9650 lr:0.0003000000 time/step:187.55s
736
+ [2025-11-30 04:42:13,833] - step:722/900 train_loss:0.9603 lr:0.0003000000 time/step:185.91s
737
+ [2025-11-30 04:45:20,122] - step:723/900 train_loss:0.9604 lr:0.0003000000 time/step:186.28s
738
+ [2025-11-30 04:48:28,513] - step:724/900 train_loss:0.9635 lr:0.0003000000 time/step:188.38s
739
+ [2025-11-30 04:51:34,485] - step:725/900 train_loss:0.9550 lr:0.0003000000 time/step:185.94s
740
+ [2025-11-30 04:54:40,827] - step:726/900 train_loss:0.9679 lr:0.0003000000 time/step:186.34s
741
+ [2025-11-30 04:57:50,319] - step:727/900 train_loss:0.9607 lr:0.0003000000 time/step:189.46s
742
+ [2025-11-30 05:00:56,724] - step:728/900 train_loss:0.9880 lr:0.0003000000 time/step:186.35s
743
+ [2025-11-30 05:04:02,482] - step:729/900 train_loss:0.9358 lr:0.0003000000 time/step:185.75s
744
+ [2025-11-30 05:07:10,367] - step:730/900 train_loss:0.9521 lr:0.0003000000 time/step:187.88s
745
+ [2025-11-30 05:10:16,528] - step:731/900 train_loss:0.9466 lr:0.0003000000 time/step:186.13s
746
+ [2025-11-30 05:13:22,743] - step:732/900 train_loss:0.9481 lr:0.0003000000 time/step:186.21s
747
+ [2025-11-30 05:16:29,572] - step:733/900 train_loss:0.9613 lr:0.0003000000 time/step:186.81s
748
+ [2025-11-30 05:19:37,538] - step:734/900 train_loss:0.9525 lr:0.0003000000 time/step:187.96s
749
+ [2025-11-30 05:22:40,037] - step:735/900 train_loss:0.9457 lr:0.0003000000 time/step:182.48s
750
+ [2025-11-30 05:25:46,040] - step:736/900 train_loss:0.9572 lr:0.0003000000 time/step:185.97s
751
+ [2025-11-30 05:29:04,236] - step:737/900 train_loss:0.9545 lr:0.0003000000 time/step:196.30s
752
+ [2025-11-30 05:32:09,010] - step:738/900 train_loss:0.9633 lr:0.0003000000 time/step:184.76s
753
+ [2025-11-30 05:35:14,741] - step:739/900 train_loss:0.9598 lr:0.0003000000 time/step:185.72s
754
+ [2025-11-30 05:38:21,357] - step:740/900 train_loss:0.9342 lr:0.0003000000 time/step:186.60s
755
+ [2025-11-30 05:41:26,232] - step:741/900 train_loss:0.9550 lr:0.0003000000 time/step:184.84s
756
+ [2025-11-30 05:44:31,223] - step:742/900 train_loss:0.9696 lr:0.0003000000 time/step:184.98s
757
+ [2025-11-30 05:47:42,033] - step:743/900 train_loss:0.9468 lr:0.0003000000 time/step:190.80s
758
+ [2025-11-30 05:50:47,075] - step:744/900 train_loss:0.9588 lr:0.0003000000 time/step:184.98s
759
+ [2025-11-30 05:53:51,033] - step:745/900 train_loss:0.9498 lr:0.0003000000 time/step:183.94s
760
+ [2025-11-30 05:56:57,016] - step:746/900 train_loss:0.9529 lr:0.0003000000 time/step:185.97s
761
+ [2025-11-30 06:00:01,884] - step:747/900 train_loss:0.9376 lr:0.0003000000 time/step:184.84s
762
+ [2025-11-30 06:03:06,392] - step:748/900 train_loss:0.9415 lr:0.0003000000 time/step:184.49s
763
+ [2025-11-30 06:06:13,954] - step:749/900 train_loss:0.9581 lr:0.0003000000 time/step:187.55s
764
+ [2025-11-30 06:09:18,747] - step:750/900 train_loss:0.9494 lr:0.0003000000 time/step:184.77s
765
+ [2025-11-30 06:12:24,279] - step:751/900 train_loss:0.9586 lr:0.0003000000 time/step:185.52s
766
+ [2025-11-30 06:15:30,040] - step:752/900 train_loss:0.9491 lr:0.0003000000 time/step:185.75s
767
+ [2025-11-30 06:18:37,170] - step:753/900 train_loss:0.9585 lr:0.0003000000 time/step:187.12s
768
+ [2025-11-30 06:21:42,398] - step:754/900 train_loss:0.9441 lr:0.0003000000 time/step:185.20s
769
+ [2025-11-30 06:24:48,671] - step:755/900 train_loss:0.9533 lr:0.0003000000 time/step:186.25s
770
+ [2025-11-30 06:27:56,633] - step:756/900 train_loss:0.9433 lr:0.0003000000 time/step:187.94s
771
+ [2025-11-30 06:31:02,691] - step:757/900 train_loss:0.9368 lr:0.0003000000 time/step:186.01s
772
+ [2025-11-30 06:34:08,615] - step:758/900 train_loss:0.9504 lr:0.0003000000 time/step:185.91s
773
+ [2025-11-30 06:37:15,950] - step:759/900 train_loss:0.9412 lr:0.0003000000 time/step:187.31s
774
+ [2025-11-30 06:40:22,539] - step:760/900 train_loss:0.9330 lr:0.0003000000 time/step:186.51s
775
+ [2025-11-30 06:43:28,876] - step:761/900 train_loss:0.9342 lr:0.0003000000 time/step:186.33s
776
+ [2025-11-30 06:46:36,580] - step:762/900 train_loss:0.9329 lr:0.0003000000 time/step:187.68s
777
+ [2025-11-30 06:49:43,404] - step:763/900 train_loss:0.9465 lr:0.0003000000 time/step:186.79s
778
+ [2025-11-30 06:52:49,437] - step:764/900 train_loss:0.9507 lr:0.0003000000 time/step:186.01s
779
+ [2025-11-30 06:55:55,801] - step:765/900 train_loss:0.9754 lr:0.0003000000 time/step:186.35s
780
+ [2025-11-30 06:59:04,165] - step:766/900 train_loss:0.9323 lr:0.0003000000 time/step:188.35s
781
+ [2025-11-30 07:02:09,611] - step:767/900 train_loss:0.9398 lr:0.0003000000 time/step:185.37s
782
+ [2025-11-30 07:05:15,543] - step:768/900 train_loss:0.9773 lr:0.0003000000 time/step:185.92s
783
+ [2025-11-30 07:08:23,040] - step:769/900 train_loss:0.9300 lr:0.0003000000 time/step:187.49s
784
+ [2025-11-30 07:11:27,989] - step:770/900 train_loss:0.9565 lr:0.0003000000 time/step:184.93s
785
+ [2025-11-30 07:14:34,166] - step:771/900 train_loss:0.9791 lr:0.0003000000 time/step:186.17s
786
+ [2025-11-30 07:17:41,334] - step:772/900 train_loss:0.9323 lr:0.0003000000 time/step:187.15s
787
+ [2025-11-30 07:20:48,245] - step:773/900 train_loss:0.9384 lr:0.0003000000 time/step:186.89s
788
+ [2025-11-30 07:23:55,000] - step:774/900 train_loss:0.9620 lr:0.0003000000 time/step:186.75s
789
+ [2025-11-30 07:27:04,805] - step:775/900 train_loss:0.9535 lr:0.0003000000 time/step:189.79s
790
+ [2025-11-30 07:30:11,933] - step:776/900 train_loss:0.9500 lr:0.0003000000 time/step:187.11s
791
+ [2025-11-30 07:33:18,809] - step:777/900 train_loss:0.9556 lr:0.0003000000 time/step:186.84s
792
+ [2025-11-30 07:36:25,083] - step:778/900 train_loss:0.9280 lr:0.0003000000 time/step:186.27s
793
+ [2025-11-30 07:39:34,387] - step:779/900 train_loss:0.9373 lr:0.0003000000 time/step:189.30s
794
+ [2025-11-30 07:42:40,512] - step:780/900 train_loss:0.9556 lr:0.0003000000 time/step:186.10s
795
+ [2025-11-30 07:45:46,330] - step:781/900 train_loss:0.9568 lr:0.0003000000 time/step:185.80s
796
+ [2025-11-30 07:48:53,453] - step:782/900 train_loss:0.9737 lr:0.0003000000 time/step:187.11s
797
+ [2025-11-30 07:51:59,918] - step:783/900 train_loss:0.9267 lr:0.0003000000 time/step:186.44s
798
+ [2025-11-30 07:55:06,653] - step:784/900 train_loss:0.9683 lr:0.0003000000 time/step:186.73s
799
+ [2025-11-30 07:58:14,271] - step:785/900 train_loss:0.9249 lr:0.0003000000 time/step:187.60s
800
+ [2025-11-30 08:01:21,688] - step:786/900 train_loss:0.9586 lr:0.0003000000 time/step:187.32s
801
+ [2025-11-30 08:04:28,087] - step:787/900 train_loss:0.9470 lr:0.0003000000 time/step:186.39s
802
+ [2025-11-30 08:07:34,899] - step:788/900 train_loss:0.9591 lr:0.0003000000 time/step:186.79s
803
+ [2025-11-30 08:10:41,723] - step:789/900 train_loss:0.9433 lr:0.0003000000 time/step:186.81s
804
+ [2025-11-30 08:13:47,670] - step:790/900 train_loss:0.9496 lr:0.0003000000 time/step:185.92s
805
+ [2025-11-30 08:16:53,831] - step:791/900 train_loss:0.9459 lr:0.0003000000 time/step:186.15s
806
+ [2025-11-30 08:20:01,102] - step:792/900 train_loss:0.9601 lr:0.0003000000 time/step:187.25s
807
+ [2025-11-30 08:23:06,763] - step:793/900 train_loss:0.9408 lr:0.0003000000 time/step:185.64s
808
+ [2025-11-30 08:26:12,187] - step:794/900 train_loss:0.9571 lr:0.0003000000 time/step:185.41s
809
+ [2025-11-30 08:29:18,964] - step:795/900 train_loss:0.9670 lr:0.0003000000 time/step:186.77s
810
+ [2025-11-30 08:32:24,852] - step:796/900 train_loss:0.9432 lr:0.0003000000 time/step:185.86s
811
+ [2025-11-30 08:35:30,345] - step:797/900 train_loss:0.9347 lr:0.0003000000 time/step:185.49s
812
+ [2025-11-30 08:38:37,005] - step:798/900 train_loss:0.9431 lr:0.0003000000 time/step:186.65s
813
+ [2025-11-30 08:41:44,291] - step:799/900 train_loss:0.9548 lr:0.0003000000 time/step:187.24s
814
+ [2025-11-30 08:44:52,246] - step:800/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@800.pt...
815
+ [2025-11-30 08:44:52,247] - step:800/900 train_loss:0.9472 lr:0.0003000000 time/step:186.19s
816
+ [2025-11-30 08:47:59,432] - step:801/900 train_loss:0.9580 lr:0.0003000000 time/step:187.17s
817
+ [2025-11-30 08:51:07,677] - step:802/900 train_loss:0.9347 lr:0.0003000000 time/step:188.21s
818
+ [2025-11-30 08:54:13,610] - step:803/900 train_loss:0.9552 lr:0.0003000000 time/step:185.91s
819
+ [2025-11-30 08:57:20,822] - step:804/900 train_loss:0.9433 lr:0.0003000000 time/step:187.20s
820
+ [2025-11-30 09:00:27,788] - step:805/900 train_loss:0.9725 lr:0.0003000000 time/step:186.95s
821
+ [2025-11-30 09:03:33,451] - step:806/900 train_loss:0.9319 lr:0.0003000000 time/step:185.63s
822
+ [2025-11-30 09:06:38,728] - step:807/900 train_loss:0.9416 lr:0.0003000000 time/step:185.26s
823
+ [2025-11-30 09:09:45,243] - step:808/900 train_loss:0.9305 lr:0.0003000000 time/step:186.49s
824
+ [2025-11-30 09:12:51,277] - step:809/900 train_loss:0.9611 lr:0.0003000000 time/step:186.01s
825
+ [2025-11-30 09:15:57,601] - step:810/900 train_loss:0.9333 lr:0.0003000000 time/step:186.32s
826
+ [2025-11-30 09:19:05,758] - step:811/900 train_loss:0.9224 lr:0.0003000000 time/step:188.14s
827
+ [2025-11-30 09:22:13,002] - step:812/900 train_loss:0.9311 lr:0.0003000000 time/step:187.20s
828
+ [2025-11-30 09:25:19,895] - step:813/900 train_loss:0.9344 lr:0.0003000000 time/step:186.89s
829
+ [2025-11-30 09:28:27,220] - step:814/900 train_loss:0.9558 lr:0.0003000000 time/step:187.31s
830
+ [2025-11-30 09:31:34,977] - step:815/900 train_loss:0.9603 lr:0.0003000000 time/step:187.65s
831
+ [2025-11-30 09:34:41,403] - step:816/900 train_loss:0.9405 lr:0.0003000000 time/step:186.42s
832
+ [2025-11-30 09:37:48,995] - step:817/900 train_loss:0.9620 lr:0.0003000000 time/step:187.58s
833
+ [2025-11-30 09:40:56,468] - step:818/900 train_loss:0.9415 lr:0.0003000000 time/step:187.44s
834
+ [2025-11-30 09:44:02,893] - step:819/900 train_loss:0.9391 lr:0.0003000000 time/step:186.41s
835
+ [2025-11-30 09:47:08,977] - step:820/900 train_loss:0.9551 lr:0.0003000000 time/step:186.08s
836
+ [2025-11-30 09:50:18,823] - step:821/900 train_loss:0.9585 lr:0.0003000000 time/step:189.84s
837
+ [2025-11-30 09:53:24,812] - step:822/900 train_loss:0.9449 lr:0.0003000000 time/step:185.94s
838
+ [2025-11-30 09:56:30,627] - step:823/900 train_loss:0.9446 lr:0.0003000000 time/step:185.81s
839
+ [2025-11-30 09:59:38,002] - step:824/900 train_loss:0.9589 lr:0.0003000000 time/step:187.37s
840
+ [2025-11-30 10:02:45,105] - step:825/900 train_loss:0.9660 lr:0.0003000000 time/step:187.08s
841
+ [2025-11-30 10:05:50,874] - step:826/900 train_loss:0.9403 lr:0.0003000000 time/step:185.75s
842
+ [2025-11-30 10:08:57,938] - step:827/900 train_loss:0.9435 lr:0.0003000000 time/step:187.06s
843
+ [2025-11-30 10:12:06,575] - step:828/900 train_loss:0.9462 lr:0.0003000000 time/step:188.61s
844
+ [2025-11-30 10:15:12,880] - step:829/900 train_loss:0.9383 lr:0.0003000000 time/step:186.30s
845
+ [2025-11-30 10:18:19,887] - step:830/900 train_loss:0.9513 lr:0.0003000000 time/step:187.00s
846
+ [2025-11-30 10:21:26,611] - step:831/900 train_loss:0.9434 lr:0.0003000000 time/step:186.71s
847
+ [2025-11-30 10:24:32,173] - step:832/900 train_loss:0.9277 lr:0.0003000000 time/step:185.55s
848
+ [2025-11-30 10:27:38,320] - step:833/900 train_loss:0.9638 lr:0.0003000000 time/step:186.13s
849
+ [2025-11-30 10:30:45,530] - step:834/900 train_loss:0.9344 lr:0.0003000000 time/step:187.17s
850
+ [2025-11-30 10:33:51,305] - step:835/900 train_loss:0.9318 lr:0.0003000000 time/step:185.76s
851
+ [2025-11-30 10:36:56,815] - step:836/900 train_loss:0.9660 lr:0.0003000000 time/step:185.50s
852
+ [2025-11-30 10:40:03,772] - step:837/900 train_loss:0.9189 lr:0.0003000000 time/step:186.95s
853
+ [2025-11-30 10:43:10,385] - step:838/900 train_loss:0.9294 lr:0.0003000000 time/step:186.60s
854
+ [2025-11-30 10:46:16,106] - step:839/900 train_loss:0.9562 lr:0.0003000000 time/step:185.71s
855
+ [2025-11-30 10:49:23,701] - step:840/900 train_loss:0.9308 lr:0.0003000000 time/step:187.59s
856
+ [2025-11-30 10:52:31,109] - step:841/900 train_loss:0.9446 lr:0.0003000000 time/step:187.37s
857
+ [2025-11-30 10:55:36,766] - step:842/900 train_loss:0.9646 lr:0.0003000000 time/step:185.64s
858
+ [2025-11-30 10:58:43,991] - step:843/900 train_loss:0.9662 lr:0.0003000000 time/step:187.22s
859
+ [2025-11-30 11:01:51,907] - step:844/900 train_loss:0.9557 lr:0.0003000000 time/step:187.91s
860
+ [2025-11-30 11:04:58,972] - step:845/900 train_loss:0.9409 lr:0.0003000000 time/step:187.04s
861
+ [2025-11-30 11:08:05,417] - step:846/900 train_loss:0.9277 lr:0.0003000000 time/step:186.44s
862
+ [2025-11-30 11:11:12,807] - step:847/900 train_loss:0.9310 lr:0.0003000000 time/step:187.37s
863
+ [2025-11-30 11:14:18,599] - step:848/900 train_loss:0.9528 lr:0.0003000000 time/step:185.78s
864
+ [2025-11-30 11:17:24,283] - step:849/900 train_loss:0.9435 lr:0.0003000000 time/step:185.67s
865
+ [2025-11-30 11:20:29,958] - step:850/900 train_loss:0.9328 lr:0.0003000000 time/step:185.67s
866
+ [2025-11-30 11:23:37,238] - step:851/900 train_loss:0.9586 lr:0.0003000000 time/step:187.25s
867
+ [2025-11-30 11:26:43,509] - step:852/900 train_loss:0.9788 lr:0.0003000000 time/step:186.26s
868
+ [2025-11-30 11:29:50,577] - step:853/900 train_loss:0.9598 lr:0.0003000000 time/step:187.04s
869
+ [2025-11-30 11:32:58,592] - step:854/900 train_loss:0.9314 lr:0.0003000000 time/step:187.98s
870
+ [2025-11-30 11:36:05,368] - step:855/900 train_loss:0.9431 lr:0.0003000000 time/step:186.76s
871
+ [2025-11-30 11:39:14,068] - step:856/900 train_loss:0.9402 lr:0.0003000000 time/step:188.69s
872
+ [2025-11-30 11:42:21,149] - step:857/900 train_loss:0.9406 lr:0.0003000000 time/step:187.03s
873
+ [2025-11-30 11:45:27,269] - step:858/900 train_loss:0.9517 lr:0.0003000000 time/step:186.10s
874
+ [2025-11-30 11:48:33,589] - step:859/900 train_loss:0.9288 lr:0.0003000000 time/step:186.29s
875
+ [2025-11-30 11:51:41,526] - step:860/900 train_loss:0.9489 lr:0.0003000000 time/step:187.92s
876
+ [2025-11-30 11:54:48,310] - step:861/900 train_loss:0.9242 lr:0.0003000000 time/step:186.76s
877
+ [2025-11-30 11:57:55,433] - step:862/900 train_loss:0.9465 lr:0.0003000000 time/step:187.12s
878
+ [2025-11-30 12:01:02,214] - step:863/900 train_loss:0.9319 lr:0.0003000000 time/step:186.77s
879
+ [2025-11-30 12:04:10,157] - step:864/900 train_loss:0.9561 lr:0.0003000000 time/step:187.93s
880
+ [2025-11-30 12:07:16,580] - step:865/900 train_loss:0.9531 lr:0.0003000000 time/step:186.41s
881
+ [2025-11-30 12:10:24,225] - step:866/900 train_loss:0.9716 lr:0.0003000000 time/step:187.64s
882
+ [2025-11-30 12:13:32,116] - step:867/900 train_loss:0.9523 lr:0.0003000000 time/step:187.86s
883
+ [2025-11-30 12:16:38,751] - step:868/900 train_loss:0.9485 lr:0.0003000000 time/step:186.63s
884
+ [2025-11-30 12:19:46,482] - step:869/900 train_loss:0.9338 lr:0.0003000000 time/step:187.71s
885
+ [2025-11-30 12:22:55,768] - step:870/900 train_loss:0.9071 lr:0.0003000000 time/step:189.25s
886
+ [2025-11-30 12:26:03,017] - step:871/900 train_loss:0.9349 lr:0.0003000000 time/step:187.24s
887
+ [2025-11-30 12:29:11,277] - step:872/900 train_loss:0.9171 lr:0.0003000000 time/step:188.25s
888
+ [2025-11-30 12:32:17,975] - step:873/900 train_loss:0.9291 lr:0.0003000000 time/step:186.69s
889
+ [2025-11-30 12:35:23,652] - step:874/900 train_loss:0.9496 lr:0.0003000000 time/step:185.66s
890
+ [2025-11-30 12:38:30,634] - step:875/900 train_loss:0.9004 lr:0.0003000000 time/step:186.96s
891
+ [2025-11-30 12:41:36,335] - step:876/900 train_loss:0.9638 lr:0.0003000000 time/step:185.69s
892
+ [2025-11-30 12:44:43,698] - step:877/900 train_loss:0.9303 lr:0.0003000000 time/step:187.35s
893
+ [2025-11-30 12:47:49,883] - step:878/900 train_loss:0.9308 lr:0.0003000000 time/step:186.17s
894
+ [2025-11-30 12:50:57,119] - step:879/900 train_loss:0.9567 lr:0.0003000000 time/step:187.23s
895
+ [2025-11-30 12:54:05,570] - step:880/900 train_loss:0.9294 lr:0.0003000000 time/step:188.35s
896
+ [2025-11-30 12:57:11,908] - step:881/900 train_loss:0.9243 lr:0.0003000000 time/step:186.32s
897
+ [2025-11-30 13:00:18,512] - step:882/900 train_loss:0.9372 lr:0.0003000000 time/step:186.59s
898
+ [2025-11-30 13:03:25,956] - step:883/900 train_loss:0.9677 lr:0.0003000000 time/step:187.41s
899
+ [2025-11-30 13:06:32,030] - step:884/900 train_loss:0.9502 lr:0.0003000000 time/step:186.05s
900
+ [2025-11-30 13:09:38,746] - step:885/900 train_loss:0.9309 lr:0.0003000000 time/step:186.69s
901
+ [2025-11-30 13:12:45,777] - step:886/900 train_loss:0.9468 lr:0.0003000000 time/step:186.96s
902
+ [2025-11-30 13:15:51,552] - step:887/900 train_loss:0.9319 lr:0.0003000000 time/step:185.76s
903
+ [2025-11-30 13:18:58,191] - step:888/900 train_loss:0.9400 lr:0.0003000000 time/step:186.63s
904
+ [2025-11-30 13:22:04,951] - step:889/900 train_loss:0.9518 lr:0.0003000000 time/step:186.75s
905
+ [2025-11-30 13:25:13,370] - step:890/900 train_loss:0.9375 lr:0.0003000000 time/step:188.37s
906
+ [2025-11-30 13:28:19,675] - step:891/900 train_loss:0.9699 lr:0.0003000000 time/step:186.29s
907
+ [2025-11-30 13:31:27,143] - step:892/900 train_loss:0.9479 lr:0.0003000000 time/step:187.46s
908
+ [2025-11-30 13:34:34,338] - step:893/900 train_loss:0.9351 lr:0.0003000000 time/step:187.14s
909
+ [2025-11-30 13:37:40,472] - step:894/900 train_loss:0.9767 lr:0.0003000000 time/step:186.13s
910
+ [2025-11-30 13:40:48,083] - step:895/900 train_loss:0.9475 lr:0.0003000000 time/step:187.60s
911
+ [2025-11-30 13:43:55,818] - step:896/900 train_loss:0.9617 lr:0.0003000000 time/step:187.71s
912
+ [2025-11-30 13:47:01,872] - step:897/900 train_loss:0.9549 lr:0.0003000000 time/step:186.04s
913
+ [2025-11-30 13:50:09,170] - step:898/900 train_loss:0.9324 lr:0.0003000000 time/step:187.29s
914
+ [2025-11-30 13:53:18,377] - step:899/900 train_loss:0.9573 lr:0.0003000000 time/step:189.17s
915
+ [2025-11-30 13:56:26,957] - step:900/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@900.pt...
916
+ [2025-11-30 13:56:26,959] - step:900/900 train_loss:0.9387 lr:0.0003000000 time/step:186.63s
wandb/debug-internal.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-11-28T15:19:48.477578816Z","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/run-20251128_151948-j8dmy8fe/logs/debug-core.log"}
2
+ {"time":"2025-11-28T15:19:48.699009069Z","level":"INFO","msg":"created new stream","id":"j8dmy8fe"}
3
+ {"time":"2025-11-28T15:19:48.699097403Z","level":"INFO","msg":"stream: started","id":"j8dmy8fe"}
4
+ {"time":"2025-11-28T15:19:48.699172779Z","level":"INFO","msg":"writer: Do: started","stream_id":"j8dmy8fe"}
5
+ {"time":"2025-11-28T15:19:48.699248275Z","level":"INFO","msg":"handler: started","stream_id":"j8dmy8fe"}
6
+ {"time":"2025-11-28T15:19:48.699271222Z","level":"INFO","msg":"sender: started","stream_id":"j8dmy8fe"}
7
+ {"time":"2025-11-28T15:19:49.160995748Z","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-11-28T20:38:35.352286792Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
9
+ {"time":"2025-11-30T13:56:28.774510068Z","level":"INFO","msg":"stream: closing","id":"j8dmy8fe"}
10
+ {"time":"2025-11-30T13:56:28.784684257Z","level":"INFO","msg":"Stopping system monitor"}
11
+ {"time":"2025-11-30T13:56:29.045992888Z","level":"INFO","msg":"Stopped system monitor"}
12
+ {"time":"2025-11-30T13:56:30.296345529Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2025-11-30T13:56:30.566218828Z","level":"INFO","msg":"handler: closed","stream_id":"j8dmy8fe"}
14
+ {"time":"2025-11-30T13:56:30.566263739Z","level":"INFO","msg":"writer: Close: closed","stream_id":"j8dmy8fe"}
15
+ {"time":"2025-11-30T13:56:30.573657463Z","level":"INFO","msg":"sender: closed","stream_id":"j8dmy8fe"}
16
+ {"time":"2025-11-30T13:56:30.573732716Z","level":"INFO","msg":"stream: closed","id":"j8dmy8fe"}
wandb/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Configure stats pid to 3738330
3
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Loading settings from /home/agrivas/.config/wandb/settings
4
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Loading settings from /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/settings
5
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/run-20251128_151948-j8dmy8fe/logs/debug.log
7
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/run-20251128_151948-j8dmy8fe/logs/debug-internal.log
8
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:init():761] calling init triggers
9
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'compile': True, 'device': 'cuda', 'from_checkpoint': None, 'load_mtp_head_from_model': None, 'name': 'nanogpt', 'training': {'random_seed': 13, 'batch_size': 256, 'device_batch_size': 1, 'sequence_length': 8192, 'num_iterations': 900, 'learning_rate': 0.0003, 'use_scheduler': False, 'save_model': True, 'save_optimizer': True, 'save_model_every': 100, 'val_loss_every': 100, 'val_tokens': 4194304, 'expname': 'llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1'}, 'model': {'name': 'mtp', 'beta': 0.0, 'gamma': 1, 'kl_algorithm': 'full', 'kl_type': 'forward', 'model': {'_target_': 'mtp.models.mtp.MultiTokenLM', 'lm': '${lm.model}', 'circuit': '${circuit.model}', 'mt_head_kwargs': '${mt_head.hyperparameters}', 'init_from_lm_head': True, 'kl_type': '${model.kl_type}', 'kl_algorithm': '${model.kl_algorithm}', 'beta': 0, 'gamma': 0.9}}, 'circuit': {'name': 'btree', 'n_token': 8, 'n_component': 32, 'n_repetition': 1, 'model': {'_target_': 'mtp.models.circuits.CircuitModel', 'vocab_size': 268, 'n_token': 8, 'n_component': 32, 'n_repetition': 1, 'kind': 'btree'}}, 'mt_head': {'name': 'linear-evabyte', 'hyperparameters': {'type': 'evabyte', 'n_embd': 3072, 'transformer_n_head': 24, 'transformer_n_layer': 0, 'expander_type': 'linear', 'expander_n_layer': 1, 'freeze_vocab_unembedding': False, 'share_sum_weights': False, 'contextual_hmm_weights': True, 'init_hmm_identity': True}}, 'adaptor': {'name': 'none', 'hyperparameters': None}, 'lm': {'name': 'llama3-2-3b-byte', 'n_embd': 3072, 'n_head': 24, 'model': {'_target_': 'mtp.models.lm.LM', 'lm': None, 'encoder_only': True, 'from_checkpoint': None, 'from_huggingface': 'benjamin/Llama3-2-3B-IT-Byte', 'adaptor_kwargs': None, 'ref_enc': 'model', 'ref_head': 'lm_head', 'freeze': True}}, 'data': {'name': 'tulu3-llama3', 'train_bin': 'agrv/tulu-v3-sft-llama3-packed-seq-len-8192', 'val_bin': None, 'vocab_size': 268}, 'generate': {'speculative': False}, '_wandb': {}}
11
+ 2025-11-28 15:19:48,445 INFO MainThread:3738330 [wandb_init.py:init():784] starting backend
12
+ 2025-11-28 15:19:48,445 INFO MainThread:3738330 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-11-28 15:19:48,469 INFO MainThread:3738330 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-11-28 15:19:48,469 INFO MainThread:3738330 [wandb_init.py:init():798] backend started and connected
15
+ 2025-11-28 15:19:48,474 INFO MainThread:3738330 [wandb_init.py:init():891] updated telemetry
16
+ 2025-11-28 15:19:48,496 INFO MainThread:3738330 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-11-28 15:19:49,156 INFO MainThread:3738330 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-11-28 15:19:49,910 INFO MainThread:3738330 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-11-28 15:19:49,910 INFO MainThread:3738330 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-11-28 15:19:49,915 INFO MainThread:3738330 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-11-28 15:19:49,915 INFO MainThread:3738330 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-11-28 15:19:49,940 INFO MainThread:3738330 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-11-30 13:56:28,347 INFO MsgRouterThr:3738330 [mailbox.py:close():129] Closing mailbox, abandoning 1 handles.
wandb/run-20251128_151948-j8dmy8fe/files/config.yaml ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.8
4
+ m:
5
+ - "1": train/ce_loss_at_7
6
+ "5": 2
7
+ "6":
8
+ - 1
9
+ - 3
10
+ "7": []
11
+ - "1": global_step
12
+ "5": 2
13
+ "6":
14
+ - 1
15
+ - 3
16
+ "7": []
17
+ - "1": train/ce_loss_at_8
18
+ "5": 2
19
+ "6":
20
+ - 1
21
+ - 3
22
+ "7": []
23
+ - "1": train/ce_loss_at_4
24
+ "5": 2
25
+ "6":
26
+ - 1
27
+ - 3
28
+ "7": []
29
+ - "1": train/ce_loss_at_5
30
+ "5": 2
31
+ "6":
32
+ - 1
33
+ - 3
34
+ "7": []
35
+ - "1": train/ce_loss_at_6
36
+ "5": 2
37
+ "6":
38
+ - 1
39
+ - 3
40
+ "7": []
41
+ - "1": train/loss
42
+ "5": 2
43
+ "6":
44
+ - 1
45
+ - 3
46
+ "7": []
47
+ - "1": train/ce_loss_at_1
48
+ "5": 2
49
+ "6":
50
+ - 1
51
+ - 3
52
+ "7": []
53
+ - "1": train/ce_loss_at_2
54
+ "5": 2
55
+ "6":
56
+ - 1
57
+ - 3
58
+ "7": []
59
+ - "1": train/ce_loss_at_3
60
+ "5": 2
61
+ "6":
62
+ - 1
63
+ - 3
64
+ "7": []
65
+ python_version: 3.10.16
66
+ t:
67
+ "1":
68
+ - 1
69
+ - 11
70
+ - 41
71
+ - 49
72
+ - 50
73
+ - 51
74
+ - 55
75
+ - 71
76
+ - 84
77
+ - 98
78
+ "2":
79
+ - 1
80
+ - 11
81
+ - 41
82
+ - 49
83
+ - 50
84
+ - 51
85
+ - 55
86
+ - 71
87
+ - 84
88
+ - 98
89
+ "3":
90
+ - 7
91
+ - 13
92
+ - 15
93
+ - 16
94
+ - 23
95
+ - 55
96
+ "4": 3.10.16
97
+ "5": 0.19.8
98
+ "6": 4.49.0
99
+ "8":
100
+ - 5
101
+ "12": 0.19.8
102
+ "13": linux-x86_64
103
+ adaptor:
104
+ value:
105
+ hyperparameters: null
106
+ name: none
107
+ circuit:
108
+ value:
109
+ model:
110
+ _target_: mtp.models.circuits.CircuitModel
111
+ kind: btree
112
+ n_component: 32
113
+ n_repetition: 1
114
+ n_token: 8
115
+ vocab_size: 268
116
+ n_component: 32
117
+ n_repetition: 1
118
+ n_token: 8
119
+ name: btree
120
+ compile:
121
+ value: true
122
+ data:
123
+ value:
124
+ name: tulu3-llama3
125
+ train_bin: agrv/tulu-v3-sft-llama3-packed-seq-len-8192
126
+ val_bin: null
127
+ vocab_size: 268
128
+ device:
129
+ value: cuda
130
+ from_checkpoint:
131
+ value: null
132
+ generate:
133
+ value:
134
+ speculative: false
135
+ lm:
136
+ value:
137
+ model:
138
+ _target_: mtp.models.lm.LM
139
+ adaptor_kwargs: null
140
+ encoder_only: true
141
+ freeze: true
142
+ from_checkpoint: null
143
+ from_huggingface: benjamin/Llama3-2-3B-IT-Byte
144
+ lm: null
145
+ ref_enc: model
146
+ ref_head: lm_head
147
+ n_embd: 3072
148
+ n_head: 24
149
+ name: llama3-2-3b-byte
150
+ load_mtp_head_from_model:
151
+ value: null
152
+ model:
153
+ value:
154
+ beta: 0
155
+ gamma: 1
156
+ kl_algorithm: full
157
+ kl_type: forward
158
+ model:
159
+ _target_: mtp.models.mtp.MultiTokenLM
160
+ beta: 0
161
+ circuit: ${circuit.model}
162
+ gamma: 0.9
163
+ init_from_lm_head: true
164
+ kl_algorithm: ${model.kl_algorithm}
165
+ kl_type: ${model.kl_type}
166
+ lm: ${lm.model}
167
+ mt_head_kwargs: ${mt_head.hyperparameters}
168
+ name: mtp
169
+ mt_head:
170
+ value:
171
+ hyperparameters:
172
+ contextual_hmm_weights: true
173
+ expander_n_layer: 1
174
+ expander_type: linear
175
+ freeze_vocab_unembedding: false
176
+ init_hmm_identity: true
177
+ n_embd: 3072
178
+ share_sum_weights: false
179
+ transformer_n_head: 24
180
+ transformer_n_layer: 0
181
+ type: evabyte
182
+ name: linear-evabyte
183
+ name:
184
+ value: nanogpt
185
+ training:
186
+ value:
187
+ batch_size: 256
188
+ device_batch_size: 1
189
+ expname: llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1
190
+ learning_rate: 0.0003
191
+ num_iterations: 900
192
+ random_seed: 13
193
+ save_model: true
194
+ save_model_every: 100
195
+ save_optimizer: true
196
+ sequence_length: 8192
197
+ use_scheduler: false
198
+ val_loss_every: 100
199
+ val_tokens: 4194304
wandb/run-20251128_151948-j8dmy8fe/files/output.log ADDED
@@ -0,0 +1,951 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-11-28 15:19:49,942] - Saving config and checkpoints to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34...
2
+ [2025-11-28 15:19:49,942] - Save model: True...
3
+ [2025-11-28 15:19:49,943] - Save optimizer: True...
4
+ [2025-11-28 15:19:49,950] - Training on agrv/tulu-v3-sft-llama3-packed-seq-len-8192...
5
+ Generating train split: 100%|██████████| 237482/237482 [00:08<00:00, 28424.55 examples/s]
6
+ Setting num_proc from 20 back to 1 for the valid split to disable multiprocessing as it only contains one shard.
7
+ [2025-11-28 15:20:29,738] - Setting num_proc from 20 back to 1 for the valid split to disable multiprocessing as it only contains one shard.
8
+ Generating valid split: 100%|██████████| 2399/2399 [00:00<00:00, 5296.71 examples/s]
9
+ [2025-11-28 15:20:31,881] - step:0/900 Saving model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@0.pt...
10
+ /home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:90: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
11
+ return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
12
+ [rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] Graph break from `Tensor.item()`, consider setting:
13
+ [rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] torch._dynamo.config.capture_scalar_outputs = True
14
+ [rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] or:
15
+ [rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
16
+ [rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] to include these operations in the captured graph.
17
+ [rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0]
18
+ [rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] Graph break: from user code at:
19
+ [rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] File "/disk/scratch/agrivas/nanoGPT/mtp/models/mtp.py", line 212, in torch_dynamo_resume_in_forward_at_204
20
+ [rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] xxd = self.lm.encoder(
21
+ [rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] File "/disk/scratch/agrivas/nanoGPT/data/modules/transformers_modules/benjamin/Llama3-2-3B-IT-Byte/19d951e04213250d844131bce370ae9c752eb7e9/modelling_tpu_llama.py", line 939, in forward
22
+ [rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] expanded_input_ids = torch_expand_input_ids(
23
+ [rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] File "/disk/scratch/agrivas/nanoGPT/data/modules/transformers_modules/benjamin/Llama3-2-3B-IT-Byte/19d951e04213250d844131bce370ae9c752eb7e9/modelling_tpu_llama.py", line 71, in torch_expand_input_ids
24
+ [rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] last_maxlen_ids.insert(0, int(input_ids[example_idx][i] + 1))
25
+ [rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0]
26
+ [rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0]
27
+ /home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:194: UserWarning: TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance.
28
+ warnings.warn(
29
+ [rank0]:W1128 15:21:06.801000 3738330 torch/_dynamo/exc.py:304] [11/0] Backend compiler failed with a fake tensor exception at
30
+ [rank0]:W1128 15:21:06.801000 3738330 torch/_dynamo/exc.py:304] [11/0] File "/disk/scratch/agrivas/nanoGPT/mtp/utils/packing.py", line 37, in torch_dynamo_resume_in_packed_targets_to_target_windows_at_32
31
+ [rank0]:W1128 15:21:06.801000 3738330 torch/_dynamo/exc.py:304] [11/0] return torch.concat(parts, dim=0).reshape(B, S, n)
32
+ [rank0]:W1128 15:21:06.801000 3738330 torch/_dynamo/exc.py:304] [11/0] Adding a graph break.
33
+ [rank0]:W1128 15:21:06.894000 3738330 torch/_dynamo/exc.py:304] [11/0_1] Backend compiler failed with a fake tensor exception at
34
+ [rank0]:W1128 15:21:06.894000 3738330 torch/_dynamo/exc.py:304] [11/0_1] File "/disk/scratch/agrivas/nanoGPT/mtp/utils/packing.py", line 37, in torch_dynamo_resume_in_packed_targets_to_target_windows_at_32
35
+ [rank0]:W1128 15:21:06.894000 3738330 torch/_dynamo/exc.py:304] [11/0_1] return torch.concat(parts, dim=0).reshape(B, S, n)
36
+ [rank0]:W1128 15:21:06.894000 3738330 torch/_dynamo/exc.py:304] [11/0_1] Adding a graph break.
37
+ /home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:90: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
38
+ return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
39
+ /home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:90: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
40
+ return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
41
+ [2025-11-28 15:24:47,442] - step:1/900 train_loss:4.6532 lr:0.0003000000 time/step:254.93s
42
+ [2025-11-28 15:27:51,454] - step:2/900 train_loss:4.2966 lr:0.0003000000 time/step:184.01s
43
+ /home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:90: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
44
+ return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
45
+ [2025-11-28 15:30:57,835] - step:3/900 train_loss:3.9828 lr:0.0003000000 time/step:186.37s
46
+ [2025-11-28 15:34:08,017] - step:4/900 train_loss:3.6910 lr:0.0003000000 time/step:190.16s
47
+ [2025-11-28 15:37:15,451] - step:5/900 train_loss:3.4752 lr:0.0003000000 time/step:187.40s
48
+ [2025-11-28 15:40:23,150] - step:6/900 train_loss:3.3047 lr:0.0003000000 time/step:187.69s
49
+ [2025-11-28 15:43:30,606] - step:7/900 train_loss:3.1140 lr:0.0003000000 time/step:187.45s
50
+ [2025-11-28 15:46:34,854] - step:8/900 train_loss:2.9731 lr:0.0003000000 time/step:184.24s
51
+ [2025-11-28 15:49:37,899] - step:9/900 train_loss:2.8709 lr:0.0003000000 time/step:183.04s
52
+ [2025-11-28 15:52:42,161] - step:10/900 train_loss:2.7582 lr:0.0003000000 time/step:184.25s
53
+ [2025-11-28 15:55:49,659] - step:11/900 train_loss:2.6474 lr:0.0003000000 time/step:187.49s
54
+ [2025-11-28 15:58:56,730] - step:12/900 train_loss:2.5890 lr:0.0003000000 time/step:187.06s
55
+ [2025-11-28 16:02:04,893] - step:13/900 train_loss:2.5418 lr:0.0003000000 time/step:188.16s
56
+ [2025-11-28 16:05:11,642] - step:14/900 train_loss:2.4586 lr:0.0003000000 time/step:186.74s
57
+ [2025-11-28 16:08:18,599] - step:15/900 train_loss:2.3908 lr:0.0003000000 time/step:186.94s
58
+ [2025-11-28 16:11:24,210] - step:16/900 train_loss:2.3323 lr:0.0003000000 time/step:185.60s
59
+ [2025-11-28 16:14:28,421] - step:17/900 train_loss:2.2802 lr:0.0003000000 time/step:184.20s
60
+ [2025-11-28 16:17:31,315] - step:18/900 train_loss:2.2268 lr:0.0003000000 time/step:182.88s
61
+ [2025-11-28 16:20:33,212] - step:19/900 train_loss:2.2212 lr:0.0003000000 time/step:181.88s
62
+ [2025-11-28 16:23:39,339] - step:20/900 train_loss:2.1965 lr:0.0003000000 time/step:186.12s
63
+ [2025-11-28 16:26:46,366] - step:21/900 train_loss:2.1549 lr:0.0003000000 time/step:187.01s
64
+ [2025-11-28 16:29:53,452] - step:22/900 train_loss:2.0844 lr:0.0003000000 time/step:187.07s
65
+ [2025-11-28 16:33:01,636] - step:23/900 train_loss:2.0673 lr:0.0003000000 time/step:188.18s
66
+ [2025-11-28 16:36:09,091] - step:24/900 train_loss:2.0375 lr:0.0003000000 time/step:187.44s
67
+ [2025-11-28 16:39:16,657] - step:25/900 train_loss:2.0299 lr:0.0003000000 time/step:187.55s
68
+ [2025-11-28 16:42:25,561] - step:26/900 train_loss:1.9910 lr:0.0003000000 time/step:188.90s
69
+ [2025-11-28 16:45:33,283] - step:27/900 train_loss:1.9708 lr:0.0003000000 time/step:187.68s
70
+ [2025-11-28 16:48:40,284] - step:28/900 train_loss:1.9105 lr:0.0003000000 time/step:186.98s
71
+ [2025-11-28 16:51:47,205] - step:29/900 train_loss:1.9014 lr:0.0003000000 time/step:186.92s
72
+ [2025-11-28 16:54:56,592] - step:30/900 train_loss:1.8643 lr:0.0003000000 time/step:189.38s
73
+ [2025-11-28 16:58:04,452] - step:31/900 train_loss:1.8593 lr:0.0003000000 time/step:187.84s
74
+ [2025-11-28 17:01:11,681] - step:32/900 train_loss:1.8733 lr:0.0003000000 time/step:187.21s
75
+ [2025-11-28 17:04:19,862] - step:33/900 train_loss:1.7975 lr:0.0003000000 time/step:188.17s
76
+ [2025-11-28 17:07:27,610] - step:34/900 train_loss:1.8307 lr:0.0003000000 time/step:187.74s
77
+ [2025-11-28 17:10:35,249] - step:35/900 train_loss:1.8018 lr:0.0003000000 time/step:187.63s
78
+ [2025-11-28 17:13:46,950] - step:36/900 train_loss:1.8066 lr:0.0003000000 time/step:191.69s
79
+ [2025-11-28 17:16:53,853] - step:37/900 train_loss:1.7636 lr:0.0003000000 time/step:186.82s
80
+ [2025-11-28 17:20:00,571] - step:38/900 train_loss:1.7714 lr:0.0003000000 time/step:186.54s
81
+ [2025-11-28 17:23:13,248] - step:39/900 train_loss:1.7096 lr:0.0003000000 time/step:192.65s
82
+ [2025-11-28 17:26:19,575] - step:40/900 train_loss:1.7411 lr:0.0003000000 time/step:186.29s
83
+ [2025-11-28 17:29:25,904] - step:41/900 train_loss:1.6913 lr:0.0003000000 time/step:186.27s
84
+ [2025-11-28 17:32:44,977] - step:42/900 train_loss:1.7001 lr:0.0003000000 time/step:199.05s
85
+ [2025-11-28 17:35:51,243] - step:43/900 train_loss:1.6629 lr:0.0003000000 time/step:186.21s
86
+ [2025-11-28 17:38:57,339] - step:44/900 train_loss:1.6610 lr:0.0003000000 time/step:185.79s
87
+ [2025-11-28 17:42:05,062] - step:45/900 train_loss:1.6524 lr:0.0003000000 time/step:187.68s
88
+ [2025-11-28 17:45:20,648] - step:46/900 train_loss:1.6555 lr:0.0003000000 time/step:195.50s
89
+ [2025-11-28 17:48:26,366] - step:47/900 train_loss:1.6223 lr:0.0003000000 time/step:185.70s
90
+ [2025-11-28 17:51:34,666] - step:48/900 train_loss:1.6481 lr:0.0003000000 time/step:188.12s
91
+ [2025-11-28 17:54:51,245] - step:49/900 train_loss:1.6112 lr:0.0003000000 time/step:196.52s
92
+ [2025-11-28 17:57:57,507] - step:50/900 train_loss:1.6013 lr:0.0003000000 time/step:186.19s
93
+ [2025-11-28 18:01:05,674] - step:51/900 train_loss:1.5772 lr:0.0003000000 time/step:187.99s
94
+ [2025-11-28 18:04:21,278] - step:52/900 train_loss:1.5660 lr:0.0003000000 time/step:195.58s
95
+ [2025-11-28 18:07:27,447] - step:53/900 train_loss:1.5702 lr:0.0003000000 time/step:186.11s
96
+ [2025-11-28 18:10:33,793] - step:54/900 train_loss:1.5665 lr:0.0003000000 time/step:186.26s
97
+ [2025-11-28 18:13:53,962] - step:55/900 train_loss:1.5804 lr:0.0003000000 time/step:200.15s
98
+ [2025-11-28 18:17:00,673] - step:56/900 train_loss:1.5645 lr:0.0003000000 time/step:186.66s
99
+ [2025-11-28 18:20:06,961] - step:57/900 train_loss:1.5609 lr:0.0003000000 time/step:186.23s
100
+ [2025-11-28 18:23:24,919] - step:58/900 train_loss:1.5356 lr:0.0003000000 time/step:197.90s
101
+ [2025-11-28 18:26:31,137] - step:59/900 train_loss:1.5277 lr:0.0003000000 time/step:186.18s
102
+ [2025-11-28 18:29:37,442] - step:60/900 train_loss:1.5330 lr:0.0003000000 time/step:186.22s
103
+ [2025-11-28 18:32:45,572] - step:61/900 train_loss:1.5127 lr:0.0003000000 time/step:188.07s
104
+ [2025-11-28 18:36:01,349] - step:62/900 train_loss:1.5127 lr:0.0003000000 time/step:195.75s
105
+ [2025-11-28 18:39:08,044] - step:63/900 train_loss:1.5255 lr:0.0003000000 time/step:186.63s
106
+ [2025-11-28 18:42:16,514] - step:64/900 train_loss:1.4881 lr:0.0003000000 time/step:188.39s
107
+ [2025-11-28 18:45:32,575] - step:65/900 train_loss:1.4746 lr:0.0003000000 time/step:196.00s
108
+ [2025-11-28 18:48:39,543] - step:66/900 train_loss:1.5017 lr:0.0003000000 time/step:186.89s
109
+ [2025-11-28 18:51:47,768] - step:67/900 train_loss:1.4805 lr:0.0003000000 time/step:188.07s
110
+ [2025-11-28 18:55:03,564] - step:68/900 train_loss:1.4929 lr:0.0003000000 time/step:195.75s
111
+ [2025-11-28 18:58:10,293] - step:69/900 train_loss:1.4550 lr:0.0003000000 time/step:186.67s
112
+ [2025-11-28 19:01:16,800] - step:70/900 train_loss:1.4532 lr:0.0003000000 time/step:186.44s
113
+ [2025-11-28 19:04:26,999] - step:71/900 train_loss:1.4520 lr:0.0003000000 time/step:190.18s
114
+ [2025-11-28 19:07:33,259] - step:72/900 train_loss:1.4301 lr:0.0003000000 time/step:186.22s
115
+ [2025-11-28 19:10:39,471] - step:73/900 train_loss:1.4337 lr:0.0003000000 time/step:186.20s
116
+ [2025-11-28 19:13:47,822] - step:74/900 train_loss:1.4296 lr:0.0003000000 time/step:188.33s
117
+ [2025-11-28 19:16:53,884] - step:75/900 train_loss:1.4294 lr:0.0003000000 time/step:186.04s
118
+ [2025-11-28 19:19:59,845] - step:76/900 train_loss:1.4367 lr:0.0003000000 time/step:185.94s
119
+ [2025-11-28 19:23:05,617] - step:77/900 train_loss:1.4359 lr:0.0003000000 time/step:185.76s
120
+ [2025-11-28 19:26:13,471] - step:78/900 train_loss:1.3907 lr:0.0003000000 time/step:187.84s
121
+ [2025-11-28 19:29:19,325] - step:79/900 train_loss:1.4074 lr:0.0003000000 time/step:185.83s
122
+ [2025-11-28 19:32:24,915] - step:80/900 train_loss:1.3818 lr:0.0003000000 time/step:185.57s
123
+ [2025-11-28 19:35:32,821] - step:81/900 train_loss:1.3966 lr:0.0003000000 time/step:187.89s
124
+ [2025-11-28 19:38:38,468] - step:82/900 train_loss:1.3767 lr:0.0003000000 time/step:185.62s
125
+ [2025-11-28 19:41:44,296] - step:83/900 train_loss:1.3772 lr:0.0003000000 time/step:185.82s
126
+ [2025-11-28 19:44:52,361] - step:84/900 train_loss:1.3639 lr:0.0003000000 time/step:188.06s
127
+ [2025-11-28 19:47:59,370] - step:85/900 train_loss:1.3910 lr:0.0003000000 time/step:186.99s
128
+ [2025-11-28 19:51:05,447] - step:86/900 train_loss:1.4013 lr:0.0003000000 time/step:186.07s
129
+ [2025-11-28 19:54:13,032] - step:87/900 train_loss:1.3883 lr:0.0003000000 time/step:187.58s
130
+ [2025-11-28 19:57:19,138] - step:88/900 train_loss:1.3712 lr:0.0003000000 time/step:186.09s
131
+ [2025-11-28 20:00:25,142] - step:89/900 train_loss:1.3749 lr:0.0003000000 time/step:185.98s
132
+ [2025-11-28 20:03:30,825] - step:90/900 train_loss:1.3630 lr:0.0003000000 time/step:185.67s
133
+ [2025-11-28 20:06:38,585] - step:91/900 train_loss:1.3713 lr:0.0003000000 time/step:187.75s
134
+ [2025-11-28 20:09:44,867] - step:92/900 train_loss:1.3503 lr:0.0003000000 time/step:186.27s
135
+ [2025-11-28 20:12:50,830] - step:93/900 train_loss:1.3537 lr:0.0003000000 time/step:185.94s
136
+ [2025-11-28 20:15:58,624] - step:94/900 train_loss:1.3468 lr:0.0003000000 time/step:187.79s
137
+ [2025-11-28 20:19:04,543] - step:95/900 train_loss:1.3603 lr:0.0003000000 time/step:185.91s
138
+ [2025-11-28 20:22:10,848] - step:96/900 train_loss:1.3216 lr:0.0003000000 time/step:186.29s
139
+ [2025-11-28 20:25:17,756] - step:97/900 train_loss:1.3276 lr:0.0003000000 time/step:186.90s
140
+ [2025-11-28 20:28:22,895] - step:98/900 train_loss:1.3128 lr:0.0003000000 time/step:185.09s
141
+ [2025-11-28 20:31:28,093] - step:99/900 train_loss:1.3014 lr:0.0003000000 time/step:185.13s
142
+ [2025-11-28 20:34:37,788] - step:100/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@100.pt...
143
+ [2025-11-28 20:34:37,813] - step:100/900 train_loss:1.3411 lr:0.0003000000 time/step:187.79s
144
+ [2025-11-28 20:37:43,371] - step:101/900 train_loss:1.3414 lr:0.0003000000 time/step:185.55s
145
+ [2025-11-28 20:40:49,246] - step:102/900 train_loss:1.3098 lr:0.0003000000 time/step:185.84s
146
+ [2025-11-28 20:43:55,147] - step:103/900 train_loss:1.3077 lr:0.0003000000 time/step:185.90s
147
+ [2025-11-28 20:47:03,589] - step:104/900 train_loss:1.3283 lr:0.0003000000 time/step:188.43s
148
+ [2025-11-28 20:50:09,456] - step:105/900 train_loss:1.3107 lr:0.0003000000 time/step:185.85s
149
+ [2025-11-28 20:53:15,133] - step:106/900 train_loss:1.3116 lr:0.0003000000 time/step:185.65s
150
+ [2025-11-28 20:56:23,079] - step:107/900 train_loss:1.3076 lr:0.0003000000 time/step:187.94s
151
+ [2025-11-28 20:59:29,309] - step:108/900 train_loss:1.2576 lr:0.0003000000 time/step:186.19s
152
+ [2025-11-28 21:02:34,096] - step:109/900 train_loss:1.3163 lr:0.0003000000 time/step:184.77s
153
+ [2025-11-28 21:05:38,534] - step:110/900 train_loss:1.2836 lr:0.0003000000 time/step:184.43s
154
+ [2025-11-28 21:08:41,909] - step:111/900 train_loss:1.2887 lr:0.0003000000 time/step:183.34s
155
+ [2025-11-28 21:11:44,652] - step:112/900 train_loss:1.2900 lr:0.0003000000 time/step:182.72s
156
+ [2025-11-28 21:14:49,050] - step:113/900 train_loss:1.3032 lr:0.0003000000 time/step:184.39s
157
+ [2025-11-28 21:17:51,714] - step:114/900 train_loss:1.2715 lr:0.0003000000 time/step:182.65s
158
+ [2025-11-28 21:20:54,366] - step:115/900 train_loss:1.2553 lr:0.0003000000 time/step:182.64s
159
+ [2025-11-28 21:23:58,585] - step:116/900 train_loss:1.2608 lr:0.0003000000 time/step:184.21s
160
+ [2025-11-28 21:27:05,711] - step:117/900 train_loss:1.2750 lr:0.0003000000 time/step:187.12s
161
+ [2025-11-28 21:30:10,632] - step:118/900 train_loss:1.2610 lr:0.0003000000 time/step:184.91s
162
+ [2025-11-28 21:33:15,980] - step:119/900 train_loss:1.2728 lr:0.0003000000 time/step:185.32s
163
+ [2025-11-28 21:36:22,993] - step:120/900 train_loss:1.2367 lr:0.0003000000 time/step:187.01s
164
+ [2025-11-28 21:39:27,798] - step:121/900 train_loss:1.2436 lr:0.0003000000 time/step:184.79s
165
+ [2025-11-28 21:42:32,716] - step:122/900 train_loss:1.2680 lr:0.0003000000 time/step:184.90s
166
+ [2025-11-28 21:45:39,837] - step:123/900 train_loss:1.2459 lr:0.0003000000 time/step:187.11s
167
+ [2025-11-28 21:48:44,604] - step:124/900 train_loss:1.2356 lr:0.0003000000 time/step:184.76s
168
+ [2025-11-28 21:51:49,462] - step:125/900 train_loss:1.2116 lr:0.0003000000 time/step:184.84s
169
+ [2025-11-28 21:54:56,152] - step:126/900 train_loss:1.2271 lr:0.0003000000 time/step:186.68s
170
+ [2025-11-28 21:58:02,698] - step:127/900 train_loss:1.2747 lr:0.0003000000 time/step:186.53s
171
+ [2025-11-28 22:01:07,919] - step:128/900 train_loss:1.2662 lr:0.0003000000 time/step:185.21s
172
+ [2025-11-28 22:04:13,473] - step:129/900 train_loss:1.2508 lr:0.0003000000 time/step:185.54s
173
+ [2025-11-28 22:07:19,897] - step:130/900 train_loss:1.2417 lr:0.0003000000 time/step:186.41s
174
+ [2025-11-28 22:10:24,163] - step:131/900 train_loss:1.2469 lr:0.0003000000 time/step:184.26s
175
+ [2025-11-28 22:13:29,588] - step:132/900 train_loss:1.2212 lr:0.0003000000 time/step:185.42s
176
+ [2025-11-28 22:16:36,724] - step:133/900 train_loss:1.2154 lr:0.0003000000 time/step:187.11s
177
+ [2025-11-28 22:19:41,361] - step:134/900 train_loss:1.1905 lr:0.0003000000 time/step:184.62s
178
+ [2025-11-28 22:22:46,426] - step:135/900 train_loss:1.2090 lr:0.0003000000 time/step:185.04s
179
+ [2025-11-28 22:25:53,482] - step:136/900 train_loss:1.2180 lr:0.0003000000 time/step:187.04s
180
+ [2025-11-28 22:28:58,396] - step:137/900 train_loss:1.2309 lr:0.0003000000 time/step:184.90s
181
+ [2025-11-28 22:32:02,953] - step:138/900 train_loss:1.2127 lr:0.0003000000 time/step:184.53s
182
+ [2025-11-28 22:35:08,685] - step:139/900 train_loss:1.2126 lr:0.0003000000 time/step:185.71s
183
+ [2025-11-28 22:38:15,825] - step:140/900 train_loss:1.2117 lr:0.0003000000 time/step:187.09s
184
+ [2025-11-28 22:41:20,366] - step:141/900 train_loss:1.2301 lr:0.0003000000 time/step:184.53s
185
+ [2025-11-28 22:44:24,896] - step:142/900 train_loss:1.2388 lr:0.0003000000 time/step:184.52s
186
+ [2025-11-28 22:47:31,624] - step:143/900 train_loss:1.1987 lr:0.0003000000 time/step:186.71s
187
+ [2025-11-28 22:50:37,358] - step:144/900 train_loss:1.2210 lr:0.0003000000 time/step:185.73s
188
+ [2025-11-28 22:53:43,613] - step:145/900 train_loss:1.2170 lr:0.0003000000 time/step:186.22s
189
+ [2025-11-28 22:57:06,629] - step:146/900 train_loss:1.2236 lr:0.0003000000 time/step:203.01s
190
+ [2025-11-28 23:00:09,814] - step:147/900 train_loss:1.2255 lr:0.0003000000 time/step:183.18s
191
+ [2025-11-28 23:03:14,149] - step:148/900 train_loss:1.1806 lr:0.0003000000 time/step:184.31s
192
+ [2025-11-28 23:06:23,397] - step:149/900 train_loss:1.2233 lr:0.0003000000 time/step:189.23s
193
+ [2025-11-28 23:09:30,162] - step:150/900 train_loss:1.1677 lr:0.0003000000 time/step:186.75s
194
+ [2025-11-28 23:12:34,786] - step:151/900 train_loss:1.2155 lr:0.0003000000 time/step:184.59s
195
+ [2025-11-28 23:15:41,431] - step:152/900 train_loss:1.1948 lr:0.0003000000 time/step:186.63s
196
+ [2025-11-28 23:18:47,806] - step:153/900 train_loss:1.1950 lr:0.0003000000 time/step:186.35s
197
+ [2025-11-28 23:21:52,115] - step:154/900 train_loss:1.2133 lr:0.0003000000 time/step:184.28s
198
+ [2025-11-28 23:24:56,981] - step:155/900 train_loss:1.1862 lr:0.0003000000 time/step:184.85s
199
+ [2025-11-28 23:28:03,290] - step:156/900 train_loss:1.1699 lr:0.0003000000 time/step:186.29s
200
+ [2025-11-28 23:31:07,306] - step:157/900 train_loss:1.1773 lr:0.0003000000 time/step:184.00s
201
+ [2025-11-28 23:34:12,414] - step:158/900 train_loss:1.1680 lr:0.0003000000 time/step:185.10s
202
+ [2025-11-28 23:37:19,900] - step:159/900 train_loss:1.1806 lr:0.0003000000 time/step:187.45s
203
+ [2025-11-28 23:40:24,615] - step:160/900 train_loss:1.1865 lr:0.0003000000 time/step:184.70s
204
+ [2025-11-28 23:43:29,245] - step:161/900 train_loss:1.1872 lr:0.0003000000 time/step:184.61s
205
+ [2025-11-28 23:46:36,811] - step:162/900 train_loss:1.1806 lr:0.0003000000 time/step:187.56s
206
+ [2025-11-28 23:49:41,637] - step:163/900 train_loss:1.1750 lr:0.0003000000 time/step:184.79s
207
+ [2025-11-28 23:52:45,829] - step:164/900 train_loss:1.1828 lr:0.0003000000 time/step:184.16s
208
+ [2025-11-28 23:55:50,721] - step:165/900 train_loss:1.1742 lr:0.0003000000 time/step:184.88s
209
+ [2025-11-28 23:58:57,667] - step:166/900 train_loss:1.1655 lr:0.0003000000 time/step:186.93s
210
+ [2025-11-29 00:02:02,656] - step:167/900 train_loss:1.1631 lr:0.0003000000 time/step:184.97s
211
+ [2025-11-29 00:05:08,306] - step:168/900 train_loss:1.1614 lr:0.0003000000 time/step:185.63s
212
+ [2025-11-29 00:08:15,208] - step:169/900 train_loss:1.1613 lr:0.0003000000 time/step:186.89s
213
+ [2025-11-29 00:11:19,829] - step:170/900 train_loss:1.1623 lr:0.0003000000 time/step:184.60s
214
+ [2025-11-29 00:14:25,137] - step:171/900 train_loss:1.1538 lr:0.0003000000 time/step:185.30s
215
+ [2025-11-29 00:17:32,364] - step:172/900 train_loss:1.1782 lr:0.0003000000 time/step:187.22s
216
+ [2025-11-29 00:20:37,216] - step:173/900 train_loss:1.1596 lr:0.0003000000 time/step:184.84s
217
+ [2025-11-29 00:23:42,361] - step:174/900 train_loss:1.1381 lr:0.0003000000 time/step:185.12s
218
+ [2025-11-29 00:26:49,327] - step:175/900 train_loss:1.1305 lr:0.0003000000 time/step:186.96s
219
+ [2025-11-29 00:29:54,460] - step:176/900 train_loss:1.1603 lr:0.0003000000 time/step:185.12s
220
+ [2025-11-29 00:32:59,491] - step:177/900 train_loss:1.1435 lr:0.0003000000 time/step:185.01s
221
+ [2025-11-29 00:36:04,756] - step:178/900 train_loss:1.1653 lr:0.0003000000 time/step:185.25s
222
+ [2025-11-29 00:39:11,804] - step:179/900 train_loss:1.1443 lr:0.0003000000 time/step:187.04s
223
+ [2025-11-29 00:42:16,834] - step:180/900 train_loss:1.1554 lr:0.0003000000 time/step:185.01s
224
+ [2025-11-29 00:45:22,795] - step:181/900 train_loss:1.1495 lr:0.0003000000 time/step:185.95s
225
+ [2025-11-29 00:48:30,739] - step:182/900 train_loss:1.1251 lr:0.0003000000 time/step:187.94s
226
+ [2025-11-29 00:51:34,795] - step:183/900 train_loss:1.1323 lr:0.0003000000 time/step:184.04s
227
+ [2025-11-29 00:54:39,599] - step:184/900 train_loss:1.1293 lr:0.0003000000 time/step:184.80s
228
+ [2025-11-29 00:57:45,600] - step:185/900 train_loss:1.1500 lr:0.0003000000 time/step:185.99s
229
+ [2025-11-29 01:00:49,413] - step:186/900 train_loss:1.1429 lr:0.0003000000 time/step:183.79s
230
+ [2025-11-29 01:03:54,362] - step:187/900 train_loss:1.1384 lr:0.0003000000 time/step:184.93s
231
+ [2025-11-29 01:07:01,673] - step:188/900 train_loss:1.1665 lr:0.0003000000 time/step:187.31s
232
+ [2025-11-29 01:10:06,793] - step:189/900 train_loss:1.1470 lr:0.0003000000 time/step:185.10s
233
+ [2025-11-29 01:13:11,822] - step:190/900 train_loss:1.1562 lr:0.0003000000 time/step:185.00s
234
+ [2025-11-29 01:16:16,209] - step:191/900 train_loss:1.1811 lr:0.0003000000 time/step:184.37s
235
+ [2025-11-29 01:19:22,340] - step:192/900 train_loss:1.1471 lr:0.0003000000 time/step:186.13s
236
+ [2025-11-29 01:22:26,519] - step:193/900 train_loss:1.1428 lr:0.0003000000 time/step:184.15s
237
+ [2025-11-29 01:25:31,429] - step:194/900 train_loss:1.1208 lr:0.0003000000 time/step:184.89s
238
+ [2025-11-29 01:28:36,974] - step:195/900 train_loss:1.1308 lr:0.0003000000 time/step:185.54s
239
+ [2025-11-29 01:31:40,544] - step:196/900 train_loss:1.1228 lr:0.0003000000 time/step:183.54s
240
+ [2025-11-29 01:34:45,938] - step:197/900 train_loss:1.1161 lr:0.0003000000 time/step:185.38s
241
+ [2025-11-29 01:37:53,156] - step:198/900 train_loss:1.1478 lr:0.0003000000 time/step:187.21s
242
+ [2025-11-29 01:40:58,171] - step:199/900 train_loss:1.1103 lr:0.0003000000 time/step:184.99s
243
+ [2025-11-29 01:44:05,489] - step:200/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@200.pt...
244
+ [2025-11-29 01:44:05,525] - step:200/900 train_loss:1.1274 lr:0.0003000000 time/step:185.55s
245
+ [2025-11-29 01:47:14,488] - step:201/900 train_loss:1.1234 lr:0.0003000000 time/step:188.94s
246
+ [2025-11-29 01:50:21,411] - step:202/900 train_loss:1.1199 lr:0.0003000000 time/step:186.91s
247
+ [2025-11-29 01:53:26,628] - step:203/900 train_loss:1.0972 lr:0.0003000000 time/step:185.20s
248
+ [2025-11-29 01:56:32,570] - step:204/900 train_loss:1.1371 lr:0.0003000000 time/step:185.92s
249
+ [2025-11-29 01:59:41,140] - step:205/900 train_loss:1.1408 lr:0.0003000000 time/step:188.56s
250
+ [2025-11-29 02:02:45,379] - step:206/900 train_loss:1.0997 lr:0.0003000000 time/step:184.22s
251
+ [2025-11-29 02:05:50,066] - step:207/900 train_loss:1.1332 lr:0.0003000000 time/step:184.67s
252
+ [2025-11-29 02:08:56,311] - step:208/900 train_loss:1.1209 lr:0.0003000000 time/step:186.24s
253
+ [2025-11-29 02:12:00,567] - step:209/900 train_loss:1.0919 lr:0.0003000000 time/step:184.22s
254
+ [2025-11-29 02:15:04,792] - step:210/900 train_loss:1.1005 lr:0.0003000000 time/step:184.22s
255
+ [2025-11-29 02:18:10,654] - step:211/900 train_loss:1.1036 lr:0.0003000000 time/step:185.86s
256
+ [2025-11-29 02:21:14,585] - step:212/900 train_loss:1.1229 lr:0.0003000000 time/step:183.92s
257
+ [2025-11-29 02:24:19,368] - step:213/900 train_loss:1.1051 lr:0.0003000000 time/step:184.77s
258
+ [2025-11-29 02:27:26,145] - step:214/900 train_loss:1.1085 lr:0.0003000000 time/step:186.77s
259
+ [2025-11-29 02:30:30,712] - step:215/900 train_loss:1.0930 lr:0.0003000000 time/step:184.56s
260
+ [2025-11-29 02:33:34,774] - step:216/900 train_loss:1.0977 lr:0.0003000000 time/step:184.05s
261
+ [2025-11-29 02:36:40,292] - step:217/900 train_loss:1.1187 lr:0.0003000000 time/step:185.51s
262
+ [2025-11-29 02:39:49,043] - step:218/900 train_loss:1.0909 lr:0.0003000000 time/step:188.73s
263
+ [2025-11-29 02:42:54,991] - step:219/900 train_loss:1.1056 lr:0.0003000000 time/step:185.90s
264
+ [2025-11-29 02:46:00,394] - step:220/900 train_loss:1.1048 lr:0.0003000000 time/step:185.40s
265
+ [2025-11-29 02:49:07,579] - step:221/900 train_loss:1.1078 lr:0.0003000000 time/step:187.17s
266
+ [2025-11-29 02:52:12,146] - step:222/900 train_loss:1.1114 lr:0.0003000000 time/step:184.54s
267
+ [2025-11-29 02:55:16,480] - step:223/900 train_loss:1.1062 lr:0.0003000000 time/step:184.32s
268
+ [2025-11-29 02:58:22,768] - step:224/900 train_loss:1.1142 lr:0.0003000000 time/step:186.28s
269
+ [2025-11-29 03:01:26,953] - step:225/900 train_loss:1.0961 lr:0.0003000000 time/step:184.17s
270
+ [2025-11-29 03:04:31,749] - step:226/900 train_loss:1.0917 lr:0.0003000000 time/step:184.78s
271
+ [2025-11-29 03:07:38,529] - step:227/900 train_loss:1.0934 lr:0.0003000000 time/step:186.77s
272
+ [2025-11-29 03:10:43,271] - step:228/900 train_loss:1.1069 lr:0.0003000000 time/step:184.70s
273
+ [2025-11-29 03:13:48,167] - step:229/900 train_loss:1.0734 lr:0.0003000000 time/step:184.88s
274
+ [2025-11-29 03:16:52,812] - step:230/900 train_loss:1.0957 lr:0.0003000000 time/step:184.63s
275
+ [2025-11-29 03:19:58,801] - step:231/900 train_loss:1.0775 lr:0.0003000000 time/step:185.98s
276
+ [2025-11-29 03:23:02,987] - step:232/900 train_loss:1.0926 lr:0.0003000000 time/step:184.16s
277
+ [2025-11-29 03:26:08,180] - step:233/900 train_loss:1.1314 lr:0.0003000000 time/step:185.19s
278
+ [2025-11-29 03:29:14,462] - step:234/900 train_loss:1.0868 lr:0.0003000000 time/step:186.28s
279
+ [2025-11-29 03:32:19,081] - step:235/900 train_loss:1.0808 lr:0.0003000000 time/step:184.59s
280
+ [2025-11-29 03:35:24,243] - step:236/900 train_loss:1.0749 lr:0.0003000000 time/step:185.16s
281
+ [2025-11-29 03:38:31,254] - step:237/900 train_loss:1.1269 lr:0.0003000000 time/step:187.01s
282
+ [2025-11-29 03:41:35,966] - step:238/900 train_loss:1.0924 lr:0.0003000000 time/step:184.69s
283
+ [2025-11-29 03:44:41,260] - step:239/900 train_loss:1.0906 lr:0.0003000000 time/step:185.27s
284
+ [2025-11-29 03:47:49,206] - step:240/900 train_loss:1.0918 lr:0.0003000000 time/step:187.94s
285
+ [2025-11-29 03:50:54,694] - step:241/900 train_loss:1.0946 lr:0.0003000000 time/step:185.46s
286
+ [2025-11-29 03:53:59,535] - step:242/900 train_loss:1.1074 lr:0.0003000000 time/step:184.80s
287
+ [2025-11-29 03:57:04,220] - step:243/900 train_loss:1.0943 lr:0.0003000000 time/step:184.67s
288
+ [2025-11-29 04:00:10,432] - step:244/900 train_loss:1.0711 lr:0.0003000000 time/step:186.21s
289
+ [2025-11-29 04:03:15,729] - step:245/900 train_loss:1.1061 lr:0.0003000000 time/step:185.26s
290
+ [2025-11-29 04:06:20,984] - step:246/900 train_loss:1.0789 lr:0.0003000000 time/step:185.24s
291
+ [2025-11-29 04:09:27,749] - step:247/900 train_loss:1.0778 lr:0.0003000000 time/step:186.76s
292
+ [2025-11-29 04:12:33,149] - step:248/900 train_loss:1.0830 lr:0.0003000000 time/step:185.36s
293
+ [2025-11-29 04:15:37,995] - step:249/900 train_loss:1.0921 lr:0.0003000000 time/step:184.84s
294
+ [2025-11-29 04:18:44,391] - step:250/900 train_loss:1.0980 lr:0.0003000000 time/step:186.39s
295
+ [2025-11-29 04:21:49,000] - step:251/900 train_loss:1.0761 lr:0.0003000000 time/step:184.59s
296
+ [2025-11-29 04:24:54,274] - step:252/900 train_loss:1.0901 lr:0.0003000000 time/step:185.25s
297
+ [2025-11-29 04:28:02,058] - step:253/900 train_loss:1.0735 lr:0.0003000000 time/step:187.78s
298
+ [2025-11-29 04:31:09,878] - step:254/900 train_loss:1.0600 lr:0.0003000000 time/step:187.80s
299
+ [2025-11-29 04:34:17,142] - step:255/900 train_loss:1.0544 lr:0.0003000000 time/step:187.23s
300
+ [2025-11-29 04:37:23,181] - step:256/900 train_loss:1.0961 lr:0.0003000000 time/step:186.03s
301
+ [2025-11-29 04:40:31,175] - step:257/900 train_loss:1.0838 lr:0.0003000000 time/step:187.99s
302
+ [2025-11-29 04:43:37,155] - step:258/900 train_loss:1.1142 lr:0.0003000000 time/step:185.74s
303
+ [2025-11-29 04:46:41,531] - step:259/900 train_loss:1.0784 lr:0.0003000000 time/step:184.36s
304
+ [2025-11-29 04:49:47,139] - step:260/900 train_loss:1.0548 lr:0.0003000000 time/step:185.61s
305
+ [2025-11-29 04:52:51,373] - step:261/900 train_loss:1.0670 lr:0.0003000000 time/step:184.18s
306
+ [2025-11-29 04:55:56,540] - step:262/900 train_loss:1.0790 lr:0.0003000000 time/step:185.16s
307
+ [2025-11-29 04:59:03,662] - step:263/900 train_loss:1.0758 lr:0.0003000000 time/step:187.12s
308
+ [2025-11-29 05:02:08,811] - step:264/900 train_loss:1.0945 lr:0.0003000000 time/step:185.14s
309
+ [2025-11-29 05:05:13,852] - step:265/900 train_loss:1.0733 lr:0.0003000000 time/step:185.03s
310
+ [2025-11-29 05:08:20,825] - step:266/900 train_loss:1.0854 lr:0.0003000000 time/step:186.97s
311
+ [2025-11-29 05:11:25,639] - step:267/900 train_loss:1.0816 lr:0.0003000000 time/step:184.80s
312
+ [2025-11-29 05:14:31,022] - step:268/900 train_loss:1.0670 lr:0.0003000000 time/step:185.35s
313
+ [2025-11-29 05:17:35,585] - step:269/900 train_loss:1.0892 lr:0.0003000000 time/step:184.33s
314
+ [2025-11-29 05:20:42,015] - step:270/900 train_loss:1.0245 lr:0.0003000000 time/step:186.43s
315
+ [2025-11-29 05:23:46,422] - step:271/900 train_loss:1.0735 lr:0.0003000000 time/step:184.37s
316
+ [2025-11-29 05:26:50,452] - step:272/900 train_loss:1.0714 lr:0.0003000000 time/step:184.01s
317
+ [2025-11-29 05:29:56,149] - step:273/900 train_loss:1.0769 lr:0.0003000000 time/step:185.68s
318
+ [2025-11-29 05:32:59,582] - step:274/900 train_loss:1.0265 lr:0.0003000000 time/step:183.40s
319
+ [2025-11-29 05:36:04,909] - step:275/900 train_loss:1.0510 lr:0.0003000000 time/step:185.31s
320
+ [2025-11-29 05:39:12,005] - step:276/900 train_loss:1.0753 lr:0.0003000000 time/step:187.07s
321
+ [2025-11-29 05:42:16,993] - step:277/900 train_loss:1.0582 lr:0.0003000000 time/step:184.93s
322
+ [2025-11-29 05:45:22,003] - step:278/900 train_loss:1.0717 lr:0.0003000000 time/step:185.00s
323
+ [2025-11-29 05:48:28,179] - step:279/900 train_loss:1.0676 lr:0.0003000000 time/step:186.16s
324
+ [2025-11-29 05:51:33,621] - step:280/900 train_loss:1.0595 lr:0.0003000000 time/step:185.43s
325
+ [2025-11-29 05:54:38,325] - step:281/900 train_loss:1.0585 lr:0.0003000000 time/step:184.68s
326
+ [2025-11-29 05:57:43,757] - step:282/900 train_loss:1.0949 lr:0.0003000000 time/step:185.43s
327
+ [2025-11-29 06:00:50,769] - step:283/900 train_loss:1.0682 lr:0.0003000000 time/step:187.01s
328
+ [2025-11-29 06:03:55,483] - step:284/900 train_loss:1.0756 lr:0.0003000000 time/step:184.69s
329
+ [2025-11-29 06:07:00,263] - step:285/900 train_loss:1.0693 lr:0.0003000000 time/step:184.77s
330
+ [2025-11-29 06:10:07,073] - step:286/900 train_loss:1.0734 lr:0.0003000000 time/step:186.81s
331
+ [2025-11-29 06:13:12,527] - step:287/900 train_loss:1.0729 lr:0.0003000000 time/step:185.42s
332
+ [2025-11-29 06:16:17,678] - step:288/900 train_loss:1.0483 lr:0.0003000000 time/step:185.12s
333
+ [2025-11-29 06:19:24,289] - step:289/900 train_loss:1.0590 lr:0.0003000000 time/step:186.60s
334
+ [2025-11-29 06:22:30,122] - step:290/900 train_loss:1.0687 lr:0.0003000000 time/step:185.81s
335
+ [2025-11-29 06:25:35,642] - step:291/900 train_loss:1.0612 lr:0.0003000000 time/step:185.50s
336
+ [2025-11-29 06:28:42,491] - step:292/900 train_loss:1.0357 lr:0.0003000000 time/step:186.85s
337
+ [2025-11-29 06:31:49,725] - step:293/900 train_loss:1.0708 lr:0.0003000000 time/step:187.22s
338
+ [2025-11-29 06:34:55,796] - step:294/900 train_loss:1.0707 lr:0.0003000000 time/step:186.05s
339
+ [2025-11-29 06:38:00,778] - step:295/900 train_loss:1.0776 lr:0.0003000000 time/step:184.98s
340
+ [2025-11-29 06:41:07,189] - step:296/900 train_loss:1.0576 lr:0.0003000000 time/step:186.41s
341
+ [2025-11-29 06:44:11,733] - step:297/900 train_loss:1.0260 lr:0.0003000000 time/step:184.49s
342
+ [2025-11-29 06:47:15,871] - step:298/900 train_loss:1.0749 lr:0.0003000000 time/step:184.12s
343
+ [2025-11-29 06:50:21,808] - step:299/900 train_loss:1.0567 lr:0.0003000000 time/step:185.93s
344
+ [2025-11-29 06:53:27,979] - step:300/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@300.pt...
345
+ [2025-11-29 06:53:27,992] - step:300/900 train_loss:1.0667 lr:0.0003000000 time/step:184.41s
346
+ [2025-11-29 06:56:33,223] - step:301/900 train_loss:1.0596 lr:0.0003000000 time/step:185.22s
347
+ [2025-11-29 06:59:39,570] - step:302/900 train_loss:1.0485 lr:0.0003000000 time/step:186.34s
348
+ [2025-11-29 07:02:44,495] - step:303/900 train_loss:1.0444 lr:0.0003000000 time/step:184.92s
349
+ [2025-11-29 07:05:48,939] - step:304/900 train_loss:1.0787 lr:0.0003000000 time/step:184.42s
350
+ [2025-11-29 07:08:55,065] - step:305/900 train_loss:1.0548 lr:0.0003000000 time/step:186.12s
351
+ [2025-11-29 07:12:03,934] - step:306/900 train_loss:1.0604 lr:0.0003000000 time/step:188.86s
352
+ [2025-11-29 07:15:10,343] - step:307/900 train_loss:1.0368 lr:0.0003000000 time/step:186.37s
353
+ [2025-11-29 07:18:15,509] - step:308/900 train_loss:1.0500 lr:0.0003000000 time/step:185.15s
354
+ [2025-11-29 07:21:22,280] - step:309/900 train_loss:1.0519 lr:0.0003000000 time/step:186.76s
355
+ [2025-11-29 07:24:27,690] - step:310/900 train_loss:1.0396 lr:0.0003000000 time/step:185.39s
356
+ [2025-11-29 07:27:32,558] - step:311/900 train_loss:1.0199 lr:0.0003000000 time/step:184.86s
357
+ [2025-11-29 07:30:39,398] - step:312/900 train_loss:1.0318 lr:0.0003000000 time/step:186.83s
358
+ [2025-11-29 07:33:43,716] - step:313/900 train_loss:1.0245 lr:0.0003000000 time/step:184.27s
359
+ [2025-11-29 07:36:48,934] - step:314/900 train_loss:1.0550 lr:0.0003000000 time/step:185.21s
360
+ [2025-11-29 07:39:55,818] - step:315/900 train_loss:1.0384 lr:0.0003000000 time/step:186.88s
361
+ [2025-11-29 07:43:00,630] - step:316/900 train_loss:1.0352 lr:0.0003000000 time/step:184.79s
362
+ [2025-11-29 07:46:05,765] - step:317/900 train_loss:1.0406 lr:0.0003000000 time/step:185.12s
363
+ [2025-11-29 07:49:10,421] - step:318/900 train_loss:1.0438 lr:0.0003000000 time/step:184.64s
364
+ [2025-11-29 07:52:16,687] - step:319/900 train_loss:1.0463 lr:0.0003000000 time/step:186.26s
365
+ [2025-11-29 07:55:21,390] - step:320/900 train_loss:1.0608 lr:0.0003000000 time/step:184.68s
366
+ [2025-11-29 07:58:26,121] - step:321/900 train_loss:1.0704 lr:0.0003000000 time/step:184.70s
367
+ [2025-11-29 08:01:33,062] - step:322/900 train_loss:1.0459 lr:0.0003000000 time/step:186.94s
368
+ [2025-11-29 08:04:39,120] - step:323/900 train_loss:1.0463 lr:0.0003000000 time/step:185.86s
369
+ [2025-11-29 08:07:44,864] - step:324/900 train_loss:1.0497 lr:0.0003000000 time/step:185.73s
370
+ [2025-11-29 08:10:51,704] - step:325/900 train_loss:1.0295 lr:0.0003000000 time/step:186.82s
371
+ [2025-11-29 08:13:56,466] - step:326/900 train_loss:1.0555 lr:0.0003000000 time/step:184.73s
372
+ [2025-11-29 08:17:01,957] - step:327/900 train_loss:1.0380 lr:0.0003000000 time/step:185.49s
373
+ [2025-11-29 08:20:08,723] - step:328/900 train_loss:1.0256 lr:0.0003000000 time/step:186.75s
374
+ [2025-11-29 08:23:14,378] - step:329/900 train_loss:1.0418 lr:0.0003000000 time/step:185.64s
375
+ [2025-11-29 08:26:20,417] - step:330/900 train_loss:1.0660 lr:0.0003000000 time/step:186.01s
376
+ [2025-11-29 08:29:26,557] - step:331/900 train_loss:1.0481 lr:0.0003000000 time/step:186.12s
377
+ [2025-11-29 08:32:43,853] - step:332/900 train_loss:1.0370 lr:0.0003000000 time/step:197.25s
378
+ [2025-11-29 08:35:47,902] - step:333/900 train_loss:1.0556 lr:0.0003000000 time/step:184.02s
379
+ [2025-11-29 08:38:52,834] - step:334/900 train_loss:1.0512 lr:0.0003000000 time/step:184.93s
380
+ [2025-11-29 08:41:58,814] - step:335/900 train_loss:1.0432 lr:0.0003000000 time/step:185.95s
381
+ [2025-11-29 08:45:02,340] - step:336/900 train_loss:1.0165 lr:0.0003000000 time/step:183.51s
382
+ [2025-11-29 08:48:06,475] - step:337/900 train_loss:1.0600 lr:0.0003000000 time/step:184.12s
383
+ [2025-11-29 08:51:14,075] - step:338/900 train_loss:1.0304 lr:0.0003000000 time/step:187.60s
384
+ [2025-11-29 08:54:18,478] - step:339/900 train_loss:1.0187 lr:0.0003000000 time/step:184.37s
385
+ [2025-11-29 08:57:23,665] - step:340/900 train_loss:1.0326 lr:0.0003000000 time/step:185.18s
386
+ [2025-11-29 09:00:30,230] - step:341/900 train_loss:1.0415 lr:0.0003000000 time/step:186.56s
387
+ [2025-11-29 09:03:35,007] - step:342/900 train_loss:1.0413 lr:0.0003000000 time/step:184.75s
388
+ [2025-11-29 09:06:39,120] - step:343/900 train_loss:1.0377 lr:0.0003000000 time/step:184.10s
389
+ [2025-11-29 09:09:43,682] - step:344/900 train_loss:1.0266 lr:0.0003000000 time/step:184.56s
390
+ [2025-11-29 09:12:50,738] - step:345/900 train_loss:1.0305 lr:0.0003000000 time/step:187.04s
391
+ [2025-11-29 09:15:54,975] - step:346/900 train_loss:1.0238 lr:0.0003000000 time/step:184.22s
392
+ [2025-11-29 09:18:59,184] - step:347/900 train_loss:1.0470 lr:0.0003000000 time/step:184.20s
393
+ [2025-11-29 09:22:05,583] - step:348/900 train_loss:1.0343 lr:0.0003000000 time/step:186.39s
394
+ [2025-11-29 09:25:09,502] - step:349/900 train_loss:1.0429 lr:0.0003000000 time/step:183.90s
395
+ [2025-11-29 09:28:14,785] - step:350/900 train_loss:1.0173 lr:0.0003000000 time/step:185.28s
396
+ [2025-11-29 09:31:22,664] - step:351/900 train_loss:1.0260 lr:0.0003000000 time/step:187.87s
397
+ [2025-11-29 09:34:27,994] - step:352/900 train_loss:1.0412 lr:0.0003000000 time/step:185.27s
398
+ [2025-11-29 09:37:33,386] - step:353/900 train_loss:1.0051 lr:0.0003000000 time/step:185.37s
399
+ [2025-11-29 09:40:39,936] - step:354/900 train_loss:1.0386 lr:0.0003000000 time/step:186.55s
400
+ [2025-11-29 09:43:45,796] - step:355/900 train_loss:1.0317 lr:0.0003000000 time/step:185.85s
401
+ [2025-11-29 09:46:51,082] - step:356/900 train_loss:1.0060 lr:0.0003000000 time/step:185.26s
402
+ [2025-11-29 09:49:56,919] - step:357/900 train_loss:1.0267 lr:0.0003000000 time/step:185.82s
403
+ [2025-11-29 09:53:05,845] - step:358/900 train_loss:1.0586 lr:0.0003000000 time/step:188.92s
404
+ [2025-11-29 09:56:13,021] - step:359/900 train_loss:1.0340 lr:0.0003000000 time/step:187.15s
405
+ [2025-11-29 09:59:19,033] - step:360/900 train_loss:1.0385 lr:0.0003000000 time/step:186.00s
406
+ [2025-11-29 10:02:25,949] - step:361/900 train_loss:1.0036 lr:0.0003000000 time/step:186.84s
407
+ [2025-11-29 10:05:30,167] - step:362/900 train_loss:1.0181 lr:0.0003000000 time/step:184.18s
408
+ [2025-11-29 10:08:34,860] - step:363/900 train_loss:1.0245 lr:0.0003000000 time/step:184.69s
409
+ [2025-11-29 10:11:40,819] - step:364/900 train_loss:1.0310 lr:0.0003000000 time/step:185.92s
410
+ [2025-11-29 10:14:44,430] - step:365/900 train_loss:1.0431 lr:0.0003000000 time/step:183.59s
411
+ [2025-11-29 10:17:49,210] - step:366/900 train_loss:1.0010 lr:0.0003000000 time/step:184.77s
412
+ [2025-11-29 10:20:56,812] - step:367/900 train_loss:1.0278 lr:0.0003000000 time/step:187.59s
413
+ [2025-11-29 10:24:03,874] - step:368/900 train_loss:1.0450 lr:0.0003000000 time/step:187.04s
414
+ [2025-11-29 10:27:08,644] - step:369/900 train_loss:1.0187 lr:0.0003000000 time/step:184.76s
415
+ [2025-11-29 10:30:12,932] - step:370/900 train_loss:1.0198 lr:0.0003000000 time/step:184.28s
416
+ [2025-11-29 10:33:19,131] - step:371/900 train_loss:1.0267 lr:0.0003000000 time/step:186.19s
417
+ [2025-11-29 10:36:23,611] - step:372/900 train_loss:1.0050 lr:0.0003000000 time/step:184.44s
418
+ [2025-11-29 10:39:27,504] - step:373/900 train_loss:1.0285 lr:0.0003000000 time/step:183.89s
419
+ [2025-11-29 10:42:34,817] - step:374/900 train_loss:1.0273 lr:0.0003000000 time/step:187.31s
420
+ [2025-11-29 10:45:39,564] - step:375/900 train_loss:1.0304 lr:0.0003000000 time/step:184.73s
421
+ [2025-11-29 10:48:44,710] - step:376/900 train_loss:1.0118 lr:0.0003000000 time/step:185.13s
422
+ [2025-11-29 10:51:52,055] - step:377/900 train_loss:1.0109 lr:0.0003000000 time/step:187.34s
423
+ [2025-11-29 10:54:57,418] - step:378/900 train_loss:1.0240 lr:0.0003000000 time/step:185.34s
424
+ [2025-11-29 10:58:02,656] - step:379/900 train_loss:0.9999 lr:0.0003000000 time/step:185.22s
425
+ [2025-11-29 11:01:08,733] - step:380/900 train_loss:1.0321 lr:0.0003000000 time/step:186.07s
426
+ [2025-11-29 11:04:14,600] - step:381/900 train_loss:1.0227 lr:0.0003000000 time/step:185.85s
427
+ [2025-11-29 11:07:19,868] - step:382/900 train_loss:1.0266 lr:0.0003000000 time/step:185.24s
428
+ [2025-11-29 11:10:25,377] - step:383/900 train_loss:1.0351 lr:0.0003000000 time/step:185.51s
429
+ [2025-11-29 11:13:31,325] - step:384/900 train_loss:1.0345 lr:0.0003000000 time/step:185.94s
430
+ [2025-11-29 11:16:37,117] - step:385/900 train_loss:1.0095 lr:0.0003000000 time/step:185.74s
431
+ [2025-11-29 11:19:42,673] - step:386/900 train_loss:1.0084 lr:0.0003000000 time/step:185.53s
432
+ [2025-11-29 11:22:49,798] - step:387/900 train_loss:1.0363 lr:0.0003000000 time/step:187.11s
433
+ [2025-11-29 11:25:54,407] - step:388/900 train_loss:1.0115 lr:0.0003000000 time/step:184.58s
434
+ [2025-11-29 11:29:01,044] - step:389/900 train_loss:1.0391 lr:0.0003000000 time/step:186.63s
435
+ [2025-11-29 11:32:08,861] - step:390/900 train_loss:1.0325 lr:0.0003000000 time/step:187.81s
436
+ [2025-11-29 11:35:13,748] - step:391/900 train_loss:1.0275 lr:0.0003000000 time/step:184.87s
437
+ [2025-11-29 11:38:19,164] - step:392/900 train_loss:1.0071 lr:0.0003000000 time/step:185.41s
438
+ [2025-11-29 11:41:26,071] - step:393/900 train_loss:1.0140 lr:0.0003000000 time/step:186.89s
439
+ [2025-11-29 11:44:30,887] - step:394/900 train_loss:1.0238 lr:0.0003000000 time/step:184.80s
440
+ [2025-11-29 11:47:36,554] - step:395/900 train_loss:1.0223 lr:0.0003000000 time/step:185.63s
441
+ [2025-11-29 11:50:42,929] - step:396/900 train_loss:1.0248 lr:0.0003000000 time/step:186.36s
442
+ [2025-11-29 11:53:49,516] - step:397/900 train_loss:1.0155 lr:0.0003000000 time/step:186.58s
443
+ [2025-11-29 11:56:55,065] - step:398/900 train_loss:1.0266 lr:0.0003000000 time/step:185.52s
444
+ [2025-11-29 12:00:00,180] - step:399/900 train_loss:0.9997 lr:0.0003000000 time/step:185.11s
445
+ [2025-11-29 12:03:08,327] - step:400/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@400.pt...
446
+ [2025-11-29 12:03:08,330] - step:400/900 train_loss:1.0379 lr:0.0003000000 time/step:186.52s
447
+ [2025-11-29 12:06:13,170] - step:401/900 train_loss:1.0278 lr:0.0003000000 time/step:184.80s
448
+ [2025-11-29 12:09:18,331] - step:402/900 train_loss:0.9898 lr:0.0003000000 time/step:185.15s
449
+ [2025-11-29 12:12:24,948] - step:403/900 train_loss:0.9872 lr:0.0003000000 time/step:186.60s
450
+ [2025-11-29 12:15:30,939] - step:404/900 train_loss:1.0125 lr:0.0003000000 time/step:185.98s
451
+ [2025-11-29 12:18:37,715] - step:405/900 train_loss:1.0320 lr:0.0003000000 time/step:186.75s
452
+ [2025-11-29 12:21:43,437] - step:406/900 train_loss:1.0104 lr:0.0003000000 time/step:185.67s
453
+ [2025-11-29 12:24:46,661] - step:407/900 train_loss:1.0245 lr:0.0003000000 time/step:183.20s
454
+ [2025-11-29 12:27:52,506] - step:408/900 train_loss:1.0147 lr:0.0003000000 time/step:185.84s
455
+ [2025-11-29 12:30:57,358] - step:409/900 train_loss:1.0103 lr:0.0003000000 time/step:184.84s
456
+ [2025-11-29 12:34:03,720] - step:410/900 train_loss:0.9781 lr:0.0003000000 time/step:186.36s
457
+ [2025-11-29 12:37:13,118] - step:411/900 train_loss:0.9906 lr:0.0003000000 time/step:189.35s
458
+ [2025-11-29 12:40:19,894] - step:412/900 train_loss:1.0237 lr:0.0003000000 time/step:186.75s
459
+ [2025-11-29 12:43:25,422] - step:413/900 train_loss:1.0114 lr:0.0003000000 time/step:185.52s
460
+ [2025-11-29 12:46:30,579] - step:414/900 train_loss:1.0147 lr:0.0003000000 time/step:184.99s
461
+ [2025-11-29 12:49:38,876] - step:415/900 train_loss:1.0150 lr:0.0003000000 time/step:188.29s
462
+ [2025-11-29 12:52:43,633] - step:416/900 train_loss:1.0239 lr:0.0003000000 time/step:184.73s
463
+ [2025-11-29 12:55:48,060] - step:417/900 train_loss:1.0036 lr:0.0003000000 time/step:184.39s
464
+ [2025-11-29 12:58:55,753] - step:418/900 train_loss:1.0140 lr:0.0003000000 time/step:187.68s
465
+ [2025-11-29 13:02:00,431] - step:419/900 train_loss:1.0039 lr:0.0003000000 time/step:184.66s
466
+ [2025-11-29 13:05:05,089] - step:420/900 train_loss:1.0203 lr:0.0003000000 time/step:184.64s
467
+ [2025-11-29 13:08:12,316] - step:421/900 train_loss:1.0304 lr:0.0003000000 time/step:187.22s
468
+ [2025-11-29 13:11:17,410] - step:422/900 train_loss:1.0034 lr:0.0003000000 time/step:185.08s
469
+ [2025-11-29 13:14:22,416] - step:423/900 train_loss:1.0279 lr:0.0003000000 time/step:185.00s
470
+ [2025-11-29 13:17:27,732] - step:424/900 train_loss:1.0213 lr:0.0003000000 time/step:185.29s
471
+ [2025-11-29 13:20:34,573] - step:425/900 train_loss:0.9987 lr:0.0003000000 time/step:186.70s
472
+ [2025-11-29 13:23:39,421] - step:426/900 train_loss:0.9673 lr:0.0003000000 time/step:184.84s
473
+ [2025-11-29 13:26:44,176] - step:427/900 train_loss:1.0108 lr:0.0003000000 time/step:184.74s
474
+ [2025-11-29 13:29:52,082] - step:428/900 train_loss:1.0243 lr:0.0003000000 time/step:187.87s
475
+ [2025-11-29 13:32:56,984] - step:429/900 train_loss:0.9843 lr:0.0003000000 time/step:184.88s
476
+ [2025-11-29 13:36:01,659] - step:430/900 train_loss:1.0269 lr:0.0003000000 time/step:184.66s
477
+ [2025-11-29 13:39:09,363] - step:431/900 train_loss:1.0047 lr:0.0003000000 time/step:187.70s
478
+ [2025-11-29 13:42:14,007] - step:432/900 train_loss:0.9957 lr:0.0003000000 time/step:184.63s
479
+ [2025-11-29 13:45:17,936] - step:433/900 train_loss:1.0006 lr:0.0003000000 time/step:183.92s
480
+ [2025-11-29 13:48:23,683] - step:434/900 train_loss:1.0080 lr:0.0003000000 time/step:185.74s
481
+ [2025-11-29 13:51:28,718] - step:435/900 train_loss:1.0033 lr:0.0003000000 time/step:185.01s
482
+ [2025-11-29 13:54:33,479] - step:436/900 train_loss:1.0077 lr:0.0003000000 time/step:184.74s
483
+ [2025-11-29 13:57:38,454] - step:437/900 train_loss:0.9913 lr:0.0003000000 time/step:184.96s
484
+ [2025-11-29 14:00:45,973] - step:438/900 train_loss:1.0221 lr:0.0003000000 time/step:187.50s
485
+ [2025-11-29 14:03:50,970] - step:439/900 train_loss:1.0017 lr:0.0003000000 time/step:184.98s
486
+ [2025-11-29 14:06:56,103] - step:440/900 train_loss:0.9966 lr:0.0003000000 time/step:185.11s
487
+ [2025-11-29 14:10:03,916] - step:441/900 train_loss:1.0023 lr:0.0003000000 time/step:187.81s
488
+ [2025-11-29 14:13:09,854] - step:442/900 train_loss:1.0154 lr:0.0003000000 time/step:185.93s
489
+ [2025-11-29 14:16:13,300] - step:443/900 train_loss:0.9993 lr:0.0003000000 time/step:183.43s
490
+ [2025-11-29 14:19:19,989] - step:444/900 train_loss:1.0085 lr:0.0003000000 time/step:186.68s
491
+ [2025-11-29 14:22:23,752] - step:445/900 train_loss:0.9978 lr:0.0003000000 time/step:183.75s
492
+ [2025-11-29 14:25:27,620] - step:446/900 train_loss:1.0148 lr:0.0003000000 time/step:183.84s
493
+ [2025-11-29 14:28:33,765] - step:447/900 train_loss:0.9874 lr:0.0003000000 time/step:186.14s
494
+ [2025-11-29 14:31:37,881] - step:448/900 train_loss:1.0202 lr:0.0003000000 time/step:184.10s
495
+ [2025-11-29 14:34:41,135] - step:449/900 train_loss:0.9902 lr:0.0003000000 time/step:183.23s
496
+ [2025-11-29 14:37:45,361] - step:450/900 train_loss:1.0036 lr:0.0003000000 time/step:184.22s
497
+ [2025-11-29 14:40:53,203] - step:451/900 train_loss:1.0127 lr:0.0003000000 time/step:187.83s
498
+ [2025-11-29 14:43:58,011] - step:452/900 train_loss:1.0339 lr:0.0003000000 time/step:184.77s
499
+ [2025-11-29 14:47:02,348] - step:453/900 train_loss:0.9934 lr:0.0003000000 time/step:184.30s
500
+ [2025-11-29 14:50:10,497] - step:454/900 train_loss:1.0175 lr:0.0003000000 time/step:188.14s
501
+ [2025-11-29 14:53:14,572] - step:455/900 train_loss:1.0011 lr:0.0003000000 time/step:184.06s
502
+ [2025-11-29 14:56:19,257] - step:456/900 train_loss:1.0329 lr:0.0003000000 time/step:184.66s
503
+ [2025-11-29 14:59:26,311] - step:457/900 train_loss:0.9970 lr:0.0003000000 time/step:187.05s
504
+ [2025-11-29 15:02:31,228] - step:458/900 train_loss:0.9849 lr:0.0003000000 time/step:184.91s
505
+ [2025-11-29 15:05:35,912] - step:459/900 train_loss:1.0443 lr:0.0003000000 time/step:184.67s
506
+ [2025-11-29 15:08:44,234] - step:460/900 train_loss:1.0166 lr:0.0003000000 time/step:188.30s
507
+ [2025-11-29 15:11:49,196] - step:461/900 train_loss:0.9857 lr:0.0003000000 time/step:184.94s
508
+ [2025-11-29 15:14:54,073] - step:462/900 train_loss:0.9887 lr:0.0003000000 time/step:184.87s
509
+ [2025-11-29 15:18:01,015] - step:463/900 train_loss:1.0142 lr:0.0003000000 time/step:186.91s
510
+ [2025-11-29 15:21:10,436] - step:464/900 train_loss:1.0084 lr:0.0003000000 time/step:189.42s
511
+ [2025-11-29 15:24:17,825] - step:465/900 train_loss:1.0079 lr:0.0003000000 time/step:187.37s
512
+ [2025-11-29 15:27:21,991] - step:466/900 train_loss:0.9989 lr:0.0003000000 time/step:184.15s
513
+ [2025-11-29 15:30:29,430] - step:467/900 train_loss:1.0027 lr:0.0003000000 time/step:187.42s
514
+ [2025-11-29 15:33:34,038] - step:468/900 train_loss:0.9864 lr:0.0003000000 time/step:184.56s
515
+ [2025-11-29 15:36:38,606] - step:469/900 train_loss:0.9922 lr:0.0003000000 time/step:184.56s
516
+ [2025-11-29 15:39:50,010] - step:470/900 train_loss:1.0046 lr:0.0003000000 time/step:191.39s
517
+ [2025-11-29 15:42:54,426] - step:471/900 train_loss:0.9947 lr:0.0003000000 time/step:184.39s
518
+ [2025-11-29 15:45:58,386] - step:472/900 train_loss:0.9856 lr:0.0003000000 time/step:183.94s
519
+ [2025-11-29 15:49:06,443] - step:473/900 train_loss:1.0102 lr:0.0003000000 time/step:188.03s
520
+ [2025-11-29 15:52:11,651] - step:474/900 train_loss:0.9815 lr:0.0003000000 time/step:185.17s
521
+ [2025-11-29 15:55:16,024] - step:475/900 train_loss:0.9870 lr:0.0003000000 time/step:184.37s
522
+ [2025-11-29 15:58:20,934] - step:476/900 train_loss:0.9902 lr:0.0003000000 time/step:184.90s
523
+ [2025-11-29 16:01:34,770] - step:477/900 train_loss:1.0044 lr:0.0003000000 time/step:193.83s
524
+ [2025-11-29 16:04:40,969] - step:478/900 train_loss:0.9706 lr:0.0003000000 time/step:186.18s
525
+ [2025-11-29 16:07:46,966] - step:479/900 train_loss:0.9861 lr:0.0003000000 time/step:185.98s
526
+ [2025-11-29 16:11:03,013] - step:480/900 train_loss:1.0035 lr:0.0003000000 time/step:196.03s
527
+ [2025-11-29 16:14:06,891] - step:481/900 train_loss:0.9746 lr:0.0003000000 time/step:183.84s
528
+ [2025-11-29 16:17:10,864] - step:482/900 train_loss:0.9883 lr:0.0003000000 time/step:183.95s
529
+ [2025-11-29 16:20:17,244] - step:483/900 train_loss:1.0245 lr:0.0003000000 time/step:186.37s
530
+ [2025-11-29 16:23:21,291] - step:484/900 train_loss:1.0193 lr:0.0003000000 time/step:184.03s
531
+ [2025-11-29 16:26:24,937] - step:485/900 train_loss:0.9953 lr:0.0003000000 time/step:183.63s
532
+ [2025-11-29 16:29:32,575] - step:486/900 train_loss:0.9787 lr:0.0003000000 time/step:187.63s
533
+ [2025-11-29 16:32:37,230] - step:487/900 train_loss:0.9812 lr:0.0003000000 time/step:184.64s
534
+ [2025-11-29 16:35:41,884] - step:488/900 train_loss:0.9911 lr:0.0003000000 time/step:184.65s
535
+ [2025-11-29 16:38:47,753] - step:489/900 train_loss:0.9665 lr:0.0003000000 time/step:185.84s
536
+ [2025-11-29 16:41:53,739] - step:490/900 train_loss:0.9663 lr:0.0003000000 time/step:185.97s
537
+ [2025-11-29 16:44:58,613] - step:491/900 train_loss:1.0147 lr:0.0003000000 time/step:184.87s
538
+ [2025-11-29 16:48:03,370] - step:492/900 train_loss:1.0107 lr:0.0003000000 time/step:184.74s
539
+ [2025-11-29 16:51:11,045] - step:493/900 train_loss:0.9999 lr:0.0003000000 time/step:187.65s
540
+ [2025-11-29 16:54:15,696] - step:494/900 train_loss:0.9875 lr:0.0003000000 time/step:184.64s
541
+ [2025-11-29 16:57:20,229] - step:495/900 train_loss:0.9990 lr:0.0003000000 time/step:184.53s
542
+ [2025-11-29 17:00:26,562] - step:496/900 train_loss:0.9889 lr:0.0003000000 time/step:186.31s
543
+ [2025-11-29 17:03:30,547] - step:497/900 train_loss:0.9835 lr:0.0003000000 time/step:183.97s
544
+ [2025-11-29 17:06:34,456] - step:498/900 train_loss:1.0062 lr:0.0003000000 time/step:183.89s
545
+ [2025-11-29 17:09:40,945] - step:499/900 train_loss:0.9785 lr:0.0003000000 time/step:186.48s
546
+ [2025-11-29 17:12:49,048] - step:500/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@500.pt...
547
+ [2025-11-29 17:12:49,051] - step:500/900 train_loss:1.0054 lr:0.0003000000 time/step:186.42s
548
+ [2025-11-29 17:15:53,725] - step:501/900 train_loss:0.9961 lr:0.0003000000 time/step:184.67s
549
+ [2025-11-29 17:18:59,020] - step:502/900 train_loss:1.0013 lr:0.0003000000 time/step:185.28s
550
+ [2025-11-29 17:22:06,405] - step:503/900 train_loss:0.9746 lr:0.0003000000 time/step:187.35s
551
+ [2025-11-29 17:25:11,190] - step:504/900 train_loss:0.9977 lr:0.0003000000 time/step:184.76s
552
+ [2025-11-29 17:28:16,344] - step:505/900 train_loss:0.9737 lr:0.0003000000 time/step:185.15s
553
+ [2025-11-29 17:31:24,506] - step:506/900 train_loss:1.0010 lr:0.0003000000 time/step:188.14s
554
+ [2025-11-29 17:34:29,492] - step:507/900 train_loss:0.9852 lr:0.0003000000 time/step:184.96s
555
+ [2025-11-29 17:37:34,522] - step:508/900 train_loss:0.9887 lr:0.0003000000 time/step:185.00s
556
+ [2025-11-29 17:40:41,236] - step:509/900 train_loss:0.9830 lr:0.0003000000 time/step:186.70s
557
+ [2025-11-29 17:43:46,307] - step:510/900 train_loss:0.9844 lr:0.0003000000 time/step:185.05s
558
+ [2025-11-29 17:46:49,527] - step:511/900 train_loss:0.9718 lr:0.0003000000 time/step:183.20s
559
+ [2025-11-29 17:49:52,170] - step:512/900 train_loss:0.9866 lr:0.0003000000 time/step:182.64s
560
+ [2025-11-29 17:52:55,148] - step:513/900 train_loss:1.0106 lr:0.0003000000 time/step:182.96s
561
+ [2025-11-29 17:55:59,252] - step:514/900 train_loss:0.9629 lr:0.0003000000 time/step:184.09s
562
+ [2025-11-29 17:59:04,675] - step:515/900 train_loss:1.0048 lr:0.0003000000 time/step:185.41s
563
+ [2025-11-29 18:02:13,006] - step:516/900 train_loss:0.9964 lr:0.0003000000 time/step:188.32s
564
+ [2025-11-29 18:05:16,855] - step:517/900 train_loss:1.0057 lr:0.0003000000 time/step:183.84s
565
+ [2025-11-29 18:08:20,622] - step:518/900 train_loss:0.9859 lr:0.0003000000 time/step:183.75s
566
+ [2025-11-29 18:11:26,793] - step:519/900 train_loss:0.9714 lr:0.0003000000 time/step:186.16s
567
+ [2025-11-29 18:14:29,889] - step:520/900 train_loss:0.9652 lr:0.0003000000 time/step:183.08s
568
+ [2025-11-29 18:17:33,349] - step:521/900 train_loss:0.9786 lr:0.0003000000 time/step:183.43s
569
+ [2025-11-29 18:20:39,275] - step:522/900 train_loss:0.9721 lr:0.0003000000 time/step:185.92s
570
+ [2025-11-29 18:23:43,034] - step:523/900 train_loss:0.9862 lr:0.0003000000 time/step:183.75s
571
+ [2025-11-29 18:26:46,732] - step:524/900 train_loss:0.9942 lr:0.0003000000 time/step:183.66s
572
+ [2025-11-29 18:29:50,749] - step:525/900 train_loss:0.9850 lr:0.0003000000 time/step:184.01s
573
+ [2025-11-29 18:32:55,277] - step:526/900 train_loss:0.9804 lr:0.0003000000 time/step:184.51s
574
+ [2025-11-29 18:36:00,371] - step:527/900 train_loss:0.9845 lr:0.0003000000 time/step:185.08s
575
+ [2025-11-29 18:39:06,966] - step:528/900 train_loss:0.9832 lr:0.0003000000 time/step:186.57s
576
+ [2025-11-29 18:42:15,798] - step:529/900 train_loss:0.9967 lr:0.0003000000 time/step:188.82s
577
+ [2025-11-29 18:45:22,445] - step:530/900 train_loss:0.9910 lr:0.0003000000 time/step:186.63s
578
+ [2025-11-29 18:48:29,542] - step:531/900 train_loss:0.9714 lr:0.0003000000 time/step:187.07s
579
+ [2025-11-29 18:51:38,501] - step:532/900 train_loss:0.9868 lr:0.0003000000 time/step:188.95s
580
+ [2025-11-29 18:54:45,821] - step:533/900 train_loss:0.9929 lr:0.0003000000 time/step:187.30s
581
+ [2025-11-29 18:57:53,314] - step:534/900 train_loss:0.9879 lr:0.0003000000 time/step:187.47s
582
+ [2025-11-29 19:01:01,583] - step:535/900 train_loss:1.0067 lr:0.0003000000 time/step:188.26s
583
+ [2025-11-29 19:04:09,493] - step:536/900 train_loss:0.9836 lr:0.0003000000 time/step:187.89s
584
+ [2025-11-29 19:07:16,734] - step:537/900 train_loss:0.9868 lr:0.0003000000 time/step:187.21s
585
+ [2025-11-29 19:10:24,993] - step:538/900 train_loss:0.9951 lr:0.0003000000 time/step:188.24s
586
+ [2025-11-29 19:13:31,431] - step:539/900 train_loss:0.9761 lr:0.0003000000 time/step:186.41s
587
+ [2025-11-29 19:16:36,819] - step:540/900 train_loss:0.9742 lr:0.0003000000 time/step:185.38s
588
+ [2025-11-29 19:19:43,799] - step:541/900 train_loss:0.9745 lr:0.0003000000 time/step:186.95s
589
+ [2025-11-29 19:22:52,532] - step:542/900 train_loss:0.9817 lr:0.0003000000 time/step:188.73s
590
+ [2025-11-29 19:25:59,216] - step:543/900 train_loss:0.9777 lr:0.0003000000 time/step:186.67s
591
+ [2025-11-29 19:29:05,849] - step:544/900 train_loss:0.9960 lr:0.0003000000 time/step:186.61s
592
+ [2025-11-29 19:32:14,558] - step:545/900 train_loss:0.9811 lr:0.0003000000 time/step:188.70s
593
+ [2025-11-29 19:35:21,607] - step:546/900 train_loss:0.9882 lr:0.0003000000 time/step:187.02s
594
+ [2025-11-29 19:38:28,518] - step:547/900 train_loss:0.9938 lr:0.0003000000 time/step:186.88s
595
+ [2025-11-29 19:41:37,225] - step:548/900 train_loss:0.9407 lr:0.0003000000 time/step:188.70s
596
+ [2025-11-29 19:44:44,063] - step:549/900 train_loss:0.9774 lr:0.0003000000 time/step:186.81s
597
+ [2025-11-29 19:47:50,816] - step:550/900 train_loss:0.9913 lr:0.0003000000 time/step:186.73s
598
+ [2025-11-29 19:50:58,844] - step:551/900 train_loss:0.9948 lr:0.0003000000 time/step:188.02s
599
+ [2025-11-29 19:54:06,212] - step:552/900 train_loss:0.9696 lr:0.0003000000 time/step:187.35s
600
+ [2025-11-29 19:57:12,084] - step:553/900 train_loss:0.9706 lr:0.0003000000 time/step:185.85s
601
+ [2025-11-29 20:00:18,128] - step:554/900 train_loss:0.9871 lr:0.0003000000 time/step:186.03s
602
+ [2025-11-29 20:03:26,623] - step:555/900 train_loss:0.9930 lr:0.0003000000 time/step:188.48s
603
+ [2025-11-29 20:06:33,230] - step:556/900 train_loss:0.9752 lr:0.0003000000 time/step:186.55s
604
+ [2025-11-29 20:09:39,696] - step:557/900 train_loss:0.9850 lr:0.0003000000 time/step:186.45s
605
+ [2025-11-29 20:12:48,229] - step:558/900 train_loss:0.9720 lr:0.0003000000 time/step:188.52s
606
+ [2025-11-29 20:15:53,987] - step:559/900 train_loss:0.9962 lr:0.0003000000 time/step:185.74s
607
+ [2025-11-29 20:19:00,484] - step:560/900 train_loss:0.9922 lr:0.0003000000 time/step:186.48s
608
+ [2025-11-29 20:22:09,247] - step:561/900 train_loss:0.9740 lr:0.0003000000 time/step:188.74s
609
+ [2025-11-29 20:25:16,473] - step:562/900 train_loss:0.9712 lr:0.0003000000 time/step:187.21s
610
+ [2025-11-29 20:28:23,403] - step:563/900 train_loss:0.9612 lr:0.0003000000 time/step:186.92s
611
+ [2025-11-29 20:31:30,909] - step:564/900 train_loss:0.9914 lr:0.0003000000 time/step:187.50s
612
+ [2025-11-29 20:34:38,710] - step:565/900 train_loss:0.9836 lr:0.0003000000 time/step:187.78s
613
+ [2025-11-29 20:37:45,056] - step:566/900 train_loss:0.9814 lr:0.0003000000 time/step:186.33s
614
+ [2025-11-29 20:40:51,873] - step:567/900 train_loss:0.9865 lr:0.0003000000 time/step:186.81s
615
+ [2025-11-29 20:44:00,559] - step:568/900 train_loss:0.9917 lr:0.0003000000 time/step:188.68s
616
+ [2025-11-29 20:47:07,062] - step:569/900 train_loss:0.9644 lr:0.0003000000 time/step:186.48s
617
+ [2025-11-29 20:50:13,303] - step:570/900 train_loss:0.9759 lr:0.0003000000 time/step:186.19s
618
+ [2025-11-29 20:53:21,695] - step:571/900 train_loss:0.9703 lr:0.0003000000 time/step:188.39s
619
+ [2025-11-29 20:56:29,148] - step:572/900 train_loss:0.9713 lr:0.0003000000 time/step:187.43s
620
+ [2025-11-29 20:59:35,993] - step:573/900 train_loss:0.9549 lr:0.0003000000 time/step:186.82s
621
+ [2025-11-29 21:02:44,463] - step:574/900 train_loss:0.9696 lr:0.0003000000 time/step:188.47s
622
+ [2025-11-29 21:05:51,247] - step:575/900 train_loss:0.9648 lr:0.0003000000 time/step:186.77s
623
+ [2025-11-29 21:08:57,001] - step:576/900 train_loss:0.9695 lr:0.0003000000 time/step:185.74s
624
+ [2025-11-29 21:12:03,873] - step:577/900 train_loss:0.9728 lr:0.0003000000 time/step:186.86s
625
+ [2025-11-29 21:15:10,900] - step:578/900 train_loss:0.9767 lr:0.0003000000 time/step:187.02s
626
+ [2025-11-29 21:18:14,501] - step:579/900 train_loss:0.9643 lr:0.0003000000 time/step:183.56s
627
+ [2025-11-29 21:21:16,045] - step:580/900 train_loss:0.9826 lr:0.0003000000 time/step:181.53s
628
+ [2025-11-29 21:24:19,527] - step:581/900 train_loss:0.9792 lr:0.0003000000 time/step:183.48s
629
+ [2025-11-29 21:27:25,340] - step:582/900 train_loss:0.9852 lr:0.0003000000 time/step:185.73s
630
+ [2025-11-29 21:30:32,498] - step:583/900 train_loss:0.9699 lr:0.0003000000 time/step:187.15s
631
+ [2025-11-29 21:33:40,663] - step:584/900 train_loss:0.9709 lr:0.0003000000 time/step:188.14s
632
+ [2025-11-29 21:36:47,891] - step:585/900 train_loss:0.9673 lr:0.0003000000 time/step:187.21s
633
+ [2025-11-29 21:39:54,798] - step:586/900 train_loss:0.9792 lr:0.0003000000 time/step:186.90s
634
+ [2025-11-29 21:43:04,568] - step:587/900 train_loss:0.9784 lr:0.0003000000 time/step:189.77s
635
+ [2025-11-29 21:46:11,882] - step:588/900 train_loss:0.9719 lr:0.0003000000 time/step:187.29s
636
+ [2025-11-29 21:49:18,906] - step:589/900 train_loss:0.9834 lr:0.0003000000 time/step:187.01s
637
+ [2025-11-29 21:52:25,621] - step:590/900 train_loss:0.9659 lr:0.0003000000 time/step:186.70s
638
+ [2025-11-29 21:55:31,655] - step:591/900 train_loss:0.9658 lr:0.0003000000 time/step:185.94s
639
+ [2025-11-29 21:58:38,212] - step:592/900 train_loss:0.9855 lr:0.0003000000 time/step:186.53s
640
+ [2025-11-29 22:01:44,812] - step:593/900 train_loss:0.9691 lr:0.0003000000 time/step:186.59s
641
+ [2025-11-29 22:04:51,951] - step:594/900 train_loss:0.9781 lr:0.0003000000 time/step:187.13s
642
+ [2025-11-29 22:07:57,915] - step:595/900 train_loss:0.9579 lr:0.0003000000 time/step:185.94s
643
+ [2025-11-29 22:11:04,854] - step:596/900 train_loss:0.9731 lr:0.0003000000 time/step:186.91s
644
+ [2025-11-29 22:14:13,434] - step:597/900 train_loss:0.9715 lr:0.0003000000 time/step:188.57s
645
+ [2025-11-29 22:17:20,910] - step:598/900 train_loss:0.9886 lr:0.0003000000 time/step:187.46s
646
+ [2025-11-29 22:20:27,176] - step:599/900 train_loss:0.9657 lr:0.0003000000 time/step:186.24s
647
+ [2025-11-29 22:23:34,717] - step:600/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@600.pt...
648
+ [2025-11-29 22:23:34,723] - step:600/900 train_loss:0.9532 lr:0.0003000000 time/step:185.95s
649
+ [2025-11-29 22:26:38,518] - step:601/900 train_loss:0.9535 lr:0.0003000000 time/step:183.79s
650
+ [2025-11-29 22:29:41,900] - step:602/900 train_loss:0.9374 lr:0.0003000000 time/step:183.35s
651
+ [2025-11-29 22:32:47,468] - step:603/900 train_loss:0.9662 lr:0.0003000000 time/step:185.52s
652
+ [2025-11-29 22:35:53,752] - step:604/900 train_loss:0.9587 lr:0.0003000000 time/step:186.16s
653
+ [2025-11-29 22:38:58,466] - step:605/900 train_loss:0.9739 lr:0.0003000000 time/step:184.70s
654
+ [2025-11-29 22:42:03,657] - step:606/900 train_loss:0.9563 lr:0.0003000000 time/step:185.17s
655
+ [2025-11-29 22:45:12,058] - step:607/900 train_loss:0.9584 lr:0.0003000000 time/step:188.39s
656
+ [2025-11-29 22:48:18,310] - step:608/900 train_loss:0.9694 lr:0.0003000000 time/step:186.23s
657
+ [2025-11-29 22:51:24,367] - step:609/900 train_loss:0.9681 lr:0.0003000000 time/step:186.05s
658
+ [2025-11-29 22:54:31,573] - step:610/900 train_loss:0.9582 lr:0.0003000000 time/step:187.20s
659
+ [2025-11-29 22:57:36,240] - step:611/900 train_loss:0.9781 lr:0.0003000000 time/step:184.66s
660
+ [2025-11-29 23:00:39,793] - step:612/900 train_loss:0.9707 lr:0.0003000000 time/step:183.54s
661
+ [2025-11-29 23:03:48,177] - step:613/900 train_loss:0.9626 lr:0.0003000000 time/step:188.38s
662
+ [2025-11-29 23:06:54,527] - step:614/900 train_loss:0.9525 lr:0.0003000000 time/step:186.34s
663
+ [2025-11-29 23:10:00,576] - step:615/900 train_loss:0.9825 lr:0.0003000000 time/step:186.03s
664
+ [2025-11-29 23:13:06,944] - step:616/900 train_loss:0.9648 lr:0.0003000000 time/step:186.35s
665
+ [2025-11-29 23:16:13,313] - step:617/900 train_loss:0.9833 lr:0.0003000000 time/step:186.36s
666
+ [2025-11-29 23:19:18,008] - step:618/900 train_loss:0.9619 lr:0.0003000000 time/step:184.67s
667
+ [2025-11-29 23:22:23,418] - step:619/900 train_loss:0.9681 lr:0.0003000000 time/step:185.40s
668
+ [2025-11-29 23:25:30,799] - step:620/900 train_loss:0.9705 lr:0.0003000000 time/step:187.36s
669
+ [2025-11-29 23:28:36,096] - step:621/900 train_loss:0.9884 lr:0.0003000000 time/step:185.28s
670
+ [2025-11-29 23:31:40,935] - step:622/900 train_loss:0.9623 lr:0.0003000000 time/step:184.83s
671
+ [2025-11-29 23:34:49,164] - step:623/900 train_loss:0.9781 lr:0.0003000000 time/step:188.22s
672
+ [2025-11-29 23:37:55,808] - step:624/900 train_loss:0.9558 lr:0.0003000000 time/step:186.62s
673
+ [2025-11-29 23:41:02,902] - step:625/900 train_loss:0.9641 lr:0.0003000000 time/step:187.08s
674
+ [2025-11-29 23:44:12,190] - step:626/900 train_loss:0.9631 lr:0.0003000000 time/step:189.26s
675
+ [2025-11-29 23:47:18,211] - step:627/900 train_loss:0.9820 lr:0.0003000000 time/step:185.99s
676
+ [2025-11-29 23:50:22,907] - step:628/900 train_loss:0.9647 lr:0.0003000000 time/step:184.67s
677
+ [2025-11-29 23:53:29,293] - step:629/900 train_loss:0.9504 lr:0.0003000000 time/step:186.38s
678
+ [2025-11-29 23:56:35,007] - step:630/900 train_loss:0.9845 lr:0.0003000000 time/step:185.70s
679
+ [2025-11-29 23:59:41,063] - step:631/900 train_loss:0.9710 lr:0.0003000000 time/step:186.04s
680
+ [2025-11-30 00:02:47,384] - step:632/900 train_loss:0.9673 lr:0.0003000000 time/step:186.31s
681
+ [2025-11-30 00:05:54,675] - step:633/900 train_loss:0.9644 lr:0.0003000000 time/step:187.29s
682
+ [2025-11-30 00:09:00,681] - step:634/900 train_loss:0.9751 lr:0.0003000000 time/step:185.98s
683
+ [2025-11-30 00:12:07,170] - step:635/900 train_loss:0.9427 lr:0.0003000000 time/step:186.47s
684
+ [2025-11-30 00:15:16,394] - step:636/900 train_loss:0.9941 lr:0.0003000000 time/step:189.21s
685
+ [2025-11-30 00:18:21,885] - step:637/900 train_loss:0.9627 lr:0.0003000000 time/step:185.46s
686
+ [2025-11-30 00:21:26,909] - step:638/900 train_loss:0.9713 lr:0.0003000000 time/step:185.01s
687
+ [2025-11-30 00:24:34,518] - step:639/900 train_loss:0.9477 lr:0.0003000000 time/step:187.59s
688
+ [2025-11-30 00:27:39,860] - step:640/900 train_loss:0.9413 lr:0.0003000000 time/step:185.32s
689
+ [2025-11-30 00:30:46,082] - step:641/900 train_loss:0.9583 lr:0.0003000000 time/step:186.18s
690
+ [2025-11-30 00:33:53,085] - step:642/900 train_loss:0.9927 lr:0.0003000000 time/step:186.99s
691
+ [2025-11-30 00:37:00,236] - step:643/900 train_loss:0.9658 lr:0.0003000000 time/step:187.13s
692
+ [2025-11-30 00:40:06,191] - step:644/900 train_loss:0.9532 lr:0.0003000000 time/step:185.92s
693
+ [2025-11-30 00:43:11,626] - step:645/900 train_loss:0.9510 lr:0.0003000000 time/step:185.43s
694
+ [2025-11-30 00:46:16,854] - step:646/900 train_loss:0.9572 lr:0.0003000000 time/step:185.21s
695
+ [2025-11-30 00:49:20,350] - step:647/900 train_loss:0.9524 lr:0.0003000000 time/step:183.47s
696
+ [2025-11-30 00:52:23,936] - step:648/900 train_loss:0.9724 lr:0.0003000000 time/step:183.58s
697
+ [2025-11-30 00:55:32,534] - step:649/900 train_loss:1.0075 lr:0.0003000000 time/step:188.59s
698
+ [2025-11-30 00:58:37,981] - step:650/900 train_loss:0.9637 lr:0.0003000000 time/step:185.43s
699
+ [2025-11-30 01:01:43,633] - step:651/900 train_loss:0.9657 lr:0.0003000000 time/step:185.63s
700
+ [2025-11-30 01:04:53,089] - step:652/900 train_loss:0.9597 lr:0.0003000000 time/step:189.45s
701
+ [2025-11-30 01:08:00,352] - step:653/900 train_loss:0.9692 lr:0.0003000000 time/step:187.22s
702
+ [2025-11-30 01:11:07,645] - step:654/900 train_loss:0.9529 lr:0.0003000000 time/step:187.28s
703
+ [2025-11-30 01:14:14,239] - step:655/900 train_loss:0.9482 lr:0.0003000000 time/step:186.59s
704
+ [2025-11-30 01:17:20,123] - step:656/900 train_loss:0.9579 lr:0.0003000000 time/step:185.88s
705
+ [2025-11-30 01:20:25,496] - step:657/900 train_loss:0.9504 lr:0.0003000000 time/step:185.35s
706
+ [2025-11-30 01:23:31,180] - step:658/900 train_loss:0.9749 lr:0.0003000000 time/step:185.66s
707
+ [2025-11-30 01:26:37,555] - step:659/900 train_loss:0.9706 lr:0.0003000000 time/step:186.35s
708
+ [2025-11-30 01:29:43,411] - step:660/900 train_loss:0.9571 lr:0.0003000000 time/step:185.84s
709
+ [2025-11-30 01:32:49,562] - step:661/900 train_loss:0.9464 lr:0.0003000000 time/step:186.14s
710
+ [2025-11-30 01:35:57,969] - step:662/900 train_loss:0.9430 lr:0.0003000000 time/step:188.40s
711
+ [2025-11-30 01:39:04,057] - step:663/900 train_loss:0.9606 lr:0.0003000000 time/step:186.06s
712
+ [2025-11-30 01:42:08,918] - step:664/900 train_loss:0.9484 lr:0.0003000000 time/step:184.85s
713
+ [2025-11-30 01:45:15,790] - step:665/900 train_loss:0.9660 lr:0.0003000000 time/step:186.86s
714
+ [2025-11-30 01:48:21,042] - step:666/900 train_loss:0.9715 lr:0.0003000000 time/step:185.22s
715
+ [2025-11-30 01:51:25,399] - step:667/900 train_loss:0.9747 lr:0.0003000000 time/step:184.34s
716
+ [2025-11-30 01:54:31,595] - step:668/900 train_loss:0.9405 lr:0.0003000000 time/step:186.18s
717
+ [2025-11-30 01:57:37,951] - step:669/900 train_loss:0.9562 lr:0.0003000000 time/step:186.34s
718
+ [2025-11-30 02:00:44,059] - step:670/900 train_loss:0.9800 lr:0.0003000000 time/step:186.09s
719
+ [2025-11-30 02:03:49,586] - step:671/900 train_loss:0.9646 lr:0.0003000000 time/step:185.52s
720
+ [2025-11-30 02:06:57,124] - step:672/900 train_loss:0.9656 lr:0.0003000000 time/step:187.53s
721
+ [2025-11-30 02:10:03,956] - step:673/900 train_loss:0.9544 lr:0.0003000000 time/step:186.80s
722
+ [2025-11-30 02:13:09,941] - step:674/900 train_loss:0.9604 lr:0.0003000000 time/step:185.98s
723
+ [2025-11-30 02:16:17,892] - step:675/900 train_loss:0.9639 lr:0.0003000000 time/step:187.95s
724
+ [2025-11-30 02:19:23,974] - step:676/900 train_loss:0.9455 lr:0.0003000000 time/step:186.05s
725
+ [2025-11-30 02:22:30,221] - step:677/900 train_loss:0.9509 lr:0.0003000000 time/step:186.20s
726
+ [2025-11-30 02:25:37,961] - step:678/900 train_loss:0.9363 lr:0.0003000000 time/step:187.73s
727
+ [2025-11-30 02:28:44,267] - step:679/900 train_loss:0.9520 lr:0.0003000000 time/step:186.29s
728
+ [2025-11-30 02:31:50,617] - step:680/900 train_loss:0.9565 lr:0.0003000000 time/step:186.34s
729
+ [2025-11-30 02:34:58,672] - step:681/900 train_loss:0.9727 lr:0.0003000000 time/step:188.04s
730
+ [2025-11-30 02:38:05,140] - step:682/900 train_loss:0.9563 lr:0.0003000000 time/step:186.46s
731
+ [2025-11-30 02:41:09,992] - step:683/900 train_loss:0.9809 lr:0.0003000000 time/step:184.79s
732
+ [2025-11-30 02:44:15,338] - step:684/900 train_loss:0.9526 lr:0.0003000000 time/step:185.34s
733
+ [2025-11-30 02:47:21,385] - step:685/900 train_loss:0.9675 lr:0.0003000000 time/step:186.04s
734
+ [2025-11-30 02:50:25,872] - step:686/900 train_loss:0.9466 lr:0.0003000000 time/step:184.44s
735
+ [2025-11-30 02:53:31,333] - step:687/900 train_loss:0.9575 lr:0.0003000000 time/step:185.43s
736
+ [2025-11-30 02:56:38,782] - step:688/900 train_loss:0.9673 lr:0.0003000000 time/step:187.43s
737
+ [2025-11-30 02:59:44,702] - step:689/900 train_loss:0.9582 lr:0.0003000000 time/step:185.90s
738
+ [2025-11-30 03:02:50,929] - step:690/900 train_loss:0.9581 lr:0.0003000000 time/step:186.22s
739
+ [2025-11-30 03:05:57,119] - step:691/900 train_loss:0.9407 lr:0.0003000000 time/step:186.18s
740
+ [2025-11-30 03:09:02,467] - step:692/900 train_loss:0.9567 lr:0.0003000000 time/step:185.33s
741
+ [2025-11-30 03:12:07,335] - step:693/900 train_loss:0.9362 lr:0.0003000000 time/step:184.84s
742
+ [2025-11-30 03:15:14,078] - step:694/900 train_loss:0.9692 lr:0.0003000000 time/step:186.74s
743
+ [2025-11-30 03:18:20,680] - step:695/900 train_loss:0.9288 lr:0.0003000000 time/step:186.58s
744
+ [2025-11-30 03:21:26,753] - step:696/900 train_loss:0.9616 lr:0.0003000000 time/step:186.05s
745
+ [2025-11-30 03:24:32,423] - step:697/900 train_loss:0.9203 lr:0.0003000000 time/step:185.66s
746
+ [2025-11-30 03:27:41,442] - step:698/900 train_loss:0.9552 lr:0.0003000000 time/step:189.01s
747
+ [2025-11-30 03:30:45,196] - step:699/900 train_loss:0.9601 lr:0.0003000000 time/step:183.72s
748
+ [2025-11-30 03:33:51,669] - step:700/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@700.pt...
749
+ [2025-11-30 03:33:51,671] - step:700/900 train_loss:0.9515 lr:0.0003000000 time/step:184.76s
750
+ [2025-11-30 03:36:59,452] - step:701/900 train_loss:0.9587 lr:0.0003000000 time/step:187.77s
751
+ [2025-11-30 03:40:05,996] - step:702/900 train_loss:0.9688 lr:0.0003000000 time/step:186.44s
752
+ [2025-11-30 03:43:12,080] - step:703/900 train_loss:0.9386 lr:0.0003000000 time/step:186.06s
753
+ [2025-11-30 03:46:19,965] - step:704/900 train_loss:0.9925 lr:0.0003000000 time/step:187.88s
754
+ [2025-11-30 03:49:25,758] - step:705/900 train_loss:0.9425 lr:0.0003000000 time/step:185.77s
755
+ [2025-11-30 03:52:30,961] - step:706/900 train_loss:0.9720 lr:0.0003000000 time/step:185.19s
756
+ [2025-11-30 03:55:39,418] - step:707/900 train_loss:0.9434 lr:0.0003000000 time/step:188.44s
757
+ [2025-11-30 03:58:47,467] - step:708/900 train_loss:0.9549 lr:0.0003000000 time/step:188.03s
758
+ [2025-11-30 04:01:53,611] - step:709/900 train_loss:0.9511 lr:0.0003000000 time/step:186.12s
759
+ [2025-11-30 04:04:58,923] - step:710/900 train_loss:0.9714 lr:0.0003000000 time/step:185.31s
760
+ [2025-11-30 04:08:06,914] - step:711/900 train_loss:0.9647 lr:0.0003000000 time/step:187.98s
761
+ [2025-11-30 04:11:12,929] - step:712/900 train_loss:0.9789 lr:0.0003000000 time/step:185.96s
762
+ [2025-11-30 04:14:19,154] - step:713/900 train_loss:0.9418 lr:0.0003000000 time/step:186.22s
763
+ [2025-11-30 04:17:27,669] - step:714/900 train_loss:0.9417 lr:0.0003000000 time/step:188.50s
764
+ [2025-11-30 04:20:32,769] - step:715/900 train_loss:0.9507 lr:0.0003000000 time/step:185.08s
765
+ [2025-11-30 04:23:37,756] - step:716/900 train_loss:0.9567 lr:0.0003000000 time/step:184.98s
766
+ [2025-11-30 04:26:45,091] - step:717/900 train_loss:0.9389 lr:0.0003000000 time/step:187.32s
767
+ [2025-11-30 04:29:50,043] - step:718/900 train_loss:0.9477 lr:0.0003000000 time/step:184.87s
768
+ [2025-11-30 04:32:53,971] - step:719/900 train_loss:0.9619 lr:0.0003000000 time/step:183.92s
769
+ [2025-11-30 04:36:00,320] - step:720/900 train_loss:0.9533 lr:0.0003000000 time/step:186.34s
770
+ [2025-11-30 04:39:07,896] - step:721/900 train_loss:0.9650 lr:0.0003000000 time/step:187.55s
771
+ [2025-11-30 04:42:13,833] - step:722/900 train_loss:0.9603 lr:0.0003000000 time/step:185.91s
772
+ [2025-11-30 04:45:20,122] - step:723/900 train_loss:0.9604 lr:0.0003000000 time/step:186.28s
773
+ [2025-11-30 04:48:28,513] - step:724/900 train_loss:0.9635 lr:0.0003000000 time/step:188.38s
774
+ [2025-11-30 04:51:34,485] - step:725/900 train_loss:0.9550 lr:0.0003000000 time/step:185.94s
775
+ [2025-11-30 04:54:40,827] - step:726/900 train_loss:0.9679 lr:0.0003000000 time/step:186.34s
776
+ [2025-11-30 04:57:50,319] - step:727/900 train_loss:0.9607 lr:0.0003000000 time/step:189.46s
777
+ [2025-11-30 05:00:56,724] - step:728/900 train_loss:0.9880 lr:0.0003000000 time/step:186.35s
778
+ [2025-11-30 05:04:02,482] - step:729/900 train_loss:0.9358 lr:0.0003000000 time/step:185.75s
779
+ [2025-11-30 05:07:10,367] - step:730/900 train_loss:0.9521 lr:0.0003000000 time/step:187.88s
780
+ [2025-11-30 05:10:16,528] - step:731/900 train_loss:0.9466 lr:0.0003000000 time/step:186.13s
781
+ [2025-11-30 05:13:22,743] - step:732/900 train_loss:0.9481 lr:0.0003000000 time/step:186.21s
782
+ [2025-11-30 05:16:29,572] - step:733/900 train_loss:0.9613 lr:0.0003000000 time/step:186.81s
783
+ [2025-11-30 05:19:37,538] - step:734/900 train_loss:0.9525 lr:0.0003000000 time/step:187.96s
784
+ [2025-11-30 05:22:40,037] - step:735/900 train_loss:0.9457 lr:0.0003000000 time/step:182.48s
785
+ [2025-11-30 05:25:46,040] - step:736/900 train_loss:0.9572 lr:0.0003000000 time/step:185.97s
786
+ [2025-11-30 05:29:04,236] - step:737/900 train_loss:0.9545 lr:0.0003000000 time/step:196.30s
787
+ [2025-11-30 05:32:09,010] - step:738/900 train_loss:0.9633 lr:0.0003000000 time/step:184.76s
788
+ [2025-11-30 05:35:14,741] - step:739/900 train_loss:0.9598 lr:0.0003000000 time/step:185.72s
789
+ [2025-11-30 05:38:21,357] - step:740/900 train_loss:0.9342 lr:0.0003000000 time/step:186.60s
790
+ [2025-11-30 05:41:26,232] - step:741/900 train_loss:0.9550 lr:0.0003000000 time/step:184.84s
791
+ [2025-11-30 05:44:31,223] - step:742/900 train_loss:0.9696 lr:0.0003000000 time/step:184.98s
792
+ [2025-11-30 05:47:42,033] - step:743/900 train_loss:0.9468 lr:0.0003000000 time/step:190.80s
793
+ [2025-11-30 05:50:47,075] - step:744/900 train_loss:0.9588 lr:0.0003000000 time/step:184.98s
794
+ [2025-11-30 05:53:51,033] - step:745/900 train_loss:0.9498 lr:0.0003000000 time/step:183.94s
795
+ [2025-11-30 05:56:57,016] - step:746/900 train_loss:0.9529 lr:0.0003000000 time/step:185.97s
796
+ [2025-11-30 06:00:01,884] - step:747/900 train_loss:0.9376 lr:0.0003000000 time/step:184.84s
797
+ [2025-11-30 06:03:06,392] - step:748/900 train_loss:0.9415 lr:0.0003000000 time/step:184.49s
798
+ [2025-11-30 06:06:13,954] - step:749/900 train_loss:0.9581 lr:0.0003000000 time/step:187.55s
799
+ [2025-11-30 06:09:18,747] - step:750/900 train_loss:0.9494 lr:0.0003000000 time/step:184.77s
800
+ [2025-11-30 06:12:24,279] - step:751/900 train_loss:0.9586 lr:0.0003000000 time/step:185.52s
801
+ [2025-11-30 06:15:30,040] - step:752/900 train_loss:0.9491 lr:0.0003000000 time/step:185.75s
802
+ [2025-11-30 06:18:37,170] - step:753/900 train_loss:0.9585 lr:0.0003000000 time/step:187.12s
803
+ [2025-11-30 06:21:42,398] - step:754/900 train_loss:0.9441 lr:0.0003000000 time/step:185.20s
804
+ [2025-11-30 06:24:48,671] - step:755/900 train_loss:0.9533 lr:0.0003000000 time/step:186.25s
805
+ [2025-11-30 06:27:56,633] - step:756/900 train_loss:0.9433 lr:0.0003000000 time/step:187.94s
806
+ [2025-11-30 06:31:02,691] - step:757/900 train_loss:0.9368 lr:0.0003000000 time/step:186.01s
807
+ [2025-11-30 06:34:08,615] - step:758/900 train_loss:0.9504 lr:0.0003000000 time/step:185.91s
808
+ [2025-11-30 06:37:15,950] - step:759/900 train_loss:0.9412 lr:0.0003000000 time/step:187.31s
809
+ [2025-11-30 06:40:22,539] - step:760/900 train_loss:0.9330 lr:0.0003000000 time/step:186.51s
810
+ [2025-11-30 06:43:28,876] - step:761/900 train_loss:0.9342 lr:0.0003000000 time/step:186.33s
811
+ [2025-11-30 06:46:36,580] - step:762/900 train_loss:0.9329 lr:0.0003000000 time/step:187.68s
812
+ [2025-11-30 06:49:43,404] - step:763/900 train_loss:0.9465 lr:0.0003000000 time/step:186.79s
813
+ [2025-11-30 06:52:49,437] - step:764/900 train_loss:0.9507 lr:0.0003000000 time/step:186.01s
814
+ [2025-11-30 06:55:55,801] - step:765/900 train_loss:0.9754 lr:0.0003000000 time/step:186.35s
815
+ [2025-11-30 06:59:04,165] - step:766/900 train_loss:0.9323 lr:0.0003000000 time/step:188.35s
816
+ [2025-11-30 07:02:09,611] - step:767/900 train_loss:0.9398 lr:0.0003000000 time/step:185.37s
817
+ [2025-11-30 07:05:15,543] - step:768/900 train_loss:0.9773 lr:0.0003000000 time/step:185.92s
818
+ [2025-11-30 07:08:23,040] - step:769/900 train_loss:0.9300 lr:0.0003000000 time/step:187.49s
819
+ [2025-11-30 07:11:27,989] - step:770/900 train_loss:0.9565 lr:0.0003000000 time/step:184.93s
820
+ [2025-11-30 07:14:34,166] - step:771/900 train_loss:0.9791 lr:0.0003000000 time/step:186.17s
821
+ [2025-11-30 07:17:41,334] - step:772/900 train_loss:0.9323 lr:0.0003000000 time/step:187.15s
822
+ [2025-11-30 07:20:48,245] - step:773/900 train_loss:0.9384 lr:0.0003000000 time/step:186.89s
823
+ [2025-11-30 07:23:55,000] - step:774/900 train_loss:0.9620 lr:0.0003000000 time/step:186.75s
824
+ [2025-11-30 07:27:04,805] - step:775/900 train_loss:0.9535 lr:0.0003000000 time/step:189.79s
825
+ [2025-11-30 07:30:11,933] - step:776/900 train_loss:0.9500 lr:0.0003000000 time/step:187.11s
826
+ [2025-11-30 07:33:18,809] - step:777/900 train_loss:0.9556 lr:0.0003000000 time/step:186.84s
827
+ [2025-11-30 07:36:25,083] - step:778/900 train_loss:0.9280 lr:0.0003000000 time/step:186.27s
828
+ [2025-11-30 07:39:34,387] - step:779/900 train_loss:0.9373 lr:0.0003000000 time/step:189.30s
829
+ [2025-11-30 07:42:40,512] - step:780/900 train_loss:0.9556 lr:0.0003000000 time/step:186.10s
830
+ [2025-11-30 07:45:46,330] - step:781/900 train_loss:0.9568 lr:0.0003000000 time/step:185.80s
831
+ [2025-11-30 07:48:53,453] - step:782/900 train_loss:0.9737 lr:0.0003000000 time/step:187.11s
832
+ [2025-11-30 07:51:59,918] - step:783/900 train_loss:0.9267 lr:0.0003000000 time/step:186.44s
833
+ [2025-11-30 07:55:06,653] - step:784/900 train_loss:0.9683 lr:0.0003000000 time/step:186.73s
834
+ [2025-11-30 07:58:14,271] - step:785/900 train_loss:0.9249 lr:0.0003000000 time/step:187.60s
835
+ [2025-11-30 08:01:21,688] - step:786/900 train_loss:0.9586 lr:0.0003000000 time/step:187.32s
836
+ [2025-11-30 08:04:28,087] - step:787/900 train_loss:0.9470 lr:0.0003000000 time/step:186.39s
837
+ [2025-11-30 08:07:34,899] - step:788/900 train_loss:0.9591 lr:0.0003000000 time/step:186.79s
838
+ [2025-11-30 08:10:41,723] - step:789/900 train_loss:0.9433 lr:0.0003000000 time/step:186.81s
839
+ [2025-11-30 08:13:47,670] - step:790/900 train_loss:0.9496 lr:0.0003000000 time/step:185.92s
840
+ [2025-11-30 08:16:53,831] - step:791/900 train_loss:0.9459 lr:0.0003000000 time/step:186.15s
841
+ [2025-11-30 08:20:01,102] - step:792/900 train_loss:0.9601 lr:0.0003000000 time/step:187.25s
842
+ [2025-11-30 08:23:06,763] - step:793/900 train_loss:0.9408 lr:0.0003000000 time/step:185.64s
843
+ [2025-11-30 08:26:12,187] - step:794/900 train_loss:0.9571 lr:0.0003000000 time/step:185.41s
844
+ [2025-11-30 08:29:18,964] - step:795/900 train_loss:0.9670 lr:0.0003000000 time/step:186.77s
845
+ [2025-11-30 08:32:24,852] - step:796/900 train_loss:0.9432 lr:0.0003000000 time/step:185.86s
846
+ [2025-11-30 08:35:30,345] - step:797/900 train_loss:0.9347 lr:0.0003000000 time/step:185.49s
847
+ [2025-11-30 08:38:37,005] - step:798/900 train_loss:0.9431 lr:0.0003000000 time/step:186.65s
848
+ [2025-11-30 08:41:44,291] - step:799/900 train_loss:0.9548 lr:0.0003000000 time/step:187.24s
849
+ [2025-11-30 08:44:52,246] - step:800/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@800.pt...
850
+ [2025-11-30 08:44:52,247] - step:800/900 train_loss:0.9472 lr:0.0003000000 time/step:186.19s
851
+ [2025-11-30 08:47:59,432] - step:801/900 train_loss:0.9580 lr:0.0003000000 time/step:187.17s
852
+ [2025-11-30 08:51:07,677] - step:802/900 train_loss:0.9347 lr:0.0003000000 time/step:188.21s
853
+ [2025-11-30 08:54:13,610] - step:803/900 train_loss:0.9552 lr:0.0003000000 time/step:185.91s
854
+ [2025-11-30 08:57:20,822] - step:804/900 train_loss:0.9433 lr:0.0003000000 time/step:187.20s
855
+ [2025-11-30 09:00:27,788] - step:805/900 train_loss:0.9725 lr:0.0003000000 time/step:186.95s
856
+ [2025-11-30 09:03:33,451] - step:806/900 train_loss:0.9319 lr:0.0003000000 time/step:185.63s
857
+ [2025-11-30 09:06:38,728] - step:807/900 train_loss:0.9416 lr:0.0003000000 time/step:185.26s
858
+ [2025-11-30 09:09:45,243] - step:808/900 train_loss:0.9305 lr:0.0003000000 time/step:186.49s
859
+ [2025-11-30 09:12:51,277] - step:809/900 train_loss:0.9611 lr:0.0003000000 time/step:186.01s
860
+ [2025-11-30 09:15:57,601] - step:810/900 train_loss:0.9333 lr:0.0003000000 time/step:186.32s
861
+ [2025-11-30 09:19:05,758] - step:811/900 train_loss:0.9224 lr:0.0003000000 time/step:188.14s
862
+ [2025-11-30 09:22:13,002] - step:812/900 train_loss:0.9311 lr:0.0003000000 time/step:187.20s
863
+ [2025-11-30 09:25:19,895] - step:813/900 train_loss:0.9344 lr:0.0003000000 time/step:186.89s
864
+ [2025-11-30 09:28:27,220] - step:814/900 train_loss:0.9558 lr:0.0003000000 time/step:187.31s
865
+ [2025-11-30 09:31:34,977] - step:815/900 train_loss:0.9603 lr:0.0003000000 time/step:187.65s
866
+ [2025-11-30 09:34:41,403] - step:816/900 train_loss:0.9405 lr:0.0003000000 time/step:186.42s
867
+ [2025-11-30 09:37:48,995] - step:817/900 train_loss:0.9620 lr:0.0003000000 time/step:187.58s
868
+ [2025-11-30 09:40:56,468] - step:818/900 train_loss:0.9415 lr:0.0003000000 time/step:187.44s
869
+ [2025-11-30 09:44:02,893] - step:819/900 train_loss:0.9391 lr:0.0003000000 time/step:186.41s
870
+ [2025-11-30 09:47:08,977] - step:820/900 train_loss:0.9551 lr:0.0003000000 time/step:186.08s
871
+ [2025-11-30 09:50:18,823] - step:821/900 train_loss:0.9585 lr:0.0003000000 time/step:189.84s
872
+ [2025-11-30 09:53:24,812] - step:822/900 train_loss:0.9449 lr:0.0003000000 time/step:185.94s
873
+ [2025-11-30 09:56:30,627] - step:823/900 train_loss:0.9446 lr:0.0003000000 time/step:185.81s
874
+ [2025-11-30 09:59:38,002] - step:824/900 train_loss:0.9589 lr:0.0003000000 time/step:187.37s
875
+ [2025-11-30 10:02:45,105] - step:825/900 train_loss:0.9660 lr:0.0003000000 time/step:187.08s
876
+ [2025-11-30 10:05:50,874] - step:826/900 train_loss:0.9403 lr:0.0003000000 time/step:185.75s
877
+ [2025-11-30 10:08:57,938] - step:827/900 train_loss:0.9435 lr:0.0003000000 time/step:187.06s
878
+ [2025-11-30 10:12:06,575] - step:828/900 train_loss:0.9462 lr:0.0003000000 time/step:188.61s
879
+ [2025-11-30 10:15:12,880] - step:829/900 train_loss:0.9383 lr:0.0003000000 time/step:186.30s
880
+ [2025-11-30 10:18:19,887] - step:830/900 train_loss:0.9513 lr:0.0003000000 time/step:187.00s
881
+ [2025-11-30 10:21:26,611] - step:831/900 train_loss:0.9434 lr:0.0003000000 time/step:186.71s
882
+ [2025-11-30 10:24:32,173] - step:832/900 train_loss:0.9277 lr:0.0003000000 time/step:185.55s
883
+ [2025-11-30 10:27:38,320] - step:833/900 train_loss:0.9638 lr:0.0003000000 time/step:186.13s
884
+ [2025-11-30 10:30:45,530] - step:834/900 train_loss:0.9344 lr:0.0003000000 time/step:187.17s
885
+ [2025-11-30 10:33:51,305] - step:835/900 train_loss:0.9318 lr:0.0003000000 time/step:185.76s
886
+ [2025-11-30 10:36:56,815] - step:836/900 train_loss:0.9660 lr:0.0003000000 time/step:185.50s
887
+ [2025-11-30 10:40:03,772] - step:837/900 train_loss:0.9189 lr:0.0003000000 time/step:186.95s
888
+ [2025-11-30 10:43:10,385] - step:838/900 train_loss:0.9294 lr:0.0003000000 time/step:186.60s
889
+ [2025-11-30 10:46:16,106] - step:839/900 train_loss:0.9562 lr:0.0003000000 time/step:185.71s
890
+ [2025-11-30 10:49:23,701] - step:840/900 train_loss:0.9308 lr:0.0003000000 time/step:187.59s
891
+ [2025-11-30 10:52:31,109] - step:841/900 train_loss:0.9446 lr:0.0003000000 time/step:187.37s
892
+ [2025-11-30 10:55:36,766] - step:842/900 train_loss:0.9646 lr:0.0003000000 time/step:185.64s
893
+ [2025-11-30 10:58:43,991] - step:843/900 train_loss:0.9662 lr:0.0003000000 time/step:187.22s
894
+ [2025-11-30 11:01:51,907] - step:844/900 train_loss:0.9557 lr:0.0003000000 time/step:187.91s
895
+ [2025-11-30 11:04:58,972] - step:845/900 train_loss:0.9409 lr:0.0003000000 time/step:187.04s
896
+ [2025-11-30 11:08:05,417] - step:846/900 train_loss:0.9277 lr:0.0003000000 time/step:186.44s
897
+ [2025-11-30 11:11:12,807] - step:847/900 train_loss:0.9310 lr:0.0003000000 time/step:187.37s
898
+ [2025-11-30 11:14:18,599] - step:848/900 train_loss:0.9528 lr:0.0003000000 time/step:185.78s
899
+ [2025-11-30 11:17:24,283] - step:849/900 train_loss:0.9435 lr:0.0003000000 time/step:185.67s
900
+ [2025-11-30 11:20:29,958] - step:850/900 train_loss:0.9328 lr:0.0003000000 time/step:185.67s
901
+ [2025-11-30 11:23:37,238] - step:851/900 train_loss:0.9586 lr:0.0003000000 time/step:187.25s
902
+ [2025-11-30 11:26:43,509] - step:852/900 train_loss:0.9788 lr:0.0003000000 time/step:186.26s
903
+ [2025-11-30 11:29:50,577] - step:853/900 train_loss:0.9598 lr:0.0003000000 time/step:187.04s
904
+ [2025-11-30 11:32:58,592] - step:854/900 train_loss:0.9314 lr:0.0003000000 time/step:187.98s
905
+ [2025-11-30 11:36:05,368] - step:855/900 train_loss:0.9431 lr:0.0003000000 time/step:186.76s
906
+ [2025-11-30 11:39:14,068] - step:856/900 train_loss:0.9402 lr:0.0003000000 time/step:188.69s
907
+ [2025-11-30 11:42:21,149] - step:857/900 train_loss:0.9406 lr:0.0003000000 time/step:187.03s
908
+ [2025-11-30 11:45:27,269] - step:858/900 train_loss:0.9517 lr:0.0003000000 time/step:186.10s
909
+ [2025-11-30 11:48:33,589] - step:859/900 train_loss:0.9288 lr:0.0003000000 time/step:186.29s
910
+ [2025-11-30 11:51:41,526] - step:860/900 train_loss:0.9489 lr:0.0003000000 time/step:187.92s
911
+ [2025-11-30 11:54:48,310] - step:861/900 train_loss:0.9242 lr:0.0003000000 time/step:186.76s
912
+ [2025-11-30 11:57:55,433] - step:862/900 train_loss:0.9465 lr:0.0003000000 time/step:187.12s
913
+ [2025-11-30 12:01:02,214] - step:863/900 train_loss:0.9319 lr:0.0003000000 time/step:186.77s
914
+ [2025-11-30 12:04:10,157] - step:864/900 train_loss:0.9561 lr:0.0003000000 time/step:187.93s
915
+ [2025-11-30 12:07:16,580] - step:865/900 train_loss:0.9531 lr:0.0003000000 time/step:186.41s
916
+ [2025-11-30 12:10:24,225] - step:866/900 train_loss:0.9716 lr:0.0003000000 time/step:187.64s
917
+ [2025-11-30 12:13:32,116] - step:867/900 train_loss:0.9523 lr:0.0003000000 time/step:187.86s
918
+ [2025-11-30 12:16:38,751] - step:868/900 train_loss:0.9485 lr:0.0003000000 time/step:186.63s
919
+ [2025-11-30 12:19:46,482] - step:869/900 train_loss:0.9338 lr:0.0003000000 time/step:187.71s
920
+ [2025-11-30 12:22:55,768] - step:870/900 train_loss:0.9071 lr:0.0003000000 time/step:189.25s
921
+ [2025-11-30 12:26:03,017] - step:871/900 train_loss:0.9349 lr:0.0003000000 time/step:187.24s
922
+ [2025-11-30 12:29:11,277] - step:872/900 train_loss:0.9171 lr:0.0003000000 time/step:188.25s
923
+ [2025-11-30 12:32:17,975] - step:873/900 train_loss:0.9291 lr:0.0003000000 time/step:186.69s
924
+ [2025-11-30 12:35:23,652] - step:874/900 train_loss:0.9496 lr:0.0003000000 time/step:185.66s
925
+ [2025-11-30 12:38:30,634] - step:875/900 train_loss:0.9004 lr:0.0003000000 time/step:186.96s
926
+ [2025-11-30 12:41:36,335] - step:876/900 train_loss:0.9638 lr:0.0003000000 time/step:185.69s
927
+ [2025-11-30 12:44:43,698] - step:877/900 train_loss:0.9303 lr:0.0003000000 time/step:187.35s
928
+ [2025-11-30 12:47:49,883] - step:878/900 train_loss:0.9308 lr:0.0003000000 time/step:186.17s
929
+ [2025-11-30 12:50:57,119] - step:879/900 train_loss:0.9567 lr:0.0003000000 time/step:187.23s
930
+ [2025-11-30 12:54:05,570] - step:880/900 train_loss:0.9294 lr:0.0003000000 time/step:188.35s
931
+ [2025-11-30 12:57:11,908] - step:881/900 train_loss:0.9243 lr:0.0003000000 time/step:186.32s
932
+ [2025-11-30 13:00:18,512] - step:882/900 train_loss:0.9372 lr:0.0003000000 time/step:186.59s
933
+ [2025-11-30 13:03:25,956] - step:883/900 train_loss:0.9677 lr:0.0003000000 time/step:187.41s
934
+ [2025-11-30 13:06:32,030] - step:884/900 train_loss:0.9502 lr:0.0003000000 time/step:186.05s
935
+ [2025-11-30 13:09:38,746] - step:885/900 train_loss:0.9309 lr:0.0003000000 time/step:186.69s
936
+ [2025-11-30 13:12:45,777] - step:886/900 train_loss:0.9468 lr:0.0003000000 time/step:186.96s
937
+ [2025-11-30 13:15:51,552] - step:887/900 train_loss:0.9319 lr:0.0003000000 time/step:185.76s
938
+ [2025-11-30 13:18:58,191] - step:888/900 train_loss:0.9400 lr:0.0003000000 time/step:186.63s
939
+ [2025-11-30 13:22:04,951] - step:889/900 train_loss:0.9518 lr:0.0003000000 time/step:186.75s
940
+ [2025-11-30 13:25:13,370] - step:890/900 train_loss:0.9375 lr:0.0003000000 time/step:188.37s
941
+ [2025-11-30 13:28:19,675] - step:891/900 train_loss:0.9699 lr:0.0003000000 time/step:186.29s
942
+ [2025-11-30 13:31:27,143] - step:892/900 train_loss:0.9479 lr:0.0003000000 time/step:187.46s
943
+ [2025-11-30 13:34:34,338] - step:893/900 train_loss:0.9351 lr:0.0003000000 time/step:187.14s
944
+ [2025-11-30 13:37:40,472] - step:894/900 train_loss:0.9767 lr:0.0003000000 time/step:186.13s
945
+ [2025-11-30 13:40:48,083] - step:895/900 train_loss:0.9475 lr:0.0003000000 time/step:187.60s
946
+ [2025-11-30 13:43:55,818] - step:896/900 train_loss:0.9617 lr:0.0003000000 time/step:187.71s
947
+ [2025-11-30 13:47:01,872] - step:897/900 train_loss:0.9549 lr:0.0003000000 time/step:186.04s
948
+ [2025-11-30 13:50:09,170] - step:898/900 train_loss:0.9324 lr:0.0003000000 time/step:187.29s
949
+ [2025-11-30 13:53:18,377] - step:899/900 train_loss:0.9573 lr:0.0003000000 time/step:189.17s
950
+ [2025-11-30 13:56:26,957] - step:900/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@900.pt...
951
+ [2025-11-30 13:56:26,959] - step:900/900 train_loss:0.9387 lr:0.0003000000 time/step:186.63s
wandb/run-20251128_151948-j8dmy8fe/files/requirements.txt ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ parso==0.8.4
2
+ pydantic_core==2.27.2
3
+ charset-normalizer==3.4.1
4
+ xxhash==3.5.0
5
+ PyYAML==6.0.2
6
+ transformers==4.49.0
7
+ idna==3.10
8
+ nvidia-cudnn-cu12==9.1.0.70
9
+ datasets==4.0.0
10
+ numpy==2.2.3
11
+ hydra-core==1.3.2
12
+ Pygments==2.19.1
13
+ rich==14.0.0
14
+ nvidia-cusolver-cu12==11.6.1.9
15
+ urllib3==2.3.0
16
+ nvidia-cusparselt-cu12==0.6.2
17
+ contourpy==1.3.1
18
+ cycler==0.12.1
19
+ decorator==5.2.1
20
+ psutil==7.0.0
21
+ aiohttp==3.11.13
22
+ einops==0.8.1
23
+ nvidia-cuda-runtime-cu12==12.4.127
24
+ exceptiongroup==1.2.2
25
+ stack-data==0.6.3
26
+ setproctitle==1.3.5
27
+ fsspec==2024.12.0
28
+ tueplots==0.2.0
29
+ pexpect==4.9.0
30
+ gitdb==4.0.12
31
+ fonttools==4.56.0
32
+ ipython==8.35.0
33
+ huggingface-hub==0.29.2
34
+ filelock==3.17.0
35
+ torchvision==0.21.0+cu124
36
+ platformdirs==4.3.6
37
+ peft==0.15.1
38
+ nvidia-cuda-nvrtc-cu12==12.4.127
39
+ wandb==0.19.8
40
+ click==8.1.8
41
+ mpmath==1.3.0
42
+ Jinja2==3.1.6
43
+ scipy==1.14.1
44
+ markdown-it-py==3.0.0
45
+ matplotlib-inline==0.1.7
46
+ wheel==0.45.1
47
+ setuptools==75.8.2
48
+ tqdm==4.67.1
49
+ antlr4-python3-runtime==4.9.3
50
+ deepspeed==0.16.7
51
+ omegaconf==2.3.0
52
+ torchaudio==2.6.0+cu124
53
+ aiosignal==1.3.2
54
+ accelerate==1.6.0
55
+ py-cpuinfo==9.0.0
56
+ pyparsing==3.2.1
57
+ ninja==1.11.1.4
58
+ pandas==2.2.3
59
+ six==1.17.0
60
+ wcwidth==0.2.13
61
+ safetensors==0.5.3
62
+ attrs==25.1.0
63
+ python-dateutil==2.9.0.post0
64
+ nvidia-cufft-cu12==11.2.1.3
65
+ multiprocess==0.70.16
66
+ seaborn==0.13.2
67
+ networkx==3.4.2
68
+ regex==2024.11.6
69
+ nvidia-nvtx-cu12==12.4.127
70
+ tokenizers==0.21.0
71
+ nvidia-curand-cu12==10.3.5.147
72
+ nvidia-nvjitlink-cu12==12.4.127
73
+ MarkupSafe==3.0.2
74
+ triton==3.1.0
75
+ pip==25.0.1
76
+ jedi==0.19.2
77
+ nvidia-cublas-cu12==12.4.5.8
78
+ iniconfig==2.0.0
79
+ pluggy==1.5.0
80
+ langdetect==1.0.9
81
+ pure_eval==0.2.3
82
+ docker-pycreds==0.4.0
83
+ libcirkit==0.2.1
84
+ mdurl==0.1.2
85
+ annotated-types==0.7.0
86
+ sentry-sdk==2.22.0
87
+ executing==2.2.0
88
+ pydantic==2.10.6
89
+ opt_einsum==3.4.0
90
+ pytz==2025.1
91
+ nvidia-cuda-cupti-cu12==12.4.127
92
+ protobuf==5.29.3
93
+ requests==2.32.3
94
+ tomli==2.2.1
95
+ matplotlib==3.10.1
96
+ hjson==3.1.0
97
+ frozenlist==1.5.0
98
+ pillow==11.1.0
99
+ GitPython==3.1.44
100
+ typing_extensions==4.12.2
101
+ pyarrow==19.0.1
102
+ propcache==0.3.0
103
+ prompt_toolkit==3.0.51
104
+ torch==2.6.0+cu124
105
+ async-timeout==5.0.1
106
+ bitsandbytes==0.45.5
107
+ trl==0.16.1
108
+ ptyprocess==0.7.0
109
+ dill==0.3.8
110
+ pytest==8.3.5
111
+ nvidia-nccl-cu12==2.21.5
112
+ sympy==1.13.1
113
+ flash_attn==2.7.4.post1
114
+ certifi==2025.1.31
115
+ nvidia-cusparse-cu12==12.3.1.170
116
+ tzdata==2025.1
117
+ aiohappyeyeballs==2.5.0
118
+ msgpack==1.1.0
119
+ traitlets==5.14.3
120
+ multidict==6.1.0
121
+ packaging==24.2
122
+ kiwisolver==1.4.8
123
+ smmap==5.0.2
124
+ asttokens==3.0.0
125
+ yarl==1.18.3
126
+ graphviz==0.20.3
wandb/run-20251128_151948-j8dmy8fe/files/wandb-metadata.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-57-generic-x86_64-with-glibc2.39",
3
+ "python": "CPython 3.10.16",
4
+ "startedAt": "2025-11-28T15:19:48.470090Z",
5
+ "args": [
6
+ "data=tulu3-llama3-packed",
7
+ "training=tulu3-evabyte-1epoch",
8
+ "lm=llama3-2-3b-byte",
9
+ "model=mtp",
10
+ "adaptor=none",
11
+ "mt_head=linear-evabyte",
12
+ "circuit=btree",
13
+ "circuit.n_token=8",
14
+ "circuit.n_component=32",
15
+ "circuit.n_repetition=1",
16
+ "training.device_batch_size=1",
17
+ "model.model.beta=0",
18
+ "model.model.gamma=0.9",
19
+ "data.val_bin=null",
20
+ "training.learning_rate=0.0003",
21
+ "training.expname=llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1"
22
+ ],
23
+ "program": "-m mtp.train",
24
+ "git": {
25
+ "remote": "git@github.com:PiotrNawrot/nanoGPT.git",
26
+ "commit": "348442692ab18a9196652fdb2c860734ac87a6f4"
27
+ },
28
+ "email": "agrivas@inf.ed.ac.uk",
29
+ "root": "/disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34",
30
+ "host": "scotia04.inf.ed.ac.uk",
31
+ "executable": "/home/agrivas/nanoGPT/.venv/bin/python3",
32
+ "cpu_count": 24,
33
+ "cpu_count_logical": 48,
34
+ "gpu": "NVIDIA L40S",
35
+ "gpu_count": 2,
36
+ "disk": {
37
+ "/": {
38
+ "total": "184643391488",
39
+ "used": "38313848832"
40
+ }
41
+ },
42
+ "memory": {
43
+ "total": "540522954752"
44
+ },
45
+ "cpu": {
46
+ "count": 24,
47
+ "countLogical": 48
48
+ },
49
+ "gpu_nvidia": [
50
+ {
51
+ "name": "NVIDIA L40S",
52
+ "memoryTotal": "48305799168",
53
+ "cudaCores": 18176,
54
+ "architecture": "Ada"
55
+ },
56
+ {
57
+ "name": "NVIDIA L40S",
58
+ "memoryTotal": "48305799168",
59
+ "cudaCores": 18176,
60
+ "architecture": "Ada"
61
+ }
62
+ ],
63
+ "slurm": {
64
+ "cluster_name": "landoniacluster",
65
+ "conf": "/etc/slurm/slurm.conf",
66
+ "cpus_on_node": "12",
67
+ "cpus_per_gpu": "6",
68
+ "gpus_on_node": "2",
69
+ "gtids": "0",
70
+ "job_account": "research-staff",
71
+ "job_cpus_per_node": "12",
72
+ "job_end_time": "1764774672",
73
+ "job_gid": "10000",
74
+ "job_gpus": "1,3",
75
+ "job_id": "2137456",
76
+ "job_name": "slurm.sh",
77
+ "job_nodelist": "scotia04",
78
+ "job_num_nodes": "1",
79
+ "job_partition": "PGR-Standard",
80
+ "job_qos": "normal",
81
+ "job_start_time": "1764342672",
82
+ "job_uid": "1782564",
83
+ "job_user": "agrivas",
84
+ "jobid": "2137456",
85
+ "localid": "0",
86
+ "mem_per_node": "48000",
87
+ "nnodes": "1",
88
+ "nodeid": "0",
89
+ "nodelist": "scotia04",
90
+ "nprocs": "1",
91
+ "ntasks": "1",
92
+ "prio_process": "0",
93
+ "procid": "0",
94
+ "submit_dir": "/home/agrivas",
95
+ "submit_host": "hastings.inf.ed.ac.uk",
96
+ "task_pid": "3728705",
97
+ "tasks_per_node": "1",
98
+ "topology_addr": "scotia04",
99
+ "topology_addr_pattern": "node"
100
+ },
101
+ "cudaVersion": "12.8"
102
+ }
wandb/run-20251128_151948-j8dmy8fe/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_step":899,"_runtime":167798.529724435,"train/ce_loss_at_5":1.5002950429916382,"train/ce_loss_at_1":0.23747298121452332,"_timestamp":1.7645109869604163e+09,"_wandb":{"runtime":167800},"train/ce_loss_at_4":0.9499043822288513,"global_step":900,"train/ce_loss_at_8":1.6754989624023438,"train/ce_loss_at_2":0.41758430004119873,"train/ce_loss_at_3":0.724645733833313,"train/ce_loss_at_7":1.6040127277374268,"train/loss":0.938701868057251,"train/ce_loss_at_6":1.3807265758514404}
wandb/run-20251128_151948-j8dmy8fe/logs/debug-core.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-11-28T15:19:45.880791352Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpio8m7k5j/port-3738330.txt","pid":3738330,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-11-28T15:19:45.884567829Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":3738330}
3
+ {"time":"2025-11-28T15:19:45.884391029Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":35531,"Zone":""}}
4
+ {"time":"2025-11-28T15:19:46.014595986Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:41260"}
5
+ {"time":"2025-11-28T15:19:48.474827439Z","level":"INFO","msg":"handleInformInit: received","streamId":"j8dmy8fe","id":"127.0.0.1:41260"}
6
+ {"time":"2025-11-28T15:19:48.699110621Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"j8dmy8fe","id":"127.0.0.1:41260"}
7
+ {"time":"2025-11-30T13:56:28.649444866Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:41260"}
8
+ {"time":"2025-11-30T13:56:28.751426764Z","level":"INFO","msg":"server is shutting down"}
9
+ {"time":"2025-11-30T13:56:28.765190328Z","level":"INFO","msg":"connection: closing","id":"127.0.0.1:41260"}
10
+ {"time":"2025-11-30T13:56:28.770779415Z","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:41260"}
11
+ {"time":"2025-11-30T13:56:30.60432487Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:41260"}
12
+ {"time":"2025-11-30T13:56:30.610007207Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:41260"}
13
+ {"time":"2025-11-30T13:56:30.61641628Z","level":"INFO","msg":"server is closed"}
wandb/run-20251128_151948-j8dmy8fe/logs/debug-internal.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-11-28T15:19:48.477578816Z","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/run-20251128_151948-j8dmy8fe/logs/debug-core.log"}
2
+ {"time":"2025-11-28T15:19:48.699009069Z","level":"INFO","msg":"created new stream","id":"j8dmy8fe"}
3
+ {"time":"2025-11-28T15:19:48.699097403Z","level":"INFO","msg":"stream: started","id":"j8dmy8fe"}
4
+ {"time":"2025-11-28T15:19:48.699172779Z","level":"INFO","msg":"writer: Do: started","stream_id":"j8dmy8fe"}
5
+ {"time":"2025-11-28T15:19:48.699248275Z","level":"INFO","msg":"handler: started","stream_id":"j8dmy8fe"}
6
+ {"time":"2025-11-28T15:19:48.699271222Z","level":"INFO","msg":"sender: started","stream_id":"j8dmy8fe"}
7
+ {"time":"2025-11-28T15:19:49.160995748Z","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-11-28T20:38:35.352286792Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
9
+ {"time":"2025-11-30T13:56:28.774510068Z","level":"INFO","msg":"stream: closing","id":"j8dmy8fe"}
10
+ {"time":"2025-11-30T13:56:28.784684257Z","level":"INFO","msg":"Stopping system monitor"}
11
+ {"time":"2025-11-30T13:56:29.045992888Z","level":"INFO","msg":"Stopped system monitor"}
12
+ {"time":"2025-11-30T13:56:30.296345529Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2025-11-30T13:56:30.566218828Z","level":"INFO","msg":"handler: closed","stream_id":"j8dmy8fe"}
14
+ {"time":"2025-11-30T13:56:30.566263739Z","level":"INFO","msg":"writer: Close: closed","stream_id":"j8dmy8fe"}
15
+ {"time":"2025-11-30T13:56:30.573657463Z","level":"INFO","msg":"sender: closed","stream_id":"j8dmy8fe"}
16
+ {"time":"2025-11-30T13:56:30.573732716Z","level":"INFO","msg":"stream: closed","id":"j8dmy8fe"}
wandb/run-20251128_151948-j8dmy8fe/logs/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Configure stats pid to 3738330
3
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Loading settings from /home/agrivas/.config/wandb/settings
4
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Loading settings from /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/settings
5
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/run-20251128_151948-j8dmy8fe/logs/debug.log
7
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/run-20251128_151948-j8dmy8fe/logs/debug-internal.log
8
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:init():761] calling init triggers
9
+ 2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'compile': True, 'device': 'cuda', 'from_checkpoint': None, 'load_mtp_head_from_model': None, 'name': 'nanogpt', 'training': {'random_seed': 13, 'batch_size': 256, 'device_batch_size': 1, 'sequence_length': 8192, 'num_iterations': 900, 'learning_rate': 0.0003, 'use_scheduler': False, 'save_model': True, 'save_optimizer': True, 'save_model_every': 100, 'val_loss_every': 100, 'val_tokens': 4194304, 'expname': 'llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1'}, 'model': {'name': 'mtp', 'beta': 0.0, 'gamma': 1, 'kl_algorithm': 'full', 'kl_type': 'forward', 'model': {'_target_': 'mtp.models.mtp.MultiTokenLM', 'lm': '${lm.model}', 'circuit': '${circuit.model}', 'mt_head_kwargs': '${mt_head.hyperparameters}', 'init_from_lm_head': True, 'kl_type': '${model.kl_type}', 'kl_algorithm': '${model.kl_algorithm}', 'beta': 0, 'gamma': 0.9}}, 'circuit': {'name': 'btree', 'n_token': 8, 'n_component': 32, 'n_repetition': 1, 'model': {'_target_': 'mtp.models.circuits.CircuitModel', 'vocab_size': 268, 'n_token': 8, 'n_component': 32, 'n_repetition': 1, 'kind': 'btree'}}, 'mt_head': {'name': 'linear-evabyte', 'hyperparameters': {'type': 'evabyte', 'n_embd': 3072, 'transformer_n_head': 24, 'transformer_n_layer': 0, 'expander_type': 'linear', 'expander_n_layer': 1, 'freeze_vocab_unembedding': False, 'share_sum_weights': False, 'contextual_hmm_weights': True, 'init_hmm_identity': True}}, 'adaptor': {'name': 'none', 'hyperparameters': None}, 'lm': {'name': 'llama3-2-3b-byte', 'n_embd': 3072, 'n_head': 24, 'model': {'_target_': 'mtp.models.lm.LM', 'lm': None, 'encoder_only': True, 'from_checkpoint': None, 'from_huggingface': 'benjamin/Llama3-2-3B-IT-Byte', 'adaptor_kwargs': None, 'ref_enc': 'model', 'ref_head': 'lm_head', 'freeze': True}}, 'data': {'name': 'tulu3-llama3', 'train_bin': 'agrv/tulu-v3-sft-llama3-packed-seq-len-8192', 'val_bin': None, 'vocab_size': 268}, 'generate': {'speculative': False}, '_wandb': {}}
11
+ 2025-11-28 15:19:48,445 INFO MainThread:3738330 [wandb_init.py:init():784] starting backend
12
+ 2025-11-28 15:19:48,445 INFO MainThread:3738330 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-11-28 15:19:48,469 INFO MainThread:3738330 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-11-28 15:19:48,469 INFO MainThread:3738330 [wandb_init.py:init():798] backend started and connected
15
+ 2025-11-28 15:19:48,474 INFO MainThread:3738330 [wandb_init.py:init():891] updated telemetry
16
+ 2025-11-28 15:19:48,496 INFO MainThread:3738330 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-11-28 15:19:49,156 INFO MainThread:3738330 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-11-28 15:19:49,910 INFO MainThread:3738330 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-11-28 15:19:49,910 INFO MainThread:3738330 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-11-28 15:19:49,915 INFO MainThread:3738330 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-11-28 15:19:49,915 INFO MainThread:3738330 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-11-28 15:19:49,940 INFO MainThread:3738330 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-11-30 13:56:28,347 INFO MsgRouterThr:3738330 [mailbox.py:close():129] Closing mailbox, abandoning 1 handles.
wandb/run-20251128_151948-j8dmy8fe/run-j8dmy8fe.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc008877fb114dd4d807eac90e2568c72b6bae32af4a1f66cd94af23416db3f2
3
+ size 14599238