Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- .hydra/config.yaml +84 -0
- .hydra/hydra.yaml +178 -0
- .hydra/overrides.yaml +16 -0
- config.yaml +86 -0
- model@0.pt +3 -0
- model@100.pt +3 -0
- model@200.pt +3 -0
- model@300.pt +3 -0
- model@400.pt +3 -0
- model@500.pt +3 -0
- model@600.pt +3 -0
- model@700.pt +3 -0
- model@800.pt +3 -0
- model@900.pt +3 -0
- nanogpt.log +916 -0
- wandb/debug-internal.log +16 -0
- wandb/debug.log +23 -0
- wandb/run-20251128_151948-j8dmy8fe/files/config.yaml +199 -0
- wandb/run-20251128_151948-j8dmy8fe/files/output.log +951 -0
- wandb/run-20251128_151948-j8dmy8fe/files/requirements.txt +126 -0
- wandb/run-20251128_151948-j8dmy8fe/files/wandb-metadata.json +102 -0
- wandb/run-20251128_151948-j8dmy8fe/files/wandb-summary.json +1 -0
- wandb/run-20251128_151948-j8dmy8fe/logs/debug-core.log +13 -0
- wandb/run-20251128_151948-j8dmy8fe/logs/debug-internal.log +16 -0
- wandb/run-20251128_151948-j8dmy8fe/logs/debug.log +23 -0
- wandb/run-20251128_151948-j8dmy8fe/run-j8dmy8fe.wandb +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
wandb/run-20251128_151948-j8dmy8fe/run-j8dmy8fe.wandb filter=lfs diff=lfs merge=lfs -text
|
.hydra/config.yaml
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
compile: true
|
| 2 |
+
device: cuda
|
| 3 |
+
from_checkpoint: null
|
| 4 |
+
load_mtp_head_from_model: null
|
| 5 |
+
name: nanogpt
|
| 6 |
+
training:
|
| 7 |
+
random_seed: 13
|
| 8 |
+
batch_size: 256
|
| 9 |
+
device_batch_size: 1
|
| 10 |
+
sequence_length: 8192
|
| 11 |
+
num_iterations: 900
|
| 12 |
+
learning_rate: 0.0003
|
| 13 |
+
use_scheduler: false
|
| 14 |
+
save_model: true
|
| 15 |
+
save_optimizer: true
|
| 16 |
+
save_model_every: 100
|
| 17 |
+
val_loss_every: 100
|
| 18 |
+
val_tokens: 4194304
|
| 19 |
+
expname: llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1
|
| 20 |
+
model:
|
| 21 |
+
name: mtp
|
| 22 |
+
beta: 0.0
|
| 23 |
+
gamma: 1
|
| 24 |
+
kl_algorithm: full
|
| 25 |
+
kl_type: forward
|
| 26 |
+
model:
|
| 27 |
+
_target_: mtp.models.mtp.MultiTokenLM
|
| 28 |
+
lm: ${lm.model}
|
| 29 |
+
circuit: ${circuit.model}
|
| 30 |
+
mt_head_kwargs: ${mt_head.hyperparameters}
|
| 31 |
+
init_from_lm_head: true
|
| 32 |
+
kl_type: ${model.kl_type}
|
| 33 |
+
kl_algorithm: ${model.kl_algorithm}
|
| 34 |
+
beta: 0
|
| 35 |
+
gamma: 0.9
|
| 36 |
+
circuit:
|
| 37 |
+
name: btree
|
| 38 |
+
n_token: 8
|
| 39 |
+
n_component: 32
|
| 40 |
+
n_repetition: 1
|
| 41 |
+
model:
|
| 42 |
+
_target_: mtp.models.circuits.CircuitModel
|
| 43 |
+
vocab_size: ${data.vocab_size}
|
| 44 |
+
n_token: ${circuit.n_token}
|
| 45 |
+
n_component: ${circuit.n_component}
|
| 46 |
+
n_repetition: ${circuit.n_repetition}
|
| 47 |
+
kind: btree
|
| 48 |
+
mt_head:
|
| 49 |
+
name: linear-evabyte
|
| 50 |
+
hyperparameters:
|
| 51 |
+
type: evabyte
|
| 52 |
+
n_embd: ${lm.n_embd}
|
| 53 |
+
transformer_n_head: ${lm.n_head}
|
| 54 |
+
transformer_n_layer: 0
|
| 55 |
+
expander_type: linear
|
| 56 |
+
expander_n_layer: 1
|
| 57 |
+
freeze_vocab_unembedding: false
|
| 58 |
+
share_sum_weights: false
|
| 59 |
+
contextual_hmm_weights: true
|
| 60 |
+
init_hmm_identity: true
|
| 61 |
+
adaptor:
|
| 62 |
+
name: none
|
| 63 |
+
hyperparameters: null
|
| 64 |
+
lm:
|
| 65 |
+
name: llama3-2-3b-byte
|
| 66 |
+
n_embd: 3072
|
| 67 |
+
n_head: 24
|
| 68 |
+
model:
|
| 69 |
+
_target_: mtp.models.lm.LM
|
| 70 |
+
lm: null
|
| 71 |
+
encoder_only: true
|
| 72 |
+
from_checkpoint: null
|
| 73 |
+
from_huggingface: benjamin/Llama3-2-3B-IT-Byte
|
| 74 |
+
adaptor_kwargs: ${adaptor.hyperparameters}
|
| 75 |
+
ref_enc: model
|
| 76 |
+
ref_head: lm_head
|
| 77 |
+
freeze: true
|
| 78 |
+
data:
|
| 79 |
+
name: tulu3-llama3
|
| 80 |
+
train_bin: agrv/tulu-v3-sft-llama3-packed-seq-len-8192
|
| 81 |
+
val_bin: null
|
| 82 |
+
vocab_size: 268
|
| 83 |
+
generate:
|
| 84 |
+
speculative: false
|
.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
formatters:
|
| 89 |
+
simple:
|
| 90 |
+
format: '[%(asctime)s] - %(message)s'
|
| 91 |
+
handlers:
|
| 92 |
+
console:
|
| 93 |
+
class: logging.StreamHandler
|
| 94 |
+
formatter: simple
|
| 95 |
+
stream: ext://sys.stdout
|
| 96 |
+
file:
|
| 97 |
+
class: logging.FileHandler
|
| 98 |
+
formatter: simple
|
| 99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
| 100 |
+
root:
|
| 101 |
+
level: INFO
|
| 102 |
+
handlers:
|
| 103 |
+
- console
|
| 104 |
+
- file
|
| 105 |
+
disable_existing_loggers: false
|
| 106 |
+
env: {}
|
| 107 |
+
mode: RUN
|
| 108 |
+
searchpath: []
|
| 109 |
+
callbacks: {}
|
| 110 |
+
output_subdir: .hydra
|
| 111 |
+
overrides:
|
| 112 |
+
hydra:
|
| 113 |
+
- hydra.mode=RUN
|
| 114 |
+
task:
|
| 115 |
+
- data=tulu3-llama3-packed
|
| 116 |
+
- training=tulu3-evabyte-1epoch
|
| 117 |
+
- lm=llama3-2-3b-byte
|
| 118 |
+
- model=mtp
|
| 119 |
+
- adaptor=none
|
| 120 |
+
- mt_head=linear-evabyte
|
| 121 |
+
- circuit=btree
|
| 122 |
+
- circuit.n_token=8
|
| 123 |
+
- circuit.n_component=32
|
| 124 |
+
- circuit.n_repetition=1
|
| 125 |
+
- training.device_batch_size=1
|
| 126 |
+
- model.model.beta=0
|
| 127 |
+
- model.model.gamma=0.9
|
| 128 |
+
- data.val_bin=null
|
| 129 |
+
- training.learning_rate=0.0003
|
| 130 |
+
- training.expname=llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1
|
| 131 |
+
job:
|
| 132 |
+
name: ${name}
|
| 133 |
+
chdir: true
|
| 134 |
+
override_dirname: adaptor=none,circuit.n_component=32,circuit.n_repetition=1,circuit.n_token=8,circuit=btree,data.val_bin=null,data=tulu3-llama3-packed,lm=llama3-2-3b-byte,model.model.beta=0,model.model.gamma=0.9,model=mtp,mt_head=linear-evabyte,training.device_batch_size=1,training.expname=llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1,training.learning_rate=0.0003,training=tulu3-evabyte-1epoch
|
| 135 |
+
id: ???
|
| 136 |
+
num: ???
|
| 137 |
+
config_name: config
|
| 138 |
+
env_set: {}
|
| 139 |
+
env_copy: []
|
| 140 |
+
config:
|
| 141 |
+
override_dirname:
|
| 142 |
+
kv_sep: '='
|
| 143 |
+
item_sep: ','
|
| 144 |
+
exclude_keys: []
|
| 145 |
+
runtime:
|
| 146 |
+
version: 1.3.2
|
| 147 |
+
version_base: '1.3'
|
| 148 |
+
cwd: /disk/scratch/agrivas/nanoGPT
|
| 149 |
+
config_sources:
|
| 150 |
+
- path: hydra.conf
|
| 151 |
+
schema: pkg
|
| 152 |
+
provider: hydra
|
| 153 |
+
- path: /disk/scratch/agrivas/nanoGPT/configs
|
| 154 |
+
schema: file
|
| 155 |
+
provider: main
|
| 156 |
+
- path: ''
|
| 157 |
+
schema: structured
|
| 158 |
+
provider: schema
|
| 159 |
+
output_dir: /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34
|
| 160 |
+
choices:
|
| 161 |
+
generate: default
|
| 162 |
+
data: tulu3-llama3-packed
|
| 163 |
+
lm: llama3-2-3b-byte
|
| 164 |
+
adaptor: none
|
| 165 |
+
mt_head: linear-evabyte
|
| 166 |
+
circuit: btree
|
| 167 |
+
model: mtp
|
| 168 |
+
training: tulu3-evabyte-1epoch
|
| 169 |
+
hydra/env: default
|
| 170 |
+
hydra/callbacks: null
|
| 171 |
+
hydra/job_logging: default
|
| 172 |
+
hydra/hydra_logging: default
|
| 173 |
+
hydra/hydra_help: default
|
| 174 |
+
hydra/help: default
|
| 175 |
+
hydra/sweeper: basic
|
| 176 |
+
hydra/launcher: basic
|
| 177 |
+
hydra/output: default
|
| 178 |
+
verbose: false
|
.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- data=tulu3-llama3-packed
|
| 2 |
+
- training=tulu3-evabyte-1epoch
|
| 3 |
+
- lm=llama3-2-3b-byte
|
| 4 |
+
- model=mtp
|
| 5 |
+
- adaptor=none
|
| 6 |
+
- mt_head=linear-evabyte
|
| 7 |
+
- circuit=btree
|
| 8 |
+
- circuit.n_token=8
|
| 9 |
+
- circuit.n_component=32
|
| 10 |
+
- circuit.n_repetition=1
|
| 11 |
+
- training.device_batch_size=1
|
| 12 |
+
- model.model.beta=0
|
| 13 |
+
- model.model.gamma=0.9
|
| 14 |
+
- data.val_bin=null
|
| 15 |
+
- training.learning_rate=0.0003
|
| 16 |
+
- training.expname=llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1
|
config.yaml
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
compile: true
|
| 2 |
+
device: cuda
|
| 3 |
+
from_checkpoint: null
|
| 4 |
+
load_mtp_head_from_model: null
|
| 5 |
+
name: nanogpt
|
| 6 |
+
training:
|
| 7 |
+
random_seed: 13
|
| 8 |
+
batch_size: 256
|
| 9 |
+
device_batch_size: 1
|
| 10 |
+
sequence_length: 8192
|
| 11 |
+
num_iterations: 900
|
| 12 |
+
learning_rate: 0.0003
|
| 13 |
+
use_scheduler: false
|
| 14 |
+
save_model: true
|
| 15 |
+
save_optimizer: true
|
| 16 |
+
save_model_every: 100
|
| 17 |
+
val_loss_every: 100
|
| 18 |
+
val_tokens: 4194304
|
| 19 |
+
expname: llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1
|
| 20 |
+
model:
|
| 21 |
+
name: mtp
|
| 22 |
+
beta: 0.0
|
| 23 |
+
gamma: 1
|
| 24 |
+
kl_algorithm: full
|
| 25 |
+
kl_type: forward
|
| 26 |
+
model:
|
| 27 |
+
_target_: mtp.models.mtp.MultiTokenLM
|
| 28 |
+
lm: ${lm.model}
|
| 29 |
+
circuit: ${circuit.model}
|
| 30 |
+
mt_head_kwargs: ${mt_head.hyperparameters}
|
| 31 |
+
init_from_lm_head: true
|
| 32 |
+
kl_type: ${model.kl_type}
|
| 33 |
+
kl_algorithm: ${model.kl_algorithm}
|
| 34 |
+
beta: 0
|
| 35 |
+
gamma: 0.9
|
| 36 |
+
circuit:
|
| 37 |
+
name: btree
|
| 38 |
+
n_token: 8
|
| 39 |
+
n_component: 32
|
| 40 |
+
n_repetition: 1
|
| 41 |
+
model:
|
| 42 |
+
_target_: mtp.models.circuits.CircuitModel
|
| 43 |
+
vocab_size: 268
|
| 44 |
+
n_token: 8
|
| 45 |
+
n_component: 32
|
| 46 |
+
n_repetition: 1
|
| 47 |
+
kind: btree
|
| 48 |
+
mt_head:
|
| 49 |
+
name: linear-evabyte
|
| 50 |
+
hyperparameters:
|
| 51 |
+
type: evabyte
|
| 52 |
+
n_embd: 3072
|
| 53 |
+
transformer_n_head: 24
|
| 54 |
+
transformer_n_layer: 0
|
| 55 |
+
expander_type: linear
|
| 56 |
+
expander_n_layer: 1
|
| 57 |
+
freeze_vocab_unembedding: false
|
| 58 |
+
share_sum_weights: false
|
| 59 |
+
contextual_hmm_weights: true
|
| 60 |
+
init_hmm_identity: true
|
| 61 |
+
adaptor:
|
| 62 |
+
name: none
|
| 63 |
+
hyperparameters: null
|
| 64 |
+
lm:
|
| 65 |
+
name: llama3-2-3b-byte
|
| 66 |
+
n_embd: 3072
|
| 67 |
+
n_head: 24
|
| 68 |
+
model:
|
| 69 |
+
_target_: mtp.models.lm.LM
|
| 70 |
+
lm: null
|
| 71 |
+
encoder_only: true
|
| 72 |
+
from_checkpoint: null
|
| 73 |
+
from_huggingface: benjamin/Llama3-2-3B-IT-Byte
|
| 74 |
+
adaptor_kwargs: null
|
| 75 |
+
ref_enc: model
|
| 76 |
+
ref_head: lm_head
|
| 77 |
+
freeze: true
|
| 78 |
+
data:
|
| 79 |
+
name: tulu3-llama3
|
| 80 |
+
train_bin: agrv/tulu-v3-sft-llama3-packed-seq-len-8192
|
| 81 |
+
val_bin: null
|
| 82 |
+
vocab_size: 268
|
| 83 |
+
generate:
|
| 84 |
+
speculative: false
|
| 85 |
+
expname: llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1
|
| 86 |
+
wandb_run_id: j8dmy8fe
|
model@0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a3e5b206d0e1714b7a7de472381e46aa3857906c345e8b28588543d9cbb52222
|
| 3 |
+
size 459529775
|
model@100.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39cc119c551ec1d20a4121d1bb4f5d69254964a902e857c705755c858a5c34e6
|
| 3 |
+
size 1378505919
|
model@200.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:83c17e7e509c748c8746f695c00ca2cdca18b970afbfd141f361224d22058b08
|
| 3 |
+
size 1378505919
|
model@300.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7234dc3f26fa576a96d80fc432ffd04d292163521be1ccdec18d79bbec8cfdeb
|
| 3 |
+
size 1378505919
|
model@400.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c32c516042adb0c08905ce13e25da28a2eb2d00ad351e1e87776d003d0057390
|
| 3 |
+
size 1378505919
|
model@500.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ad295cb91cd532f1dccef90022f38c8836eb0093bb15996b02066d99f39503f
|
| 3 |
+
size 1378505919
|
model@600.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fec1ac6807b92be707d1751ba7c69e4bdb164bdef960fafebd27cfe7a16476cb
|
| 3 |
+
size 1378505919
|
model@700.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f11675d16350c4a3e7f5df171057f935fb4fae4e9ae5dcf2583557c2ad2a4857
|
| 3 |
+
size 1378505919
|
model@800.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cfd994882e26b15914176e7ae26a8870c383765c7249c55c6ae387226297048e
|
| 3 |
+
size 1378505919
|
model@900.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:182fc1b689cf28a6b695f15e729f7e60c55544117836a7c365f61f9925a506a7
|
| 3 |
+
size 1378505919
|
nanogpt.log
ADDED
|
@@ -0,0 +1,916 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2025-11-28 15:19:40,898] - Setting up model... compile=True...
|
| 2 |
+
[2025-11-28 15:19:49,942] - Saving config and checkpoints to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34...
|
| 3 |
+
[2025-11-28 15:19:49,942] - Save model: True...
|
| 4 |
+
[2025-11-28 15:19:49,943] - Save optimizer: True...
|
| 5 |
+
[2025-11-28 15:19:49,950] - Training on agrv/tulu-v3-sft-llama3-packed-seq-len-8192...
|
| 6 |
+
[2025-11-28 15:20:29,738] - Setting num_proc from 20 back to 1 for the valid split to disable multiprocessing as it only contains one shard.
|
| 7 |
+
[2025-11-28 15:20:31,881] - step:0/900 Saving model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@0.pt...
|
| 8 |
+
[2025-11-28 15:24:47,442] - step:1/900 train_loss:4.6532 lr:0.0003000000 time/step:254.93s
|
| 9 |
+
[2025-11-28 15:27:51,454] - step:2/900 train_loss:4.2966 lr:0.0003000000 time/step:184.01s
|
| 10 |
+
[2025-11-28 15:30:57,835] - step:3/900 train_loss:3.9828 lr:0.0003000000 time/step:186.37s
|
| 11 |
+
[2025-11-28 15:34:08,017] - step:4/900 train_loss:3.6910 lr:0.0003000000 time/step:190.16s
|
| 12 |
+
[2025-11-28 15:37:15,451] - step:5/900 train_loss:3.4752 lr:0.0003000000 time/step:187.40s
|
| 13 |
+
[2025-11-28 15:40:23,150] - step:6/900 train_loss:3.3047 lr:0.0003000000 time/step:187.69s
|
| 14 |
+
[2025-11-28 15:43:30,606] - step:7/900 train_loss:3.1140 lr:0.0003000000 time/step:187.45s
|
| 15 |
+
[2025-11-28 15:46:34,854] - step:8/900 train_loss:2.9731 lr:0.0003000000 time/step:184.24s
|
| 16 |
+
[2025-11-28 15:49:37,899] - step:9/900 train_loss:2.8709 lr:0.0003000000 time/step:183.04s
|
| 17 |
+
[2025-11-28 15:52:42,161] - step:10/900 train_loss:2.7582 lr:0.0003000000 time/step:184.25s
|
| 18 |
+
[2025-11-28 15:55:49,659] - step:11/900 train_loss:2.6474 lr:0.0003000000 time/step:187.49s
|
| 19 |
+
[2025-11-28 15:58:56,730] - step:12/900 train_loss:2.5890 lr:0.0003000000 time/step:187.06s
|
| 20 |
+
[2025-11-28 16:02:04,893] - step:13/900 train_loss:2.5418 lr:0.0003000000 time/step:188.16s
|
| 21 |
+
[2025-11-28 16:05:11,642] - step:14/900 train_loss:2.4586 lr:0.0003000000 time/step:186.74s
|
| 22 |
+
[2025-11-28 16:08:18,599] - step:15/900 train_loss:2.3908 lr:0.0003000000 time/step:186.94s
|
| 23 |
+
[2025-11-28 16:11:24,210] - step:16/900 train_loss:2.3323 lr:0.0003000000 time/step:185.60s
|
| 24 |
+
[2025-11-28 16:14:28,421] - step:17/900 train_loss:2.2802 lr:0.0003000000 time/step:184.20s
|
| 25 |
+
[2025-11-28 16:17:31,315] - step:18/900 train_loss:2.2268 lr:0.0003000000 time/step:182.88s
|
| 26 |
+
[2025-11-28 16:20:33,212] - step:19/900 train_loss:2.2212 lr:0.0003000000 time/step:181.88s
|
| 27 |
+
[2025-11-28 16:23:39,339] - step:20/900 train_loss:2.1965 lr:0.0003000000 time/step:186.12s
|
| 28 |
+
[2025-11-28 16:26:46,366] - step:21/900 train_loss:2.1549 lr:0.0003000000 time/step:187.01s
|
| 29 |
+
[2025-11-28 16:29:53,452] - step:22/900 train_loss:2.0844 lr:0.0003000000 time/step:187.07s
|
| 30 |
+
[2025-11-28 16:33:01,636] - step:23/900 train_loss:2.0673 lr:0.0003000000 time/step:188.18s
|
| 31 |
+
[2025-11-28 16:36:09,091] - step:24/900 train_loss:2.0375 lr:0.0003000000 time/step:187.44s
|
| 32 |
+
[2025-11-28 16:39:16,657] - step:25/900 train_loss:2.0299 lr:0.0003000000 time/step:187.55s
|
| 33 |
+
[2025-11-28 16:42:25,561] - step:26/900 train_loss:1.9910 lr:0.0003000000 time/step:188.90s
|
| 34 |
+
[2025-11-28 16:45:33,283] - step:27/900 train_loss:1.9708 lr:0.0003000000 time/step:187.68s
|
| 35 |
+
[2025-11-28 16:48:40,284] - step:28/900 train_loss:1.9105 lr:0.0003000000 time/step:186.98s
|
| 36 |
+
[2025-11-28 16:51:47,205] - step:29/900 train_loss:1.9014 lr:0.0003000000 time/step:186.92s
|
| 37 |
+
[2025-11-28 16:54:56,592] - step:30/900 train_loss:1.8643 lr:0.0003000000 time/step:189.38s
|
| 38 |
+
[2025-11-28 16:58:04,452] - step:31/900 train_loss:1.8593 lr:0.0003000000 time/step:187.84s
|
| 39 |
+
[2025-11-28 17:01:11,681] - step:32/900 train_loss:1.8733 lr:0.0003000000 time/step:187.21s
|
| 40 |
+
[2025-11-28 17:04:19,862] - step:33/900 train_loss:1.7975 lr:0.0003000000 time/step:188.17s
|
| 41 |
+
[2025-11-28 17:07:27,610] - step:34/900 train_loss:1.8307 lr:0.0003000000 time/step:187.74s
|
| 42 |
+
[2025-11-28 17:10:35,249] - step:35/900 train_loss:1.8018 lr:0.0003000000 time/step:187.63s
|
| 43 |
+
[2025-11-28 17:13:46,950] - step:36/900 train_loss:1.8066 lr:0.0003000000 time/step:191.69s
|
| 44 |
+
[2025-11-28 17:16:53,853] - step:37/900 train_loss:1.7636 lr:0.0003000000 time/step:186.82s
|
| 45 |
+
[2025-11-28 17:20:00,571] - step:38/900 train_loss:1.7714 lr:0.0003000000 time/step:186.54s
|
| 46 |
+
[2025-11-28 17:23:13,248] - step:39/900 train_loss:1.7096 lr:0.0003000000 time/step:192.65s
|
| 47 |
+
[2025-11-28 17:26:19,575] - step:40/900 train_loss:1.7411 lr:0.0003000000 time/step:186.29s
|
| 48 |
+
[2025-11-28 17:29:25,904] - step:41/900 train_loss:1.6913 lr:0.0003000000 time/step:186.27s
|
| 49 |
+
[2025-11-28 17:32:44,977] - step:42/900 train_loss:1.7001 lr:0.0003000000 time/step:199.05s
|
| 50 |
+
[2025-11-28 17:35:51,243] - step:43/900 train_loss:1.6629 lr:0.0003000000 time/step:186.21s
|
| 51 |
+
[2025-11-28 17:38:57,339] - step:44/900 train_loss:1.6610 lr:0.0003000000 time/step:185.79s
|
| 52 |
+
[2025-11-28 17:42:05,062] - step:45/900 train_loss:1.6524 lr:0.0003000000 time/step:187.68s
|
| 53 |
+
[2025-11-28 17:45:20,648] - step:46/900 train_loss:1.6555 lr:0.0003000000 time/step:195.50s
|
| 54 |
+
[2025-11-28 17:48:26,366] - step:47/900 train_loss:1.6223 lr:0.0003000000 time/step:185.70s
|
| 55 |
+
[2025-11-28 17:51:34,666] - step:48/900 train_loss:1.6481 lr:0.0003000000 time/step:188.12s
|
| 56 |
+
[2025-11-28 17:54:51,245] - step:49/900 train_loss:1.6112 lr:0.0003000000 time/step:196.52s
|
| 57 |
+
[2025-11-28 17:57:57,507] - step:50/900 train_loss:1.6013 lr:0.0003000000 time/step:186.19s
|
| 58 |
+
[2025-11-28 18:01:05,674] - step:51/900 train_loss:1.5772 lr:0.0003000000 time/step:187.99s
|
| 59 |
+
[2025-11-28 18:04:21,278] - step:52/900 train_loss:1.5660 lr:0.0003000000 time/step:195.58s
|
| 60 |
+
[2025-11-28 18:07:27,447] - step:53/900 train_loss:1.5702 lr:0.0003000000 time/step:186.11s
|
| 61 |
+
[2025-11-28 18:10:33,793] - step:54/900 train_loss:1.5665 lr:0.0003000000 time/step:186.26s
|
| 62 |
+
[2025-11-28 18:13:53,962] - step:55/900 train_loss:1.5804 lr:0.0003000000 time/step:200.15s
|
| 63 |
+
[2025-11-28 18:17:00,673] - step:56/900 train_loss:1.5645 lr:0.0003000000 time/step:186.66s
|
| 64 |
+
[2025-11-28 18:20:06,961] - step:57/900 train_loss:1.5609 lr:0.0003000000 time/step:186.23s
|
| 65 |
+
[2025-11-28 18:23:24,919] - step:58/900 train_loss:1.5356 lr:0.0003000000 time/step:197.90s
|
| 66 |
+
[2025-11-28 18:26:31,137] - step:59/900 train_loss:1.5277 lr:0.0003000000 time/step:186.18s
|
| 67 |
+
[2025-11-28 18:29:37,442] - step:60/900 train_loss:1.5330 lr:0.0003000000 time/step:186.22s
|
| 68 |
+
[2025-11-28 18:32:45,572] - step:61/900 train_loss:1.5127 lr:0.0003000000 time/step:188.07s
|
| 69 |
+
[2025-11-28 18:36:01,349] - step:62/900 train_loss:1.5127 lr:0.0003000000 time/step:195.75s
|
| 70 |
+
[2025-11-28 18:39:08,044] - step:63/900 train_loss:1.5255 lr:0.0003000000 time/step:186.63s
|
| 71 |
+
[2025-11-28 18:42:16,514] - step:64/900 train_loss:1.4881 lr:0.0003000000 time/step:188.39s
|
| 72 |
+
[2025-11-28 18:45:32,575] - step:65/900 train_loss:1.4746 lr:0.0003000000 time/step:196.00s
|
| 73 |
+
[2025-11-28 18:48:39,543] - step:66/900 train_loss:1.5017 lr:0.0003000000 time/step:186.89s
|
| 74 |
+
[2025-11-28 18:51:47,768] - step:67/900 train_loss:1.4805 lr:0.0003000000 time/step:188.07s
|
| 75 |
+
[2025-11-28 18:55:03,564] - step:68/900 train_loss:1.4929 lr:0.0003000000 time/step:195.75s
|
| 76 |
+
[2025-11-28 18:58:10,293] - step:69/900 train_loss:1.4550 lr:0.0003000000 time/step:186.67s
|
| 77 |
+
[2025-11-28 19:01:16,800] - step:70/900 train_loss:1.4532 lr:0.0003000000 time/step:186.44s
|
| 78 |
+
[2025-11-28 19:04:26,999] - step:71/900 train_loss:1.4520 lr:0.0003000000 time/step:190.18s
|
| 79 |
+
[2025-11-28 19:07:33,259] - step:72/900 train_loss:1.4301 lr:0.0003000000 time/step:186.22s
|
| 80 |
+
[2025-11-28 19:10:39,471] - step:73/900 train_loss:1.4337 lr:0.0003000000 time/step:186.20s
|
| 81 |
+
[2025-11-28 19:13:47,822] - step:74/900 train_loss:1.4296 lr:0.0003000000 time/step:188.33s
|
| 82 |
+
[2025-11-28 19:16:53,884] - step:75/900 train_loss:1.4294 lr:0.0003000000 time/step:186.04s
|
| 83 |
+
[2025-11-28 19:19:59,845] - step:76/900 train_loss:1.4367 lr:0.0003000000 time/step:185.94s
|
| 84 |
+
[2025-11-28 19:23:05,617] - step:77/900 train_loss:1.4359 lr:0.0003000000 time/step:185.76s
|
| 85 |
+
[2025-11-28 19:26:13,471] - step:78/900 train_loss:1.3907 lr:0.0003000000 time/step:187.84s
|
| 86 |
+
[2025-11-28 19:29:19,325] - step:79/900 train_loss:1.4074 lr:0.0003000000 time/step:185.83s
|
| 87 |
+
[2025-11-28 19:32:24,915] - step:80/900 train_loss:1.3818 lr:0.0003000000 time/step:185.57s
|
| 88 |
+
[2025-11-28 19:35:32,821] - step:81/900 train_loss:1.3966 lr:0.0003000000 time/step:187.89s
|
| 89 |
+
[2025-11-28 19:38:38,468] - step:82/900 train_loss:1.3767 lr:0.0003000000 time/step:185.62s
|
| 90 |
+
[2025-11-28 19:41:44,296] - step:83/900 train_loss:1.3772 lr:0.0003000000 time/step:185.82s
|
| 91 |
+
[2025-11-28 19:44:52,361] - step:84/900 train_loss:1.3639 lr:0.0003000000 time/step:188.06s
|
| 92 |
+
[2025-11-28 19:47:59,370] - step:85/900 train_loss:1.3910 lr:0.0003000000 time/step:186.99s
|
| 93 |
+
[2025-11-28 19:51:05,447] - step:86/900 train_loss:1.4013 lr:0.0003000000 time/step:186.07s
|
| 94 |
+
[2025-11-28 19:54:13,032] - step:87/900 train_loss:1.3883 lr:0.0003000000 time/step:187.58s
|
| 95 |
+
[2025-11-28 19:57:19,138] - step:88/900 train_loss:1.3712 lr:0.0003000000 time/step:186.09s
|
| 96 |
+
[2025-11-28 20:00:25,142] - step:89/900 train_loss:1.3749 lr:0.0003000000 time/step:185.98s
|
| 97 |
+
[2025-11-28 20:03:30,825] - step:90/900 train_loss:1.3630 lr:0.0003000000 time/step:185.67s
|
| 98 |
+
[2025-11-28 20:06:38,585] - step:91/900 train_loss:1.3713 lr:0.0003000000 time/step:187.75s
|
| 99 |
+
[2025-11-28 20:09:44,867] - step:92/900 train_loss:1.3503 lr:0.0003000000 time/step:186.27s
|
| 100 |
+
[2025-11-28 20:12:50,830] - step:93/900 train_loss:1.3537 lr:0.0003000000 time/step:185.94s
|
| 101 |
+
[2025-11-28 20:15:58,624] - step:94/900 train_loss:1.3468 lr:0.0003000000 time/step:187.79s
|
| 102 |
+
[2025-11-28 20:19:04,543] - step:95/900 train_loss:1.3603 lr:0.0003000000 time/step:185.91s
|
| 103 |
+
[2025-11-28 20:22:10,848] - step:96/900 train_loss:1.3216 lr:0.0003000000 time/step:186.29s
|
| 104 |
+
[2025-11-28 20:25:17,756] - step:97/900 train_loss:1.3276 lr:0.0003000000 time/step:186.90s
|
| 105 |
+
[2025-11-28 20:28:22,895] - step:98/900 train_loss:1.3128 lr:0.0003000000 time/step:185.09s
|
| 106 |
+
[2025-11-28 20:31:28,093] - step:99/900 train_loss:1.3014 lr:0.0003000000 time/step:185.13s
|
| 107 |
+
[2025-11-28 20:34:37,788] - step:100/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@100.pt...
|
| 108 |
+
[2025-11-28 20:34:37,813] - step:100/900 train_loss:1.3411 lr:0.0003000000 time/step:187.79s
|
| 109 |
+
[2025-11-28 20:37:43,371] - step:101/900 train_loss:1.3414 lr:0.0003000000 time/step:185.55s
|
| 110 |
+
[2025-11-28 20:40:49,246] - step:102/900 train_loss:1.3098 lr:0.0003000000 time/step:185.84s
|
| 111 |
+
[2025-11-28 20:43:55,147] - step:103/900 train_loss:1.3077 lr:0.0003000000 time/step:185.90s
|
| 112 |
+
[2025-11-28 20:47:03,589] - step:104/900 train_loss:1.3283 lr:0.0003000000 time/step:188.43s
|
| 113 |
+
[2025-11-28 20:50:09,456] - step:105/900 train_loss:1.3107 lr:0.0003000000 time/step:185.85s
|
| 114 |
+
[2025-11-28 20:53:15,133] - step:106/900 train_loss:1.3116 lr:0.0003000000 time/step:185.65s
|
| 115 |
+
[2025-11-28 20:56:23,079] - step:107/900 train_loss:1.3076 lr:0.0003000000 time/step:187.94s
|
| 116 |
+
[2025-11-28 20:59:29,309] - step:108/900 train_loss:1.2576 lr:0.0003000000 time/step:186.19s
|
| 117 |
+
[2025-11-28 21:02:34,096] - step:109/900 train_loss:1.3163 lr:0.0003000000 time/step:184.77s
|
| 118 |
+
[2025-11-28 21:05:38,534] - step:110/900 train_loss:1.2836 lr:0.0003000000 time/step:184.43s
|
| 119 |
+
[2025-11-28 21:08:41,909] - step:111/900 train_loss:1.2887 lr:0.0003000000 time/step:183.34s
|
| 120 |
+
[2025-11-28 21:11:44,652] - step:112/900 train_loss:1.2900 lr:0.0003000000 time/step:182.72s
|
| 121 |
+
[2025-11-28 21:14:49,050] - step:113/900 train_loss:1.3032 lr:0.0003000000 time/step:184.39s
|
| 122 |
+
[2025-11-28 21:17:51,714] - step:114/900 train_loss:1.2715 lr:0.0003000000 time/step:182.65s
|
| 123 |
+
[2025-11-28 21:20:54,366] - step:115/900 train_loss:1.2553 lr:0.0003000000 time/step:182.64s
|
| 124 |
+
[2025-11-28 21:23:58,585] - step:116/900 train_loss:1.2608 lr:0.0003000000 time/step:184.21s
|
| 125 |
+
[2025-11-28 21:27:05,711] - step:117/900 train_loss:1.2750 lr:0.0003000000 time/step:187.12s
|
| 126 |
+
[2025-11-28 21:30:10,632] - step:118/900 train_loss:1.2610 lr:0.0003000000 time/step:184.91s
|
| 127 |
+
[2025-11-28 21:33:15,980] - step:119/900 train_loss:1.2728 lr:0.0003000000 time/step:185.32s
|
| 128 |
+
[2025-11-28 21:36:22,993] - step:120/900 train_loss:1.2367 lr:0.0003000000 time/step:187.01s
|
| 129 |
+
[2025-11-28 21:39:27,798] - step:121/900 train_loss:1.2436 lr:0.0003000000 time/step:184.79s
|
| 130 |
+
[2025-11-28 21:42:32,716] - step:122/900 train_loss:1.2680 lr:0.0003000000 time/step:184.90s
|
| 131 |
+
[2025-11-28 21:45:39,837] - step:123/900 train_loss:1.2459 lr:0.0003000000 time/step:187.11s
|
| 132 |
+
[2025-11-28 21:48:44,604] - step:124/900 train_loss:1.2356 lr:0.0003000000 time/step:184.76s
|
| 133 |
+
[2025-11-28 21:51:49,462] - step:125/900 train_loss:1.2116 lr:0.0003000000 time/step:184.84s
|
| 134 |
+
[2025-11-28 21:54:56,152] - step:126/900 train_loss:1.2271 lr:0.0003000000 time/step:186.68s
|
| 135 |
+
[2025-11-28 21:58:02,698] - step:127/900 train_loss:1.2747 lr:0.0003000000 time/step:186.53s
|
| 136 |
+
[2025-11-28 22:01:07,919] - step:128/900 train_loss:1.2662 lr:0.0003000000 time/step:185.21s
|
| 137 |
+
[2025-11-28 22:04:13,473] - step:129/900 train_loss:1.2508 lr:0.0003000000 time/step:185.54s
|
| 138 |
+
[2025-11-28 22:07:19,897] - step:130/900 train_loss:1.2417 lr:0.0003000000 time/step:186.41s
|
| 139 |
+
[2025-11-28 22:10:24,163] - step:131/900 train_loss:1.2469 lr:0.0003000000 time/step:184.26s
|
| 140 |
+
[2025-11-28 22:13:29,588] - step:132/900 train_loss:1.2212 lr:0.0003000000 time/step:185.42s
|
| 141 |
+
[2025-11-28 22:16:36,724] - step:133/900 train_loss:1.2154 lr:0.0003000000 time/step:187.11s
|
| 142 |
+
[2025-11-28 22:19:41,361] - step:134/900 train_loss:1.1905 lr:0.0003000000 time/step:184.62s
|
| 143 |
+
[2025-11-28 22:22:46,426] - step:135/900 train_loss:1.2090 lr:0.0003000000 time/step:185.04s
|
| 144 |
+
[2025-11-28 22:25:53,482] - step:136/900 train_loss:1.2180 lr:0.0003000000 time/step:187.04s
|
| 145 |
+
[2025-11-28 22:28:58,396] - step:137/900 train_loss:1.2309 lr:0.0003000000 time/step:184.90s
|
| 146 |
+
[2025-11-28 22:32:02,953] - step:138/900 train_loss:1.2127 lr:0.0003000000 time/step:184.53s
|
| 147 |
+
[2025-11-28 22:35:08,685] - step:139/900 train_loss:1.2126 lr:0.0003000000 time/step:185.71s
|
| 148 |
+
[2025-11-28 22:38:15,825] - step:140/900 train_loss:1.2117 lr:0.0003000000 time/step:187.09s
|
| 149 |
+
[2025-11-28 22:41:20,366] - step:141/900 train_loss:1.2301 lr:0.0003000000 time/step:184.53s
|
| 150 |
+
[2025-11-28 22:44:24,896] - step:142/900 train_loss:1.2388 lr:0.0003000000 time/step:184.52s
|
| 151 |
+
[2025-11-28 22:47:31,624] - step:143/900 train_loss:1.1987 lr:0.0003000000 time/step:186.71s
|
| 152 |
+
[2025-11-28 22:50:37,358] - step:144/900 train_loss:1.2210 lr:0.0003000000 time/step:185.73s
|
| 153 |
+
[2025-11-28 22:53:43,613] - step:145/900 train_loss:1.2170 lr:0.0003000000 time/step:186.22s
|
| 154 |
+
[2025-11-28 22:57:06,629] - step:146/900 train_loss:1.2236 lr:0.0003000000 time/step:203.01s
|
| 155 |
+
[2025-11-28 23:00:09,814] - step:147/900 train_loss:1.2255 lr:0.0003000000 time/step:183.18s
|
| 156 |
+
[2025-11-28 23:03:14,149] - step:148/900 train_loss:1.1806 lr:0.0003000000 time/step:184.31s
|
| 157 |
+
[2025-11-28 23:06:23,397] - step:149/900 train_loss:1.2233 lr:0.0003000000 time/step:189.23s
|
| 158 |
+
[2025-11-28 23:09:30,162] - step:150/900 train_loss:1.1677 lr:0.0003000000 time/step:186.75s
|
| 159 |
+
[2025-11-28 23:12:34,786] - step:151/900 train_loss:1.2155 lr:0.0003000000 time/step:184.59s
|
| 160 |
+
[2025-11-28 23:15:41,431] - step:152/900 train_loss:1.1948 lr:0.0003000000 time/step:186.63s
|
| 161 |
+
[2025-11-28 23:18:47,806] - step:153/900 train_loss:1.1950 lr:0.0003000000 time/step:186.35s
|
| 162 |
+
[2025-11-28 23:21:52,115] - step:154/900 train_loss:1.2133 lr:0.0003000000 time/step:184.28s
|
| 163 |
+
[2025-11-28 23:24:56,981] - step:155/900 train_loss:1.1862 lr:0.0003000000 time/step:184.85s
|
| 164 |
+
[2025-11-28 23:28:03,290] - step:156/900 train_loss:1.1699 lr:0.0003000000 time/step:186.29s
|
| 165 |
+
[2025-11-28 23:31:07,306] - step:157/900 train_loss:1.1773 lr:0.0003000000 time/step:184.00s
|
| 166 |
+
[2025-11-28 23:34:12,414] - step:158/900 train_loss:1.1680 lr:0.0003000000 time/step:185.10s
|
| 167 |
+
[2025-11-28 23:37:19,900] - step:159/900 train_loss:1.1806 lr:0.0003000000 time/step:187.45s
|
| 168 |
+
[2025-11-28 23:40:24,615] - step:160/900 train_loss:1.1865 lr:0.0003000000 time/step:184.70s
|
| 169 |
+
[2025-11-28 23:43:29,245] - step:161/900 train_loss:1.1872 lr:0.0003000000 time/step:184.61s
|
| 170 |
+
[2025-11-28 23:46:36,811] - step:162/900 train_loss:1.1806 lr:0.0003000000 time/step:187.56s
|
| 171 |
+
[2025-11-28 23:49:41,637] - step:163/900 train_loss:1.1750 lr:0.0003000000 time/step:184.79s
|
| 172 |
+
[2025-11-28 23:52:45,829] - step:164/900 train_loss:1.1828 lr:0.0003000000 time/step:184.16s
|
| 173 |
+
[2025-11-28 23:55:50,721] - step:165/900 train_loss:1.1742 lr:0.0003000000 time/step:184.88s
|
| 174 |
+
[2025-11-28 23:58:57,667] - step:166/900 train_loss:1.1655 lr:0.0003000000 time/step:186.93s
|
| 175 |
+
[2025-11-29 00:02:02,656] - step:167/900 train_loss:1.1631 lr:0.0003000000 time/step:184.97s
|
| 176 |
+
[2025-11-29 00:05:08,306] - step:168/900 train_loss:1.1614 lr:0.0003000000 time/step:185.63s
|
| 177 |
+
[2025-11-29 00:08:15,208] - step:169/900 train_loss:1.1613 lr:0.0003000000 time/step:186.89s
|
| 178 |
+
[2025-11-29 00:11:19,829] - step:170/900 train_loss:1.1623 lr:0.0003000000 time/step:184.60s
|
| 179 |
+
[2025-11-29 00:14:25,137] - step:171/900 train_loss:1.1538 lr:0.0003000000 time/step:185.30s
|
| 180 |
+
[2025-11-29 00:17:32,364] - step:172/900 train_loss:1.1782 lr:0.0003000000 time/step:187.22s
|
| 181 |
+
[2025-11-29 00:20:37,216] - step:173/900 train_loss:1.1596 lr:0.0003000000 time/step:184.84s
|
| 182 |
+
[2025-11-29 00:23:42,361] - step:174/900 train_loss:1.1381 lr:0.0003000000 time/step:185.12s
|
| 183 |
+
[2025-11-29 00:26:49,327] - step:175/900 train_loss:1.1305 lr:0.0003000000 time/step:186.96s
|
| 184 |
+
[2025-11-29 00:29:54,460] - step:176/900 train_loss:1.1603 lr:0.0003000000 time/step:185.12s
|
| 185 |
+
[2025-11-29 00:32:59,491] - step:177/900 train_loss:1.1435 lr:0.0003000000 time/step:185.01s
|
| 186 |
+
[2025-11-29 00:36:04,756] - step:178/900 train_loss:1.1653 lr:0.0003000000 time/step:185.25s
|
| 187 |
+
[2025-11-29 00:39:11,804] - step:179/900 train_loss:1.1443 lr:0.0003000000 time/step:187.04s
|
| 188 |
+
[2025-11-29 00:42:16,834] - step:180/900 train_loss:1.1554 lr:0.0003000000 time/step:185.01s
|
| 189 |
+
[2025-11-29 00:45:22,795] - step:181/900 train_loss:1.1495 lr:0.0003000000 time/step:185.95s
|
| 190 |
+
[2025-11-29 00:48:30,739] - step:182/900 train_loss:1.1251 lr:0.0003000000 time/step:187.94s
|
| 191 |
+
[2025-11-29 00:51:34,795] - step:183/900 train_loss:1.1323 lr:0.0003000000 time/step:184.04s
|
| 192 |
+
[2025-11-29 00:54:39,599] - step:184/900 train_loss:1.1293 lr:0.0003000000 time/step:184.80s
|
| 193 |
+
[2025-11-29 00:57:45,600] - step:185/900 train_loss:1.1500 lr:0.0003000000 time/step:185.99s
|
| 194 |
+
[2025-11-29 01:00:49,413] - step:186/900 train_loss:1.1429 lr:0.0003000000 time/step:183.79s
|
| 195 |
+
[2025-11-29 01:03:54,362] - step:187/900 train_loss:1.1384 lr:0.0003000000 time/step:184.93s
|
| 196 |
+
[2025-11-29 01:07:01,673] - step:188/900 train_loss:1.1665 lr:0.0003000000 time/step:187.31s
|
| 197 |
+
[2025-11-29 01:10:06,793] - step:189/900 train_loss:1.1470 lr:0.0003000000 time/step:185.10s
|
| 198 |
+
[2025-11-29 01:13:11,822] - step:190/900 train_loss:1.1562 lr:0.0003000000 time/step:185.00s
|
| 199 |
+
[2025-11-29 01:16:16,209] - step:191/900 train_loss:1.1811 lr:0.0003000000 time/step:184.37s
|
| 200 |
+
[2025-11-29 01:19:22,340] - step:192/900 train_loss:1.1471 lr:0.0003000000 time/step:186.13s
|
| 201 |
+
[2025-11-29 01:22:26,519] - step:193/900 train_loss:1.1428 lr:0.0003000000 time/step:184.15s
|
| 202 |
+
[2025-11-29 01:25:31,429] - step:194/900 train_loss:1.1208 lr:0.0003000000 time/step:184.89s
|
| 203 |
+
[2025-11-29 01:28:36,974] - step:195/900 train_loss:1.1308 lr:0.0003000000 time/step:185.54s
|
| 204 |
+
[2025-11-29 01:31:40,544] - step:196/900 train_loss:1.1228 lr:0.0003000000 time/step:183.54s
|
| 205 |
+
[2025-11-29 01:34:45,938] - step:197/900 train_loss:1.1161 lr:0.0003000000 time/step:185.38s
|
| 206 |
+
[2025-11-29 01:37:53,156] - step:198/900 train_loss:1.1478 lr:0.0003000000 time/step:187.21s
|
| 207 |
+
[2025-11-29 01:40:58,171] - step:199/900 train_loss:1.1103 lr:0.0003000000 time/step:184.99s
|
| 208 |
+
[2025-11-29 01:44:05,489] - step:200/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@200.pt...
|
| 209 |
+
[2025-11-29 01:44:05,525] - step:200/900 train_loss:1.1274 lr:0.0003000000 time/step:185.55s
|
| 210 |
+
[2025-11-29 01:47:14,488] - step:201/900 train_loss:1.1234 lr:0.0003000000 time/step:188.94s
|
| 211 |
+
[2025-11-29 01:50:21,411] - step:202/900 train_loss:1.1199 lr:0.0003000000 time/step:186.91s
|
| 212 |
+
[2025-11-29 01:53:26,628] - step:203/900 train_loss:1.0972 lr:0.0003000000 time/step:185.20s
|
| 213 |
+
[2025-11-29 01:56:32,570] - step:204/900 train_loss:1.1371 lr:0.0003000000 time/step:185.92s
|
| 214 |
+
[2025-11-29 01:59:41,140] - step:205/900 train_loss:1.1408 lr:0.0003000000 time/step:188.56s
|
| 215 |
+
[2025-11-29 02:02:45,379] - step:206/900 train_loss:1.0997 lr:0.0003000000 time/step:184.22s
|
| 216 |
+
[2025-11-29 02:05:50,066] - step:207/900 train_loss:1.1332 lr:0.0003000000 time/step:184.67s
|
| 217 |
+
[2025-11-29 02:08:56,311] - step:208/900 train_loss:1.1209 lr:0.0003000000 time/step:186.24s
|
| 218 |
+
[2025-11-29 02:12:00,567] - step:209/900 train_loss:1.0919 lr:0.0003000000 time/step:184.22s
|
| 219 |
+
[2025-11-29 02:15:04,792] - step:210/900 train_loss:1.1005 lr:0.0003000000 time/step:184.22s
|
| 220 |
+
[2025-11-29 02:18:10,654] - step:211/900 train_loss:1.1036 lr:0.0003000000 time/step:185.86s
|
| 221 |
+
[2025-11-29 02:21:14,585] - step:212/900 train_loss:1.1229 lr:0.0003000000 time/step:183.92s
|
| 222 |
+
[2025-11-29 02:24:19,368] - step:213/900 train_loss:1.1051 lr:0.0003000000 time/step:184.77s
|
| 223 |
+
[2025-11-29 02:27:26,145] - step:214/900 train_loss:1.1085 lr:0.0003000000 time/step:186.77s
|
| 224 |
+
[2025-11-29 02:30:30,712] - step:215/900 train_loss:1.0930 lr:0.0003000000 time/step:184.56s
|
| 225 |
+
[2025-11-29 02:33:34,774] - step:216/900 train_loss:1.0977 lr:0.0003000000 time/step:184.05s
|
| 226 |
+
[2025-11-29 02:36:40,292] - step:217/900 train_loss:1.1187 lr:0.0003000000 time/step:185.51s
|
| 227 |
+
[2025-11-29 02:39:49,043] - step:218/900 train_loss:1.0909 lr:0.0003000000 time/step:188.73s
|
| 228 |
+
[2025-11-29 02:42:54,991] - step:219/900 train_loss:1.1056 lr:0.0003000000 time/step:185.90s
|
| 229 |
+
[2025-11-29 02:46:00,394] - step:220/900 train_loss:1.1048 lr:0.0003000000 time/step:185.40s
|
| 230 |
+
[2025-11-29 02:49:07,579] - step:221/900 train_loss:1.1078 lr:0.0003000000 time/step:187.17s
|
| 231 |
+
[2025-11-29 02:52:12,146] - step:222/900 train_loss:1.1114 lr:0.0003000000 time/step:184.54s
|
| 232 |
+
[2025-11-29 02:55:16,480] - step:223/900 train_loss:1.1062 lr:0.0003000000 time/step:184.32s
|
| 233 |
+
[2025-11-29 02:58:22,768] - step:224/900 train_loss:1.1142 lr:0.0003000000 time/step:186.28s
|
| 234 |
+
[2025-11-29 03:01:26,953] - step:225/900 train_loss:1.0961 lr:0.0003000000 time/step:184.17s
|
| 235 |
+
[2025-11-29 03:04:31,749] - step:226/900 train_loss:1.0917 lr:0.0003000000 time/step:184.78s
|
| 236 |
+
[2025-11-29 03:07:38,529] - step:227/900 train_loss:1.0934 lr:0.0003000000 time/step:186.77s
|
| 237 |
+
[2025-11-29 03:10:43,271] - step:228/900 train_loss:1.1069 lr:0.0003000000 time/step:184.70s
|
| 238 |
+
[2025-11-29 03:13:48,167] - step:229/900 train_loss:1.0734 lr:0.0003000000 time/step:184.88s
|
| 239 |
+
[2025-11-29 03:16:52,812] - step:230/900 train_loss:1.0957 lr:0.0003000000 time/step:184.63s
|
| 240 |
+
[2025-11-29 03:19:58,801] - step:231/900 train_loss:1.0775 lr:0.0003000000 time/step:185.98s
|
| 241 |
+
[2025-11-29 03:23:02,987] - step:232/900 train_loss:1.0926 lr:0.0003000000 time/step:184.16s
|
| 242 |
+
[2025-11-29 03:26:08,180] - step:233/900 train_loss:1.1314 lr:0.0003000000 time/step:185.19s
|
| 243 |
+
[2025-11-29 03:29:14,462] - step:234/900 train_loss:1.0868 lr:0.0003000000 time/step:186.28s
|
| 244 |
+
[2025-11-29 03:32:19,081] - step:235/900 train_loss:1.0808 lr:0.0003000000 time/step:184.59s
|
| 245 |
+
[2025-11-29 03:35:24,243] - step:236/900 train_loss:1.0749 lr:0.0003000000 time/step:185.16s
|
| 246 |
+
[2025-11-29 03:38:31,254] - step:237/900 train_loss:1.1269 lr:0.0003000000 time/step:187.01s
|
| 247 |
+
[2025-11-29 03:41:35,966] - step:238/900 train_loss:1.0924 lr:0.0003000000 time/step:184.69s
|
| 248 |
+
[2025-11-29 03:44:41,260] - step:239/900 train_loss:1.0906 lr:0.0003000000 time/step:185.27s
|
| 249 |
+
[2025-11-29 03:47:49,206] - step:240/900 train_loss:1.0918 lr:0.0003000000 time/step:187.94s
|
| 250 |
+
[2025-11-29 03:50:54,694] - step:241/900 train_loss:1.0946 lr:0.0003000000 time/step:185.46s
|
| 251 |
+
[2025-11-29 03:53:59,535] - step:242/900 train_loss:1.1074 lr:0.0003000000 time/step:184.80s
|
| 252 |
+
[2025-11-29 03:57:04,220] - step:243/900 train_loss:1.0943 lr:0.0003000000 time/step:184.67s
|
| 253 |
+
[2025-11-29 04:00:10,432] - step:244/900 train_loss:1.0711 lr:0.0003000000 time/step:186.21s
|
| 254 |
+
[2025-11-29 04:03:15,729] - step:245/900 train_loss:1.1061 lr:0.0003000000 time/step:185.26s
|
| 255 |
+
[2025-11-29 04:06:20,984] - step:246/900 train_loss:1.0789 lr:0.0003000000 time/step:185.24s
|
| 256 |
+
[2025-11-29 04:09:27,749] - step:247/900 train_loss:1.0778 lr:0.0003000000 time/step:186.76s
|
| 257 |
+
[2025-11-29 04:12:33,149] - step:248/900 train_loss:1.0830 lr:0.0003000000 time/step:185.36s
|
| 258 |
+
[2025-11-29 04:15:37,995] - step:249/900 train_loss:1.0921 lr:0.0003000000 time/step:184.84s
|
| 259 |
+
[2025-11-29 04:18:44,391] - step:250/900 train_loss:1.0980 lr:0.0003000000 time/step:186.39s
|
| 260 |
+
[2025-11-29 04:21:49,000] - step:251/900 train_loss:1.0761 lr:0.0003000000 time/step:184.59s
|
| 261 |
+
[2025-11-29 04:24:54,274] - step:252/900 train_loss:1.0901 lr:0.0003000000 time/step:185.25s
|
| 262 |
+
[2025-11-29 04:28:02,058] - step:253/900 train_loss:1.0735 lr:0.0003000000 time/step:187.78s
|
| 263 |
+
[2025-11-29 04:31:09,878] - step:254/900 train_loss:1.0600 lr:0.0003000000 time/step:187.80s
|
| 264 |
+
[2025-11-29 04:34:17,142] - step:255/900 train_loss:1.0544 lr:0.0003000000 time/step:187.23s
|
| 265 |
+
[2025-11-29 04:37:23,181] - step:256/900 train_loss:1.0961 lr:0.0003000000 time/step:186.03s
|
| 266 |
+
[2025-11-29 04:40:31,175] - step:257/900 train_loss:1.0838 lr:0.0003000000 time/step:187.99s
|
| 267 |
+
[2025-11-29 04:43:37,155] - step:258/900 train_loss:1.1142 lr:0.0003000000 time/step:185.74s
|
| 268 |
+
[2025-11-29 04:46:41,531] - step:259/900 train_loss:1.0784 lr:0.0003000000 time/step:184.36s
|
| 269 |
+
[2025-11-29 04:49:47,139] - step:260/900 train_loss:1.0548 lr:0.0003000000 time/step:185.61s
|
| 270 |
+
[2025-11-29 04:52:51,373] - step:261/900 train_loss:1.0670 lr:0.0003000000 time/step:184.18s
|
| 271 |
+
[2025-11-29 04:55:56,540] - step:262/900 train_loss:1.0790 lr:0.0003000000 time/step:185.16s
|
| 272 |
+
[2025-11-29 04:59:03,662] - step:263/900 train_loss:1.0758 lr:0.0003000000 time/step:187.12s
|
| 273 |
+
[2025-11-29 05:02:08,811] - step:264/900 train_loss:1.0945 lr:0.0003000000 time/step:185.14s
|
| 274 |
+
[2025-11-29 05:05:13,852] - step:265/900 train_loss:1.0733 lr:0.0003000000 time/step:185.03s
|
| 275 |
+
[2025-11-29 05:08:20,825] - step:266/900 train_loss:1.0854 lr:0.0003000000 time/step:186.97s
|
| 276 |
+
[2025-11-29 05:11:25,639] - step:267/900 train_loss:1.0816 lr:0.0003000000 time/step:184.80s
|
| 277 |
+
[2025-11-29 05:14:31,022] - step:268/900 train_loss:1.0670 lr:0.0003000000 time/step:185.35s
|
| 278 |
+
[2025-11-29 05:17:35,585] - step:269/900 train_loss:1.0892 lr:0.0003000000 time/step:184.33s
|
| 279 |
+
[2025-11-29 05:20:42,015] - step:270/900 train_loss:1.0245 lr:0.0003000000 time/step:186.43s
|
| 280 |
+
[2025-11-29 05:23:46,422] - step:271/900 train_loss:1.0735 lr:0.0003000000 time/step:184.37s
|
| 281 |
+
[2025-11-29 05:26:50,452] - step:272/900 train_loss:1.0714 lr:0.0003000000 time/step:184.01s
|
| 282 |
+
[2025-11-29 05:29:56,149] - step:273/900 train_loss:1.0769 lr:0.0003000000 time/step:185.68s
|
| 283 |
+
[2025-11-29 05:32:59,582] - step:274/900 train_loss:1.0265 lr:0.0003000000 time/step:183.40s
|
| 284 |
+
[2025-11-29 05:36:04,909] - step:275/900 train_loss:1.0510 lr:0.0003000000 time/step:185.31s
|
| 285 |
+
[2025-11-29 05:39:12,005] - step:276/900 train_loss:1.0753 lr:0.0003000000 time/step:187.07s
|
| 286 |
+
[2025-11-29 05:42:16,993] - step:277/900 train_loss:1.0582 lr:0.0003000000 time/step:184.93s
|
| 287 |
+
[2025-11-29 05:45:22,003] - step:278/900 train_loss:1.0717 lr:0.0003000000 time/step:185.00s
|
| 288 |
+
[2025-11-29 05:48:28,179] - step:279/900 train_loss:1.0676 lr:0.0003000000 time/step:186.16s
|
| 289 |
+
[2025-11-29 05:51:33,621] - step:280/900 train_loss:1.0595 lr:0.0003000000 time/step:185.43s
|
| 290 |
+
[2025-11-29 05:54:38,325] - step:281/900 train_loss:1.0585 lr:0.0003000000 time/step:184.68s
|
| 291 |
+
[2025-11-29 05:57:43,757] - step:282/900 train_loss:1.0949 lr:0.0003000000 time/step:185.43s
|
| 292 |
+
[2025-11-29 06:00:50,769] - step:283/900 train_loss:1.0682 lr:0.0003000000 time/step:187.01s
|
| 293 |
+
[2025-11-29 06:03:55,483] - step:284/900 train_loss:1.0756 lr:0.0003000000 time/step:184.69s
|
| 294 |
+
[2025-11-29 06:07:00,263] - step:285/900 train_loss:1.0693 lr:0.0003000000 time/step:184.77s
|
| 295 |
+
[2025-11-29 06:10:07,073] - step:286/900 train_loss:1.0734 lr:0.0003000000 time/step:186.81s
|
| 296 |
+
[2025-11-29 06:13:12,527] - step:287/900 train_loss:1.0729 lr:0.0003000000 time/step:185.42s
|
| 297 |
+
[2025-11-29 06:16:17,678] - step:288/900 train_loss:1.0483 lr:0.0003000000 time/step:185.12s
|
| 298 |
+
[2025-11-29 06:19:24,289] - step:289/900 train_loss:1.0590 lr:0.0003000000 time/step:186.60s
|
| 299 |
+
[2025-11-29 06:22:30,122] - step:290/900 train_loss:1.0687 lr:0.0003000000 time/step:185.81s
|
| 300 |
+
[2025-11-29 06:25:35,642] - step:291/900 train_loss:1.0612 lr:0.0003000000 time/step:185.50s
|
| 301 |
+
[2025-11-29 06:28:42,491] - step:292/900 train_loss:1.0357 lr:0.0003000000 time/step:186.85s
|
| 302 |
+
[2025-11-29 06:31:49,725] - step:293/900 train_loss:1.0708 lr:0.0003000000 time/step:187.22s
|
| 303 |
+
[2025-11-29 06:34:55,796] - step:294/900 train_loss:1.0707 lr:0.0003000000 time/step:186.05s
|
| 304 |
+
[2025-11-29 06:38:00,778] - step:295/900 train_loss:1.0776 lr:0.0003000000 time/step:184.98s
|
| 305 |
+
[2025-11-29 06:41:07,189] - step:296/900 train_loss:1.0576 lr:0.0003000000 time/step:186.41s
|
| 306 |
+
[2025-11-29 06:44:11,733] - step:297/900 train_loss:1.0260 lr:0.0003000000 time/step:184.49s
|
| 307 |
+
[2025-11-29 06:47:15,871] - step:298/900 train_loss:1.0749 lr:0.0003000000 time/step:184.12s
|
| 308 |
+
[2025-11-29 06:50:21,808] - step:299/900 train_loss:1.0567 lr:0.0003000000 time/step:185.93s
|
| 309 |
+
[2025-11-29 06:53:27,979] - step:300/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@300.pt...
|
| 310 |
+
[2025-11-29 06:53:27,992] - step:300/900 train_loss:1.0667 lr:0.0003000000 time/step:184.41s
|
| 311 |
+
[2025-11-29 06:56:33,223] - step:301/900 train_loss:1.0596 lr:0.0003000000 time/step:185.22s
|
| 312 |
+
[2025-11-29 06:59:39,570] - step:302/900 train_loss:1.0485 lr:0.0003000000 time/step:186.34s
|
| 313 |
+
[2025-11-29 07:02:44,495] - step:303/900 train_loss:1.0444 lr:0.0003000000 time/step:184.92s
|
| 314 |
+
[2025-11-29 07:05:48,939] - step:304/900 train_loss:1.0787 lr:0.0003000000 time/step:184.42s
|
| 315 |
+
[2025-11-29 07:08:55,065] - step:305/900 train_loss:1.0548 lr:0.0003000000 time/step:186.12s
|
| 316 |
+
[2025-11-29 07:12:03,934] - step:306/900 train_loss:1.0604 lr:0.0003000000 time/step:188.86s
|
| 317 |
+
[2025-11-29 07:15:10,343] - step:307/900 train_loss:1.0368 lr:0.0003000000 time/step:186.37s
|
| 318 |
+
[2025-11-29 07:18:15,509] - step:308/900 train_loss:1.0500 lr:0.0003000000 time/step:185.15s
|
| 319 |
+
[2025-11-29 07:21:22,280] - step:309/900 train_loss:1.0519 lr:0.0003000000 time/step:186.76s
|
| 320 |
+
[2025-11-29 07:24:27,690] - step:310/900 train_loss:1.0396 lr:0.0003000000 time/step:185.39s
|
| 321 |
+
[2025-11-29 07:27:32,558] - step:311/900 train_loss:1.0199 lr:0.0003000000 time/step:184.86s
|
| 322 |
+
[2025-11-29 07:30:39,398] - step:312/900 train_loss:1.0318 lr:0.0003000000 time/step:186.83s
|
| 323 |
+
[2025-11-29 07:33:43,716] - step:313/900 train_loss:1.0245 lr:0.0003000000 time/step:184.27s
|
| 324 |
+
[2025-11-29 07:36:48,934] - step:314/900 train_loss:1.0550 lr:0.0003000000 time/step:185.21s
|
| 325 |
+
[2025-11-29 07:39:55,818] - step:315/900 train_loss:1.0384 lr:0.0003000000 time/step:186.88s
|
| 326 |
+
[2025-11-29 07:43:00,630] - step:316/900 train_loss:1.0352 lr:0.0003000000 time/step:184.79s
|
| 327 |
+
[2025-11-29 07:46:05,765] - step:317/900 train_loss:1.0406 lr:0.0003000000 time/step:185.12s
|
| 328 |
+
[2025-11-29 07:49:10,421] - step:318/900 train_loss:1.0438 lr:0.0003000000 time/step:184.64s
|
| 329 |
+
[2025-11-29 07:52:16,687] - step:319/900 train_loss:1.0463 lr:0.0003000000 time/step:186.26s
|
| 330 |
+
[2025-11-29 07:55:21,390] - step:320/900 train_loss:1.0608 lr:0.0003000000 time/step:184.68s
|
| 331 |
+
[2025-11-29 07:58:26,121] - step:321/900 train_loss:1.0704 lr:0.0003000000 time/step:184.70s
|
| 332 |
+
[2025-11-29 08:01:33,062] - step:322/900 train_loss:1.0459 lr:0.0003000000 time/step:186.94s
|
| 333 |
+
[2025-11-29 08:04:39,120] - step:323/900 train_loss:1.0463 lr:0.0003000000 time/step:185.86s
|
| 334 |
+
[2025-11-29 08:07:44,864] - step:324/900 train_loss:1.0497 lr:0.0003000000 time/step:185.73s
|
| 335 |
+
[2025-11-29 08:10:51,704] - step:325/900 train_loss:1.0295 lr:0.0003000000 time/step:186.82s
|
| 336 |
+
[2025-11-29 08:13:56,466] - step:326/900 train_loss:1.0555 lr:0.0003000000 time/step:184.73s
|
| 337 |
+
[2025-11-29 08:17:01,957] - step:327/900 train_loss:1.0380 lr:0.0003000000 time/step:185.49s
|
| 338 |
+
[2025-11-29 08:20:08,723] - step:328/900 train_loss:1.0256 lr:0.0003000000 time/step:186.75s
|
| 339 |
+
[2025-11-29 08:23:14,378] - step:329/900 train_loss:1.0418 lr:0.0003000000 time/step:185.64s
|
| 340 |
+
[2025-11-29 08:26:20,417] - step:330/900 train_loss:1.0660 lr:0.0003000000 time/step:186.01s
|
| 341 |
+
[2025-11-29 08:29:26,557] - step:331/900 train_loss:1.0481 lr:0.0003000000 time/step:186.12s
|
| 342 |
+
[2025-11-29 08:32:43,853] - step:332/900 train_loss:1.0370 lr:0.0003000000 time/step:197.25s
|
| 343 |
+
[2025-11-29 08:35:47,902] - step:333/900 train_loss:1.0556 lr:0.0003000000 time/step:184.02s
|
| 344 |
+
[2025-11-29 08:38:52,834] - step:334/900 train_loss:1.0512 lr:0.0003000000 time/step:184.93s
|
| 345 |
+
[2025-11-29 08:41:58,814] - step:335/900 train_loss:1.0432 lr:0.0003000000 time/step:185.95s
|
| 346 |
+
[2025-11-29 08:45:02,340] - step:336/900 train_loss:1.0165 lr:0.0003000000 time/step:183.51s
|
| 347 |
+
[2025-11-29 08:48:06,475] - step:337/900 train_loss:1.0600 lr:0.0003000000 time/step:184.12s
|
| 348 |
+
[2025-11-29 08:51:14,075] - step:338/900 train_loss:1.0304 lr:0.0003000000 time/step:187.60s
|
| 349 |
+
[2025-11-29 08:54:18,478] - step:339/900 train_loss:1.0187 lr:0.0003000000 time/step:184.37s
|
| 350 |
+
[2025-11-29 08:57:23,665] - step:340/900 train_loss:1.0326 lr:0.0003000000 time/step:185.18s
|
| 351 |
+
[2025-11-29 09:00:30,230] - step:341/900 train_loss:1.0415 lr:0.0003000000 time/step:186.56s
|
| 352 |
+
[2025-11-29 09:03:35,007] - step:342/900 train_loss:1.0413 lr:0.0003000000 time/step:184.75s
|
| 353 |
+
[2025-11-29 09:06:39,120] - step:343/900 train_loss:1.0377 lr:0.0003000000 time/step:184.10s
|
| 354 |
+
[2025-11-29 09:09:43,682] - step:344/900 train_loss:1.0266 lr:0.0003000000 time/step:184.56s
|
| 355 |
+
[2025-11-29 09:12:50,738] - step:345/900 train_loss:1.0305 lr:0.0003000000 time/step:187.04s
|
| 356 |
+
[2025-11-29 09:15:54,975] - step:346/900 train_loss:1.0238 lr:0.0003000000 time/step:184.22s
|
| 357 |
+
[2025-11-29 09:18:59,184] - step:347/900 train_loss:1.0470 lr:0.0003000000 time/step:184.20s
|
| 358 |
+
[2025-11-29 09:22:05,583] - step:348/900 train_loss:1.0343 lr:0.0003000000 time/step:186.39s
|
| 359 |
+
[2025-11-29 09:25:09,502] - step:349/900 train_loss:1.0429 lr:0.0003000000 time/step:183.90s
|
| 360 |
+
[2025-11-29 09:28:14,785] - step:350/900 train_loss:1.0173 lr:0.0003000000 time/step:185.28s
|
| 361 |
+
[2025-11-29 09:31:22,664] - step:351/900 train_loss:1.0260 lr:0.0003000000 time/step:187.87s
|
| 362 |
+
[2025-11-29 09:34:27,994] - step:352/900 train_loss:1.0412 lr:0.0003000000 time/step:185.27s
|
| 363 |
+
[2025-11-29 09:37:33,386] - step:353/900 train_loss:1.0051 lr:0.0003000000 time/step:185.37s
|
| 364 |
+
[2025-11-29 09:40:39,936] - step:354/900 train_loss:1.0386 lr:0.0003000000 time/step:186.55s
|
| 365 |
+
[2025-11-29 09:43:45,796] - step:355/900 train_loss:1.0317 lr:0.0003000000 time/step:185.85s
|
| 366 |
+
[2025-11-29 09:46:51,082] - step:356/900 train_loss:1.0060 lr:0.0003000000 time/step:185.26s
|
| 367 |
+
[2025-11-29 09:49:56,919] - step:357/900 train_loss:1.0267 lr:0.0003000000 time/step:185.82s
|
| 368 |
+
[2025-11-29 09:53:05,845] - step:358/900 train_loss:1.0586 lr:0.0003000000 time/step:188.92s
|
| 369 |
+
[2025-11-29 09:56:13,021] - step:359/900 train_loss:1.0340 lr:0.0003000000 time/step:187.15s
|
| 370 |
+
[2025-11-29 09:59:19,033] - step:360/900 train_loss:1.0385 lr:0.0003000000 time/step:186.00s
|
| 371 |
+
[2025-11-29 10:02:25,949] - step:361/900 train_loss:1.0036 lr:0.0003000000 time/step:186.84s
|
| 372 |
+
[2025-11-29 10:05:30,167] - step:362/900 train_loss:1.0181 lr:0.0003000000 time/step:184.18s
|
| 373 |
+
[2025-11-29 10:08:34,860] - step:363/900 train_loss:1.0245 lr:0.0003000000 time/step:184.69s
|
| 374 |
+
[2025-11-29 10:11:40,819] - step:364/900 train_loss:1.0310 lr:0.0003000000 time/step:185.92s
|
| 375 |
+
[2025-11-29 10:14:44,430] - step:365/900 train_loss:1.0431 lr:0.0003000000 time/step:183.59s
|
| 376 |
+
[2025-11-29 10:17:49,210] - step:366/900 train_loss:1.0010 lr:0.0003000000 time/step:184.77s
|
| 377 |
+
[2025-11-29 10:20:56,812] - step:367/900 train_loss:1.0278 lr:0.0003000000 time/step:187.59s
|
| 378 |
+
[2025-11-29 10:24:03,874] - step:368/900 train_loss:1.0450 lr:0.0003000000 time/step:187.04s
|
| 379 |
+
[2025-11-29 10:27:08,644] - step:369/900 train_loss:1.0187 lr:0.0003000000 time/step:184.76s
|
| 380 |
+
[2025-11-29 10:30:12,932] - step:370/900 train_loss:1.0198 lr:0.0003000000 time/step:184.28s
|
| 381 |
+
[2025-11-29 10:33:19,131] - step:371/900 train_loss:1.0267 lr:0.0003000000 time/step:186.19s
|
| 382 |
+
[2025-11-29 10:36:23,611] - step:372/900 train_loss:1.0050 lr:0.0003000000 time/step:184.44s
|
| 383 |
+
[2025-11-29 10:39:27,504] - step:373/900 train_loss:1.0285 lr:0.0003000000 time/step:183.89s
|
| 384 |
+
[2025-11-29 10:42:34,817] - step:374/900 train_loss:1.0273 lr:0.0003000000 time/step:187.31s
|
| 385 |
+
[2025-11-29 10:45:39,564] - step:375/900 train_loss:1.0304 lr:0.0003000000 time/step:184.73s
|
| 386 |
+
[2025-11-29 10:48:44,710] - step:376/900 train_loss:1.0118 lr:0.0003000000 time/step:185.13s
|
| 387 |
+
[2025-11-29 10:51:52,055] - step:377/900 train_loss:1.0109 lr:0.0003000000 time/step:187.34s
|
| 388 |
+
[2025-11-29 10:54:57,418] - step:378/900 train_loss:1.0240 lr:0.0003000000 time/step:185.34s
|
| 389 |
+
[2025-11-29 10:58:02,656] - step:379/900 train_loss:0.9999 lr:0.0003000000 time/step:185.22s
|
| 390 |
+
[2025-11-29 11:01:08,733] - step:380/900 train_loss:1.0321 lr:0.0003000000 time/step:186.07s
|
| 391 |
+
[2025-11-29 11:04:14,600] - step:381/900 train_loss:1.0227 lr:0.0003000000 time/step:185.85s
|
| 392 |
+
[2025-11-29 11:07:19,868] - step:382/900 train_loss:1.0266 lr:0.0003000000 time/step:185.24s
|
| 393 |
+
[2025-11-29 11:10:25,377] - step:383/900 train_loss:1.0351 lr:0.0003000000 time/step:185.51s
|
| 394 |
+
[2025-11-29 11:13:31,325] - step:384/900 train_loss:1.0345 lr:0.0003000000 time/step:185.94s
|
| 395 |
+
[2025-11-29 11:16:37,117] - step:385/900 train_loss:1.0095 lr:0.0003000000 time/step:185.74s
|
| 396 |
+
[2025-11-29 11:19:42,673] - step:386/900 train_loss:1.0084 lr:0.0003000000 time/step:185.53s
|
| 397 |
+
[2025-11-29 11:22:49,798] - step:387/900 train_loss:1.0363 lr:0.0003000000 time/step:187.11s
|
| 398 |
+
[2025-11-29 11:25:54,407] - step:388/900 train_loss:1.0115 lr:0.0003000000 time/step:184.58s
|
| 399 |
+
[2025-11-29 11:29:01,044] - step:389/900 train_loss:1.0391 lr:0.0003000000 time/step:186.63s
|
| 400 |
+
[2025-11-29 11:32:08,861] - step:390/900 train_loss:1.0325 lr:0.0003000000 time/step:187.81s
|
| 401 |
+
[2025-11-29 11:35:13,748] - step:391/900 train_loss:1.0275 lr:0.0003000000 time/step:184.87s
|
| 402 |
+
[2025-11-29 11:38:19,164] - step:392/900 train_loss:1.0071 lr:0.0003000000 time/step:185.41s
|
| 403 |
+
[2025-11-29 11:41:26,071] - step:393/900 train_loss:1.0140 lr:0.0003000000 time/step:186.89s
|
| 404 |
+
[2025-11-29 11:44:30,887] - step:394/900 train_loss:1.0238 lr:0.0003000000 time/step:184.80s
|
| 405 |
+
[2025-11-29 11:47:36,554] - step:395/900 train_loss:1.0223 lr:0.0003000000 time/step:185.63s
|
| 406 |
+
[2025-11-29 11:50:42,929] - step:396/900 train_loss:1.0248 lr:0.0003000000 time/step:186.36s
|
| 407 |
+
[2025-11-29 11:53:49,516] - step:397/900 train_loss:1.0155 lr:0.0003000000 time/step:186.58s
|
| 408 |
+
[2025-11-29 11:56:55,065] - step:398/900 train_loss:1.0266 lr:0.0003000000 time/step:185.52s
|
| 409 |
+
[2025-11-29 12:00:00,180] - step:399/900 train_loss:0.9997 lr:0.0003000000 time/step:185.11s
|
| 410 |
+
[2025-11-29 12:03:08,327] - step:400/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@400.pt...
|
| 411 |
+
[2025-11-29 12:03:08,330] - step:400/900 train_loss:1.0379 lr:0.0003000000 time/step:186.52s
|
| 412 |
+
[2025-11-29 12:06:13,170] - step:401/900 train_loss:1.0278 lr:0.0003000000 time/step:184.80s
|
| 413 |
+
[2025-11-29 12:09:18,331] - step:402/900 train_loss:0.9898 lr:0.0003000000 time/step:185.15s
|
| 414 |
+
[2025-11-29 12:12:24,948] - step:403/900 train_loss:0.9872 lr:0.0003000000 time/step:186.60s
|
| 415 |
+
[2025-11-29 12:15:30,939] - step:404/900 train_loss:1.0125 lr:0.0003000000 time/step:185.98s
|
| 416 |
+
[2025-11-29 12:18:37,715] - step:405/900 train_loss:1.0320 lr:0.0003000000 time/step:186.75s
|
| 417 |
+
[2025-11-29 12:21:43,437] - step:406/900 train_loss:1.0104 lr:0.0003000000 time/step:185.67s
|
| 418 |
+
[2025-11-29 12:24:46,661] - step:407/900 train_loss:1.0245 lr:0.0003000000 time/step:183.20s
|
| 419 |
+
[2025-11-29 12:27:52,506] - step:408/900 train_loss:1.0147 lr:0.0003000000 time/step:185.84s
|
| 420 |
+
[2025-11-29 12:30:57,358] - step:409/900 train_loss:1.0103 lr:0.0003000000 time/step:184.84s
|
| 421 |
+
[2025-11-29 12:34:03,720] - step:410/900 train_loss:0.9781 lr:0.0003000000 time/step:186.36s
|
| 422 |
+
[2025-11-29 12:37:13,118] - step:411/900 train_loss:0.9906 lr:0.0003000000 time/step:189.35s
|
| 423 |
+
[2025-11-29 12:40:19,894] - step:412/900 train_loss:1.0237 lr:0.0003000000 time/step:186.75s
|
| 424 |
+
[2025-11-29 12:43:25,422] - step:413/900 train_loss:1.0114 lr:0.0003000000 time/step:185.52s
|
| 425 |
+
[2025-11-29 12:46:30,579] - step:414/900 train_loss:1.0147 lr:0.0003000000 time/step:184.99s
|
| 426 |
+
[2025-11-29 12:49:38,876] - step:415/900 train_loss:1.0150 lr:0.0003000000 time/step:188.29s
|
| 427 |
+
[2025-11-29 12:52:43,633] - step:416/900 train_loss:1.0239 lr:0.0003000000 time/step:184.73s
|
| 428 |
+
[2025-11-29 12:55:48,060] - step:417/900 train_loss:1.0036 lr:0.0003000000 time/step:184.39s
|
| 429 |
+
[2025-11-29 12:58:55,753] - step:418/900 train_loss:1.0140 lr:0.0003000000 time/step:187.68s
|
| 430 |
+
[2025-11-29 13:02:00,431] - step:419/900 train_loss:1.0039 lr:0.0003000000 time/step:184.66s
|
| 431 |
+
[2025-11-29 13:05:05,089] - step:420/900 train_loss:1.0203 lr:0.0003000000 time/step:184.64s
|
| 432 |
+
[2025-11-29 13:08:12,316] - step:421/900 train_loss:1.0304 lr:0.0003000000 time/step:187.22s
|
| 433 |
+
[2025-11-29 13:11:17,410] - step:422/900 train_loss:1.0034 lr:0.0003000000 time/step:185.08s
|
| 434 |
+
[2025-11-29 13:14:22,416] - step:423/900 train_loss:1.0279 lr:0.0003000000 time/step:185.00s
|
| 435 |
+
[2025-11-29 13:17:27,732] - step:424/900 train_loss:1.0213 lr:0.0003000000 time/step:185.29s
|
| 436 |
+
[2025-11-29 13:20:34,573] - step:425/900 train_loss:0.9987 lr:0.0003000000 time/step:186.70s
|
| 437 |
+
[2025-11-29 13:23:39,421] - step:426/900 train_loss:0.9673 lr:0.0003000000 time/step:184.84s
|
| 438 |
+
[2025-11-29 13:26:44,176] - step:427/900 train_loss:1.0108 lr:0.0003000000 time/step:184.74s
|
| 439 |
+
[2025-11-29 13:29:52,082] - step:428/900 train_loss:1.0243 lr:0.0003000000 time/step:187.87s
|
| 440 |
+
[2025-11-29 13:32:56,984] - step:429/900 train_loss:0.9843 lr:0.0003000000 time/step:184.88s
|
| 441 |
+
[2025-11-29 13:36:01,659] - step:430/900 train_loss:1.0269 lr:0.0003000000 time/step:184.66s
|
| 442 |
+
[2025-11-29 13:39:09,363] - step:431/900 train_loss:1.0047 lr:0.0003000000 time/step:187.70s
|
| 443 |
+
[2025-11-29 13:42:14,007] - step:432/900 train_loss:0.9957 lr:0.0003000000 time/step:184.63s
|
| 444 |
+
[2025-11-29 13:45:17,936] - step:433/900 train_loss:1.0006 lr:0.0003000000 time/step:183.92s
|
| 445 |
+
[2025-11-29 13:48:23,683] - step:434/900 train_loss:1.0080 lr:0.0003000000 time/step:185.74s
|
| 446 |
+
[2025-11-29 13:51:28,718] - step:435/900 train_loss:1.0033 lr:0.0003000000 time/step:185.01s
|
| 447 |
+
[2025-11-29 13:54:33,479] - step:436/900 train_loss:1.0077 lr:0.0003000000 time/step:184.74s
|
| 448 |
+
[2025-11-29 13:57:38,454] - step:437/900 train_loss:0.9913 lr:0.0003000000 time/step:184.96s
|
| 449 |
+
[2025-11-29 14:00:45,973] - step:438/900 train_loss:1.0221 lr:0.0003000000 time/step:187.50s
|
| 450 |
+
[2025-11-29 14:03:50,970] - step:439/900 train_loss:1.0017 lr:0.0003000000 time/step:184.98s
|
| 451 |
+
[2025-11-29 14:06:56,103] - step:440/900 train_loss:0.9966 lr:0.0003000000 time/step:185.11s
|
| 452 |
+
[2025-11-29 14:10:03,916] - step:441/900 train_loss:1.0023 lr:0.0003000000 time/step:187.81s
|
| 453 |
+
[2025-11-29 14:13:09,854] - step:442/900 train_loss:1.0154 lr:0.0003000000 time/step:185.93s
|
| 454 |
+
[2025-11-29 14:16:13,300] - step:443/900 train_loss:0.9993 lr:0.0003000000 time/step:183.43s
|
| 455 |
+
[2025-11-29 14:19:19,989] - step:444/900 train_loss:1.0085 lr:0.0003000000 time/step:186.68s
|
| 456 |
+
[2025-11-29 14:22:23,752] - step:445/900 train_loss:0.9978 lr:0.0003000000 time/step:183.75s
|
| 457 |
+
[2025-11-29 14:25:27,620] - step:446/900 train_loss:1.0148 lr:0.0003000000 time/step:183.84s
|
| 458 |
+
[2025-11-29 14:28:33,765] - step:447/900 train_loss:0.9874 lr:0.0003000000 time/step:186.14s
|
| 459 |
+
[2025-11-29 14:31:37,881] - step:448/900 train_loss:1.0202 lr:0.0003000000 time/step:184.10s
|
| 460 |
+
[2025-11-29 14:34:41,135] - step:449/900 train_loss:0.9902 lr:0.0003000000 time/step:183.23s
|
| 461 |
+
[2025-11-29 14:37:45,361] - step:450/900 train_loss:1.0036 lr:0.0003000000 time/step:184.22s
|
| 462 |
+
[2025-11-29 14:40:53,203] - step:451/900 train_loss:1.0127 lr:0.0003000000 time/step:187.83s
|
| 463 |
+
[2025-11-29 14:43:58,011] - step:452/900 train_loss:1.0339 lr:0.0003000000 time/step:184.77s
|
| 464 |
+
[2025-11-29 14:47:02,348] - step:453/900 train_loss:0.9934 lr:0.0003000000 time/step:184.30s
|
| 465 |
+
[2025-11-29 14:50:10,497] - step:454/900 train_loss:1.0175 lr:0.0003000000 time/step:188.14s
|
| 466 |
+
[2025-11-29 14:53:14,572] - step:455/900 train_loss:1.0011 lr:0.0003000000 time/step:184.06s
|
| 467 |
+
[2025-11-29 14:56:19,257] - step:456/900 train_loss:1.0329 lr:0.0003000000 time/step:184.66s
|
| 468 |
+
[2025-11-29 14:59:26,311] - step:457/900 train_loss:0.9970 lr:0.0003000000 time/step:187.05s
|
| 469 |
+
[2025-11-29 15:02:31,228] - step:458/900 train_loss:0.9849 lr:0.0003000000 time/step:184.91s
|
| 470 |
+
[2025-11-29 15:05:35,912] - step:459/900 train_loss:1.0443 lr:0.0003000000 time/step:184.67s
|
| 471 |
+
[2025-11-29 15:08:44,234] - step:460/900 train_loss:1.0166 lr:0.0003000000 time/step:188.30s
|
| 472 |
+
[2025-11-29 15:11:49,196] - step:461/900 train_loss:0.9857 lr:0.0003000000 time/step:184.94s
|
| 473 |
+
[2025-11-29 15:14:54,073] - step:462/900 train_loss:0.9887 lr:0.0003000000 time/step:184.87s
|
| 474 |
+
[2025-11-29 15:18:01,015] - step:463/900 train_loss:1.0142 lr:0.0003000000 time/step:186.91s
|
| 475 |
+
[2025-11-29 15:21:10,436] - step:464/900 train_loss:1.0084 lr:0.0003000000 time/step:189.42s
|
| 476 |
+
[2025-11-29 15:24:17,825] - step:465/900 train_loss:1.0079 lr:0.0003000000 time/step:187.37s
|
| 477 |
+
[2025-11-29 15:27:21,991] - step:466/900 train_loss:0.9989 lr:0.0003000000 time/step:184.15s
|
| 478 |
+
[2025-11-29 15:30:29,430] - step:467/900 train_loss:1.0027 lr:0.0003000000 time/step:187.42s
|
| 479 |
+
[2025-11-29 15:33:34,038] - step:468/900 train_loss:0.9864 lr:0.0003000000 time/step:184.56s
|
| 480 |
+
[2025-11-29 15:36:38,606] - step:469/900 train_loss:0.9922 lr:0.0003000000 time/step:184.56s
|
| 481 |
+
[2025-11-29 15:39:50,010] - step:470/900 train_loss:1.0046 lr:0.0003000000 time/step:191.39s
|
| 482 |
+
[2025-11-29 15:42:54,426] - step:471/900 train_loss:0.9947 lr:0.0003000000 time/step:184.39s
|
| 483 |
+
[2025-11-29 15:45:58,386] - step:472/900 train_loss:0.9856 lr:0.0003000000 time/step:183.94s
|
| 484 |
+
[2025-11-29 15:49:06,443] - step:473/900 train_loss:1.0102 lr:0.0003000000 time/step:188.03s
|
| 485 |
+
[2025-11-29 15:52:11,651] - step:474/900 train_loss:0.9815 lr:0.0003000000 time/step:185.17s
|
| 486 |
+
[2025-11-29 15:55:16,024] - step:475/900 train_loss:0.9870 lr:0.0003000000 time/step:184.37s
|
| 487 |
+
[2025-11-29 15:58:20,934] - step:476/900 train_loss:0.9902 lr:0.0003000000 time/step:184.90s
|
| 488 |
+
[2025-11-29 16:01:34,770] - step:477/900 train_loss:1.0044 lr:0.0003000000 time/step:193.83s
|
| 489 |
+
[2025-11-29 16:04:40,969] - step:478/900 train_loss:0.9706 lr:0.0003000000 time/step:186.18s
|
| 490 |
+
[2025-11-29 16:07:46,966] - step:479/900 train_loss:0.9861 lr:0.0003000000 time/step:185.98s
|
| 491 |
+
[2025-11-29 16:11:03,013] - step:480/900 train_loss:1.0035 lr:0.0003000000 time/step:196.03s
|
| 492 |
+
[2025-11-29 16:14:06,891] - step:481/900 train_loss:0.9746 lr:0.0003000000 time/step:183.84s
|
| 493 |
+
[2025-11-29 16:17:10,864] - step:482/900 train_loss:0.9883 lr:0.0003000000 time/step:183.95s
|
| 494 |
+
[2025-11-29 16:20:17,244] - step:483/900 train_loss:1.0245 lr:0.0003000000 time/step:186.37s
|
| 495 |
+
[2025-11-29 16:23:21,291] - step:484/900 train_loss:1.0193 lr:0.0003000000 time/step:184.03s
|
| 496 |
+
[2025-11-29 16:26:24,937] - step:485/900 train_loss:0.9953 lr:0.0003000000 time/step:183.63s
|
| 497 |
+
[2025-11-29 16:29:32,575] - step:486/900 train_loss:0.9787 lr:0.0003000000 time/step:187.63s
|
| 498 |
+
[2025-11-29 16:32:37,230] - step:487/900 train_loss:0.9812 lr:0.0003000000 time/step:184.64s
|
| 499 |
+
[2025-11-29 16:35:41,884] - step:488/900 train_loss:0.9911 lr:0.0003000000 time/step:184.65s
|
| 500 |
+
[2025-11-29 16:38:47,753] - step:489/900 train_loss:0.9665 lr:0.0003000000 time/step:185.84s
|
| 501 |
+
[2025-11-29 16:41:53,739] - step:490/900 train_loss:0.9663 lr:0.0003000000 time/step:185.97s
|
| 502 |
+
[2025-11-29 16:44:58,613] - step:491/900 train_loss:1.0147 lr:0.0003000000 time/step:184.87s
|
| 503 |
+
[2025-11-29 16:48:03,370] - step:492/900 train_loss:1.0107 lr:0.0003000000 time/step:184.74s
|
| 504 |
+
[2025-11-29 16:51:11,045] - step:493/900 train_loss:0.9999 lr:0.0003000000 time/step:187.65s
|
| 505 |
+
[2025-11-29 16:54:15,696] - step:494/900 train_loss:0.9875 lr:0.0003000000 time/step:184.64s
|
| 506 |
+
[2025-11-29 16:57:20,229] - step:495/900 train_loss:0.9990 lr:0.0003000000 time/step:184.53s
|
| 507 |
+
[2025-11-29 17:00:26,562] - step:496/900 train_loss:0.9889 lr:0.0003000000 time/step:186.31s
|
| 508 |
+
[2025-11-29 17:03:30,547] - step:497/900 train_loss:0.9835 lr:0.0003000000 time/step:183.97s
|
| 509 |
+
[2025-11-29 17:06:34,456] - step:498/900 train_loss:1.0062 lr:0.0003000000 time/step:183.89s
|
| 510 |
+
[2025-11-29 17:09:40,945] - step:499/900 train_loss:0.9785 lr:0.0003000000 time/step:186.48s
|
| 511 |
+
[2025-11-29 17:12:49,048] - step:500/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@500.pt...
|
| 512 |
+
[2025-11-29 17:12:49,051] - step:500/900 train_loss:1.0054 lr:0.0003000000 time/step:186.42s
|
| 513 |
+
[2025-11-29 17:15:53,725] - step:501/900 train_loss:0.9961 lr:0.0003000000 time/step:184.67s
|
| 514 |
+
[2025-11-29 17:18:59,020] - step:502/900 train_loss:1.0013 lr:0.0003000000 time/step:185.28s
|
| 515 |
+
[2025-11-29 17:22:06,405] - step:503/900 train_loss:0.9746 lr:0.0003000000 time/step:187.35s
|
| 516 |
+
[2025-11-29 17:25:11,190] - step:504/900 train_loss:0.9977 lr:0.0003000000 time/step:184.76s
|
| 517 |
+
[2025-11-29 17:28:16,344] - step:505/900 train_loss:0.9737 lr:0.0003000000 time/step:185.15s
|
| 518 |
+
[2025-11-29 17:31:24,506] - step:506/900 train_loss:1.0010 lr:0.0003000000 time/step:188.14s
|
| 519 |
+
[2025-11-29 17:34:29,492] - step:507/900 train_loss:0.9852 lr:0.0003000000 time/step:184.96s
|
| 520 |
+
[2025-11-29 17:37:34,522] - step:508/900 train_loss:0.9887 lr:0.0003000000 time/step:185.00s
|
| 521 |
+
[2025-11-29 17:40:41,236] - step:509/900 train_loss:0.9830 lr:0.0003000000 time/step:186.70s
|
| 522 |
+
[2025-11-29 17:43:46,307] - step:510/900 train_loss:0.9844 lr:0.0003000000 time/step:185.05s
|
| 523 |
+
[2025-11-29 17:46:49,527] - step:511/900 train_loss:0.9718 lr:0.0003000000 time/step:183.20s
|
| 524 |
+
[2025-11-29 17:49:52,170] - step:512/900 train_loss:0.9866 lr:0.0003000000 time/step:182.64s
|
| 525 |
+
[2025-11-29 17:52:55,148] - step:513/900 train_loss:1.0106 lr:0.0003000000 time/step:182.96s
|
| 526 |
+
[2025-11-29 17:55:59,252] - step:514/900 train_loss:0.9629 lr:0.0003000000 time/step:184.09s
|
| 527 |
+
[2025-11-29 17:59:04,675] - step:515/900 train_loss:1.0048 lr:0.0003000000 time/step:185.41s
|
| 528 |
+
[2025-11-29 18:02:13,006] - step:516/900 train_loss:0.9964 lr:0.0003000000 time/step:188.32s
|
| 529 |
+
[2025-11-29 18:05:16,855] - step:517/900 train_loss:1.0057 lr:0.0003000000 time/step:183.84s
|
| 530 |
+
[2025-11-29 18:08:20,622] - step:518/900 train_loss:0.9859 lr:0.0003000000 time/step:183.75s
|
| 531 |
+
[2025-11-29 18:11:26,793] - step:519/900 train_loss:0.9714 lr:0.0003000000 time/step:186.16s
|
| 532 |
+
[2025-11-29 18:14:29,889] - step:520/900 train_loss:0.9652 lr:0.0003000000 time/step:183.08s
|
| 533 |
+
[2025-11-29 18:17:33,349] - step:521/900 train_loss:0.9786 lr:0.0003000000 time/step:183.43s
|
| 534 |
+
[2025-11-29 18:20:39,275] - step:522/900 train_loss:0.9721 lr:0.0003000000 time/step:185.92s
|
| 535 |
+
[2025-11-29 18:23:43,034] - step:523/900 train_loss:0.9862 lr:0.0003000000 time/step:183.75s
|
| 536 |
+
[2025-11-29 18:26:46,732] - step:524/900 train_loss:0.9942 lr:0.0003000000 time/step:183.66s
|
| 537 |
+
[2025-11-29 18:29:50,749] - step:525/900 train_loss:0.9850 lr:0.0003000000 time/step:184.01s
|
| 538 |
+
[2025-11-29 18:32:55,277] - step:526/900 train_loss:0.9804 lr:0.0003000000 time/step:184.51s
|
| 539 |
+
[2025-11-29 18:36:00,371] - step:527/900 train_loss:0.9845 lr:0.0003000000 time/step:185.08s
|
| 540 |
+
[2025-11-29 18:39:06,966] - step:528/900 train_loss:0.9832 lr:0.0003000000 time/step:186.57s
|
| 541 |
+
[2025-11-29 18:42:15,798] - step:529/900 train_loss:0.9967 lr:0.0003000000 time/step:188.82s
|
| 542 |
+
[2025-11-29 18:45:22,445] - step:530/900 train_loss:0.9910 lr:0.0003000000 time/step:186.63s
|
| 543 |
+
[2025-11-29 18:48:29,542] - step:531/900 train_loss:0.9714 lr:0.0003000000 time/step:187.07s
|
| 544 |
+
[2025-11-29 18:51:38,501] - step:532/900 train_loss:0.9868 lr:0.0003000000 time/step:188.95s
|
| 545 |
+
[2025-11-29 18:54:45,821] - step:533/900 train_loss:0.9929 lr:0.0003000000 time/step:187.30s
|
| 546 |
+
[2025-11-29 18:57:53,314] - step:534/900 train_loss:0.9879 lr:0.0003000000 time/step:187.47s
|
| 547 |
+
[2025-11-29 19:01:01,583] - step:535/900 train_loss:1.0067 lr:0.0003000000 time/step:188.26s
|
| 548 |
+
[2025-11-29 19:04:09,493] - step:536/900 train_loss:0.9836 lr:0.0003000000 time/step:187.89s
|
| 549 |
+
[2025-11-29 19:07:16,734] - step:537/900 train_loss:0.9868 lr:0.0003000000 time/step:187.21s
|
| 550 |
+
[2025-11-29 19:10:24,993] - step:538/900 train_loss:0.9951 lr:0.0003000000 time/step:188.24s
|
| 551 |
+
[2025-11-29 19:13:31,431] - step:539/900 train_loss:0.9761 lr:0.0003000000 time/step:186.41s
|
| 552 |
+
[2025-11-29 19:16:36,819] - step:540/900 train_loss:0.9742 lr:0.0003000000 time/step:185.38s
|
| 553 |
+
[2025-11-29 19:19:43,799] - step:541/900 train_loss:0.9745 lr:0.0003000000 time/step:186.95s
|
| 554 |
+
[2025-11-29 19:22:52,532] - step:542/900 train_loss:0.9817 lr:0.0003000000 time/step:188.73s
|
| 555 |
+
[2025-11-29 19:25:59,216] - step:543/900 train_loss:0.9777 lr:0.0003000000 time/step:186.67s
|
| 556 |
+
[2025-11-29 19:29:05,849] - step:544/900 train_loss:0.9960 lr:0.0003000000 time/step:186.61s
|
| 557 |
+
[2025-11-29 19:32:14,558] - step:545/900 train_loss:0.9811 lr:0.0003000000 time/step:188.70s
|
| 558 |
+
[2025-11-29 19:35:21,607] - step:546/900 train_loss:0.9882 lr:0.0003000000 time/step:187.02s
|
| 559 |
+
[2025-11-29 19:38:28,518] - step:547/900 train_loss:0.9938 lr:0.0003000000 time/step:186.88s
|
| 560 |
+
[2025-11-29 19:41:37,225] - step:548/900 train_loss:0.9407 lr:0.0003000000 time/step:188.70s
|
| 561 |
+
[2025-11-29 19:44:44,063] - step:549/900 train_loss:0.9774 lr:0.0003000000 time/step:186.81s
|
| 562 |
+
[2025-11-29 19:47:50,816] - step:550/900 train_loss:0.9913 lr:0.0003000000 time/step:186.73s
|
| 563 |
+
[2025-11-29 19:50:58,844] - step:551/900 train_loss:0.9948 lr:0.0003000000 time/step:188.02s
|
| 564 |
+
[2025-11-29 19:54:06,212] - step:552/900 train_loss:0.9696 lr:0.0003000000 time/step:187.35s
|
| 565 |
+
[2025-11-29 19:57:12,084] - step:553/900 train_loss:0.9706 lr:0.0003000000 time/step:185.85s
|
| 566 |
+
[2025-11-29 20:00:18,128] - step:554/900 train_loss:0.9871 lr:0.0003000000 time/step:186.03s
|
| 567 |
+
[2025-11-29 20:03:26,623] - step:555/900 train_loss:0.9930 lr:0.0003000000 time/step:188.48s
|
| 568 |
+
[2025-11-29 20:06:33,230] - step:556/900 train_loss:0.9752 lr:0.0003000000 time/step:186.55s
|
| 569 |
+
[2025-11-29 20:09:39,696] - step:557/900 train_loss:0.9850 lr:0.0003000000 time/step:186.45s
|
| 570 |
+
[2025-11-29 20:12:48,229] - step:558/900 train_loss:0.9720 lr:0.0003000000 time/step:188.52s
|
| 571 |
+
[2025-11-29 20:15:53,987] - step:559/900 train_loss:0.9962 lr:0.0003000000 time/step:185.74s
|
| 572 |
+
[2025-11-29 20:19:00,484] - step:560/900 train_loss:0.9922 lr:0.0003000000 time/step:186.48s
|
| 573 |
+
[2025-11-29 20:22:09,247] - step:561/900 train_loss:0.9740 lr:0.0003000000 time/step:188.74s
|
| 574 |
+
[2025-11-29 20:25:16,473] - step:562/900 train_loss:0.9712 lr:0.0003000000 time/step:187.21s
|
| 575 |
+
[2025-11-29 20:28:23,403] - step:563/900 train_loss:0.9612 lr:0.0003000000 time/step:186.92s
|
| 576 |
+
[2025-11-29 20:31:30,909] - step:564/900 train_loss:0.9914 lr:0.0003000000 time/step:187.50s
|
| 577 |
+
[2025-11-29 20:34:38,710] - step:565/900 train_loss:0.9836 lr:0.0003000000 time/step:187.78s
|
| 578 |
+
[2025-11-29 20:37:45,056] - step:566/900 train_loss:0.9814 lr:0.0003000000 time/step:186.33s
|
| 579 |
+
[2025-11-29 20:40:51,873] - step:567/900 train_loss:0.9865 lr:0.0003000000 time/step:186.81s
|
| 580 |
+
[2025-11-29 20:44:00,559] - step:568/900 train_loss:0.9917 lr:0.0003000000 time/step:188.68s
|
| 581 |
+
[2025-11-29 20:47:07,062] - step:569/900 train_loss:0.9644 lr:0.0003000000 time/step:186.48s
|
| 582 |
+
[2025-11-29 20:50:13,303] - step:570/900 train_loss:0.9759 lr:0.0003000000 time/step:186.19s
|
| 583 |
+
[2025-11-29 20:53:21,695] - step:571/900 train_loss:0.9703 lr:0.0003000000 time/step:188.39s
|
| 584 |
+
[2025-11-29 20:56:29,148] - step:572/900 train_loss:0.9713 lr:0.0003000000 time/step:187.43s
|
| 585 |
+
[2025-11-29 20:59:35,993] - step:573/900 train_loss:0.9549 lr:0.0003000000 time/step:186.82s
|
| 586 |
+
[2025-11-29 21:02:44,463] - step:574/900 train_loss:0.9696 lr:0.0003000000 time/step:188.47s
|
| 587 |
+
[2025-11-29 21:05:51,247] - step:575/900 train_loss:0.9648 lr:0.0003000000 time/step:186.77s
|
| 588 |
+
[2025-11-29 21:08:57,001] - step:576/900 train_loss:0.9695 lr:0.0003000000 time/step:185.74s
|
| 589 |
+
[2025-11-29 21:12:03,873] - step:577/900 train_loss:0.9728 lr:0.0003000000 time/step:186.86s
|
| 590 |
+
[2025-11-29 21:15:10,900] - step:578/900 train_loss:0.9767 lr:0.0003000000 time/step:187.02s
|
| 591 |
+
[2025-11-29 21:18:14,501] - step:579/900 train_loss:0.9643 lr:0.0003000000 time/step:183.56s
|
| 592 |
+
[2025-11-29 21:21:16,045] - step:580/900 train_loss:0.9826 lr:0.0003000000 time/step:181.53s
|
| 593 |
+
[2025-11-29 21:24:19,527] - step:581/900 train_loss:0.9792 lr:0.0003000000 time/step:183.48s
|
| 594 |
+
[2025-11-29 21:27:25,340] - step:582/900 train_loss:0.9852 lr:0.0003000000 time/step:185.73s
|
| 595 |
+
[2025-11-29 21:30:32,498] - step:583/900 train_loss:0.9699 lr:0.0003000000 time/step:187.15s
|
| 596 |
+
[2025-11-29 21:33:40,663] - step:584/900 train_loss:0.9709 lr:0.0003000000 time/step:188.14s
|
| 597 |
+
[2025-11-29 21:36:47,891] - step:585/900 train_loss:0.9673 lr:0.0003000000 time/step:187.21s
|
| 598 |
+
[2025-11-29 21:39:54,798] - step:586/900 train_loss:0.9792 lr:0.0003000000 time/step:186.90s
|
| 599 |
+
[2025-11-29 21:43:04,568] - step:587/900 train_loss:0.9784 lr:0.0003000000 time/step:189.77s
|
| 600 |
+
[2025-11-29 21:46:11,882] - step:588/900 train_loss:0.9719 lr:0.0003000000 time/step:187.29s
|
| 601 |
+
[2025-11-29 21:49:18,906] - step:589/900 train_loss:0.9834 lr:0.0003000000 time/step:187.01s
|
| 602 |
+
[2025-11-29 21:52:25,621] - step:590/900 train_loss:0.9659 lr:0.0003000000 time/step:186.70s
|
| 603 |
+
[2025-11-29 21:55:31,655] - step:591/900 train_loss:0.9658 lr:0.0003000000 time/step:185.94s
|
| 604 |
+
[2025-11-29 21:58:38,212] - step:592/900 train_loss:0.9855 lr:0.0003000000 time/step:186.53s
|
| 605 |
+
[2025-11-29 22:01:44,812] - step:593/900 train_loss:0.9691 lr:0.0003000000 time/step:186.59s
|
| 606 |
+
[2025-11-29 22:04:51,951] - step:594/900 train_loss:0.9781 lr:0.0003000000 time/step:187.13s
|
| 607 |
+
[2025-11-29 22:07:57,915] - step:595/900 train_loss:0.9579 lr:0.0003000000 time/step:185.94s
|
| 608 |
+
[2025-11-29 22:11:04,854] - step:596/900 train_loss:0.9731 lr:0.0003000000 time/step:186.91s
|
| 609 |
+
[2025-11-29 22:14:13,434] - step:597/900 train_loss:0.9715 lr:0.0003000000 time/step:188.57s
|
| 610 |
+
[2025-11-29 22:17:20,910] - step:598/900 train_loss:0.9886 lr:0.0003000000 time/step:187.46s
|
| 611 |
+
[2025-11-29 22:20:27,176] - step:599/900 train_loss:0.9657 lr:0.0003000000 time/step:186.24s
|
| 612 |
+
[2025-11-29 22:23:34,717] - step:600/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@600.pt...
|
| 613 |
+
[2025-11-29 22:23:34,723] - step:600/900 train_loss:0.9532 lr:0.0003000000 time/step:185.95s
|
| 614 |
+
[2025-11-29 22:26:38,518] - step:601/900 train_loss:0.9535 lr:0.0003000000 time/step:183.79s
|
| 615 |
+
[2025-11-29 22:29:41,900] - step:602/900 train_loss:0.9374 lr:0.0003000000 time/step:183.35s
|
| 616 |
+
[2025-11-29 22:32:47,468] - step:603/900 train_loss:0.9662 lr:0.0003000000 time/step:185.52s
|
| 617 |
+
[2025-11-29 22:35:53,752] - step:604/900 train_loss:0.9587 lr:0.0003000000 time/step:186.16s
|
| 618 |
+
[2025-11-29 22:38:58,466] - step:605/900 train_loss:0.9739 lr:0.0003000000 time/step:184.70s
|
| 619 |
+
[2025-11-29 22:42:03,657] - step:606/900 train_loss:0.9563 lr:0.0003000000 time/step:185.17s
|
| 620 |
+
[2025-11-29 22:45:12,058] - step:607/900 train_loss:0.9584 lr:0.0003000000 time/step:188.39s
|
| 621 |
+
[2025-11-29 22:48:18,310] - step:608/900 train_loss:0.9694 lr:0.0003000000 time/step:186.23s
|
| 622 |
+
[2025-11-29 22:51:24,367] - step:609/900 train_loss:0.9681 lr:0.0003000000 time/step:186.05s
|
| 623 |
+
[2025-11-29 22:54:31,573] - step:610/900 train_loss:0.9582 lr:0.0003000000 time/step:187.20s
|
| 624 |
+
[2025-11-29 22:57:36,240] - step:611/900 train_loss:0.9781 lr:0.0003000000 time/step:184.66s
|
| 625 |
+
[2025-11-29 23:00:39,793] - step:612/900 train_loss:0.9707 lr:0.0003000000 time/step:183.54s
|
| 626 |
+
[2025-11-29 23:03:48,177] - step:613/900 train_loss:0.9626 lr:0.0003000000 time/step:188.38s
|
| 627 |
+
[2025-11-29 23:06:54,527] - step:614/900 train_loss:0.9525 lr:0.0003000000 time/step:186.34s
|
| 628 |
+
[2025-11-29 23:10:00,576] - step:615/900 train_loss:0.9825 lr:0.0003000000 time/step:186.03s
|
| 629 |
+
[2025-11-29 23:13:06,944] - step:616/900 train_loss:0.9648 lr:0.0003000000 time/step:186.35s
|
| 630 |
+
[2025-11-29 23:16:13,313] - step:617/900 train_loss:0.9833 lr:0.0003000000 time/step:186.36s
|
| 631 |
+
[2025-11-29 23:19:18,008] - step:618/900 train_loss:0.9619 lr:0.0003000000 time/step:184.67s
|
| 632 |
+
[2025-11-29 23:22:23,418] - step:619/900 train_loss:0.9681 lr:0.0003000000 time/step:185.40s
|
| 633 |
+
[2025-11-29 23:25:30,799] - step:620/900 train_loss:0.9705 lr:0.0003000000 time/step:187.36s
|
| 634 |
+
[2025-11-29 23:28:36,096] - step:621/900 train_loss:0.9884 lr:0.0003000000 time/step:185.28s
|
| 635 |
+
[2025-11-29 23:31:40,935] - step:622/900 train_loss:0.9623 lr:0.0003000000 time/step:184.83s
|
| 636 |
+
[2025-11-29 23:34:49,164] - step:623/900 train_loss:0.9781 lr:0.0003000000 time/step:188.22s
|
| 637 |
+
[2025-11-29 23:37:55,808] - step:624/900 train_loss:0.9558 lr:0.0003000000 time/step:186.62s
|
| 638 |
+
[2025-11-29 23:41:02,902] - step:625/900 train_loss:0.9641 lr:0.0003000000 time/step:187.08s
|
| 639 |
+
[2025-11-29 23:44:12,190] - step:626/900 train_loss:0.9631 lr:0.0003000000 time/step:189.26s
|
| 640 |
+
[2025-11-29 23:47:18,211] - step:627/900 train_loss:0.9820 lr:0.0003000000 time/step:185.99s
|
| 641 |
+
[2025-11-29 23:50:22,907] - step:628/900 train_loss:0.9647 lr:0.0003000000 time/step:184.67s
|
| 642 |
+
[2025-11-29 23:53:29,293] - step:629/900 train_loss:0.9504 lr:0.0003000000 time/step:186.38s
|
| 643 |
+
[2025-11-29 23:56:35,007] - step:630/900 train_loss:0.9845 lr:0.0003000000 time/step:185.70s
|
| 644 |
+
[2025-11-29 23:59:41,063] - step:631/900 train_loss:0.9710 lr:0.0003000000 time/step:186.04s
|
| 645 |
+
[2025-11-30 00:02:47,384] - step:632/900 train_loss:0.9673 lr:0.0003000000 time/step:186.31s
|
| 646 |
+
[2025-11-30 00:05:54,675] - step:633/900 train_loss:0.9644 lr:0.0003000000 time/step:187.29s
|
| 647 |
+
[2025-11-30 00:09:00,681] - step:634/900 train_loss:0.9751 lr:0.0003000000 time/step:185.98s
|
| 648 |
+
[2025-11-30 00:12:07,170] - step:635/900 train_loss:0.9427 lr:0.0003000000 time/step:186.47s
|
| 649 |
+
[2025-11-30 00:15:16,394] - step:636/900 train_loss:0.9941 lr:0.0003000000 time/step:189.21s
|
| 650 |
+
[2025-11-30 00:18:21,885] - step:637/900 train_loss:0.9627 lr:0.0003000000 time/step:185.46s
|
| 651 |
+
[2025-11-30 00:21:26,909] - step:638/900 train_loss:0.9713 lr:0.0003000000 time/step:185.01s
|
| 652 |
+
[2025-11-30 00:24:34,518] - step:639/900 train_loss:0.9477 lr:0.0003000000 time/step:187.59s
|
| 653 |
+
[2025-11-30 00:27:39,860] - step:640/900 train_loss:0.9413 lr:0.0003000000 time/step:185.32s
|
| 654 |
+
[2025-11-30 00:30:46,082] - step:641/900 train_loss:0.9583 lr:0.0003000000 time/step:186.18s
|
| 655 |
+
[2025-11-30 00:33:53,085] - step:642/900 train_loss:0.9927 lr:0.0003000000 time/step:186.99s
|
| 656 |
+
[2025-11-30 00:37:00,236] - step:643/900 train_loss:0.9658 lr:0.0003000000 time/step:187.13s
|
| 657 |
+
[2025-11-30 00:40:06,191] - step:644/900 train_loss:0.9532 lr:0.0003000000 time/step:185.92s
|
| 658 |
+
[2025-11-30 00:43:11,626] - step:645/900 train_loss:0.9510 lr:0.0003000000 time/step:185.43s
|
| 659 |
+
[2025-11-30 00:46:16,854] - step:646/900 train_loss:0.9572 lr:0.0003000000 time/step:185.21s
|
| 660 |
+
[2025-11-30 00:49:20,350] - step:647/900 train_loss:0.9524 lr:0.0003000000 time/step:183.47s
|
| 661 |
+
[2025-11-30 00:52:23,936] - step:648/900 train_loss:0.9724 lr:0.0003000000 time/step:183.58s
|
| 662 |
+
[2025-11-30 00:55:32,534] - step:649/900 train_loss:1.0075 lr:0.0003000000 time/step:188.59s
|
| 663 |
+
[2025-11-30 00:58:37,981] - step:650/900 train_loss:0.9637 lr:0.0003000000 time/step:185.43s
|
| 664 |
+
[2025-11-30 01:01:43,633] - step:651/900 train_loss:0.9657 lr:0.0003000000 time/step:185.63s
|
| 665 |
+
[2025-11-30 01:04:53,089] - step:652/900 train_loss:0.9597 lr:0.0003000000 time/step:189.45s
|
| 666 |
+
[2025-11-30 01:08:00,352] - step:653/900 train_loss:0.9692 lr:0.0003000000 time/step:187.22s
|
| 667 |
+
[2025-11-30 01:11:07,645] - step:654/900 train_loss:0.9529 lr:0.0003000000 time/step:187.28s
|
| 668 |
+
[2025-11-30 01:14:14,239] - step:655/900 train_loss:0.9482 lr:0.0003000000 time/step:186.59s
|
| 669 |
+
[2025-11-30 01:17:20,123] - step:656/900 train_loss:0.9579 lr:0.0003000000 time/step:185.88s
|
| 670 |
+
[2025-11-30 01:20:25,496] - step:657/900 train_loss:0.9504 lr:0.0003000000 time/step:185.35s
|
| 671 |
+
[2025-11-30 01:23:31,180] - step:658/900 train_loss:0.9749 lr:0.0003000000 time/step:185.66s
|
| 672 |
+
[2025-11-30 01:26:37,555] - step:659/900 train_loss:0.9706 lr:0.0003000000 time/step:186.35s
|
| 673 |
+
[2025-11-30 01:29:43,411] - step:660/900 train_loss:0.9571 lr:0.0003000000 time/step:185.84s
|
| 674 |
+
[2025-11-30 01:32:49,562] - step:661/900 train_loss:0.9464 lr:0.0003000000 time/step:186.14s
|
| 675 |
+
[2025-11-30 01:35:57,969] - step:662/900 train_loss:0.9430 lr:0.0003000000 time/step:188.40s
|
| 676 |
+
[2025-11-30 01:39:04,057] - step:663/900 train_loss:0.9606 lr:0.0003000000 time/step:186.06s
|
| 677 |
+
[2025-11-30 01:42:08,918] - step:664/900 train_loss:0.9484 lr:0.0003000000 time/step:184.85s
|
| 678 |
+
[2025-11-30 01:45:15,790] - step:665/900 train_loss:0.9660 lr:0.0003000000 time/step:186.86s
|
| 679 |
+
[2025-11-30 01:48:21,042] - step:666/900 train_loss:0.9715 lr:0.0003000000 time/step:185.22s
|
| 680 |
+
[2025-11-30 01:51:25,399] - step:667/900 train_loss:0.9747 lr:0.0003000000 time/step:184.34s
|
| 681 |
+
[2025-11-30 01:54:31,595] - step:668/900 train_loss:0.9405 lr:0.0003000000 time/step:186.18s
|
| 682 |
+
[2025-11-30 01:57:37,951] - step:669/900 train_loss:0.9562 lr:0.0003000000 time/step:186.34s
|
| 683 |
+
[2025-11-30 02:00:44,059] - step:670/900 train_loss:0.9800 lr:0.0003000000 time/step:186.09s
|
| 684 |
+
[2025-11-30 02:03:49,586] - step:671/900 train_loss:0.9646 lr:0.0003000000 time/step:185.52s
|
| 685 |
+
[2025-11-30 02:06:57,124] - step:672/900 train_loss:0.9656 lr:0.0003000000 time/step:187.53s
|
| 686 |
+
[2025-11-30 02:10:03,956] - step:673/900 train_loss:0.9544 lr:0.0003000000 time/step:186.80s
|
| 687 |
+
[2025-11-30 02:13:09,941] - step:674/900 train_loss:0.9604 lr:0.0003000000 time/step:185.98s
|
| 688 |
+
[2025-11-30 02:16:17,892] - step:675/900 train_loss:0.9639 lr:0.0003000000 time/step:187.95s
|
| 689 |
+
[2025-11-30 02:19:23,974] - step:676/900 train_loss:0.9455 lr:0.0003000000 time/step:186.05s
|
| 690 |
+
[2025-11-30 02:22:30,221] - step:677/900 train_loss:0.9509 lr:0.0003000000 time/step:186.20s
|
| 691 |
+
[2025-11-30 02:25:37,961] - step:678/900 train_loss:0.9363 lr:0.0003000000 time/step:187.73s
|
| 692 |
+
[2025-11-30 02:28:44,267] - step:679/900 train_loss:0.9520 lr:0.0003000000 time/step:186.29s
|
| 693 |
+
[2025-11-30 02:31:50,617] - step:680/900 train_loss:0.9565 lr:0.0003000000 time/step:186.34s
|
| 694 |
+
[2025-11-30 02:34:58,672] - step:681/900 train_loss:0.9727 lr:0.0003000000 time/step:188.04s
|
| 695 |
+
[2025-11-30 02:38:05,140] - step:682/900 train_loss:0.9563 lr:0.0003000000 time/step:186.46s
|
| 696 |
+
[2025-11-30 02:41:09,992] - step:683/900 train_loss:0.9809 lr:0.0003000000 time/step:184.79s
|
| 697 |
+
[2025-11-30 02:44:15,338] - step:684/900 train_loss:0.9526 lr:0.0003000000 time/step:185.34s
|
| 698 |
+
[2025-11-30 02:47:21,385] - step:685/900 train_loss:0.9675 lr:0.0003000000 time/step:186.04s
|
| 699 |
+
[2025-11-30 02:50:25,872] - step:686/900 train_loss:0.9466 lr:0.0003000000 time/step:184.44s
|
| 700 |
+
[2025-11-30 02:53:31,333] - step:687/900 train_loss:0.9575 lr:0.0003000000 time/step:185.43s
|
| 701 |
+
[2025-11-30 02:56:38,782] - step:688/900 train_loss:0.9673 lr:0.0003000000 time/step:187.43s
|
| 702 |
+
[2025-11-30 02:59:44,702] - step:689/900 train_loss:0.9582 lr:0.0003000000 time/step:185.90s
|
| 703 |
+
[2025-11-30 03:02:50,929] - step:690/900 train_loss:0.9581 lr:0.0003000000 time/step:186.22s
|
| 704 |
+
[2025-11-30 03:05:57,119] - step:691/900 train_loss:0.9407 lr:0.0003000000 time/step:186.18s
|
| 705 |
+
[2025-11-30 03:09:02,467] - step:692/900 train_loss:0.9567 lr:0.0003000000 time/step:185.33s
|
| 706 |
+
[2025-11-30 03:12:07,335] - step:693/900 train_loss:0.9362 lr:0.0003000000 time/step:184.84s
|
| 707 |
+
[2025-11-30 03:15:14,078] - step:694/900 train_loss:0.9692 lr:0.0003000000 time/step:186.74s
|
| 708 |
+
[2025-11-30 03:18:20,680] - step:695/900 train_loss:0.9288 lr:0.0003000000 time/step:186.58s
|
| 709 |
+
[2025-11-30 03:21:26,753] - step:696/900 train_loss:0.9616 lr:0.0003000000 time/step:186.05s
|
| 710 |
+
[2025-11-30 03:24:32,423] - step:697/900 train_loss:0.9203 lr:0.0003000000 time/step:185.66s
|
| 711 |
+
[2025-11-30 03:27:41,442] - step:698/900 train_loss:0.9552 lr:0.0003000000 time/step:189.01s
|
| 712 |
+
[2025-11-30 03:30:45,196] - step:699/900 train_loss:0.9601 lr:0.0003000000 time/step:183.72s
|
| 713 |
+
[2025-11-30 03:33:51,669] - step:700/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@700.pt...
|
| 714 |
+
[2025-11-30 03:33:51,671] - step:700/900 train_loss:0.9515 lr:0.0003000000 time/step:184.76s
|
| 715 |
+
[2025-11-30 03:36:59,452] - step:701/900 train_loss:0.9587 lr:0.0003000000 time/step:187.77s
|
| 716 |
+
[2025-11-30 03:40:05,996] - step:702/900 train_loss:0.9688 lr:0.0003000000 time/step:186.44s
|
| 717 |
+
[2025-11-30 03:43:12,080] - step:703/900 train_loss:0.9386 lr:0.0003000000 time/step:186.06s
|
| 718 |
+
[2025-11-30 03:46:19,965] - step:704/900 train_loss:0.9925 lr:0.0003000000 time/step:187.88s
|
| 719 |
+
[2025-11-30 03:49:25,758] - step:705/900 train_loss:0.9425 lr:0.0003000000 time/step:185.77s
|
| 720 |
+
[2025-11-30 03:52:30,961] - step:706/900 train_loss:0.9720 lr:0.0003000000 time/step:185.19s
|
| 721 |
+
[2025-11-30 03:55:39,418] - step:707/900 train_loss:0.9434 lr:0.0003000000 time/step:188.44s
|
| 722 |
+
[2025-11-30 03:58:47,467] - step:708/900 train_loss:0.9549 lr:0.0003000000 time/step:188.03s
|
| 723 |
+
[2025-11-30 04:01:53,611] - step:709/900 train_loss:0.9511 lr:0.0003000000 time/step:186.12s
|
| 724 |
+
[2025-11-30 04:04:58,923] - step:710/900 train_loss:0.9714 lr:0.0003000000 time/step:185.31s
|
| 725 |
+
[2025-11-30 04:08:06,914] - step:711/900 train_loss:0.9647 lr:0.0003000000 time/step:187.98s
|
| 726 |
+
[2025-11-30 04:11:12,929] - step:712/900 train_loss:0.9789 lr:0.0003000000 time/step:185.96s
|
| 727 |
+
[2025-11-30 04:14:19,154] - step:713/900 train_loss:0.9418 lr:0.0003000000 time/step:186.22s
|
| 728 |
+
[2025-11-30 04:17:27,669] - step:714/900 train_loss:0.9417 lr:0.0003000000 time/step:188.50s
|
| 729 |
+
[2025-11-30 04:20:32,769] - step:715/900 train_loss:0.9507 lr:0.0003000000 time/step:185.08s
|
| 730 |
+
[2025-11-30 04:23:37,756] - step:716/900 train_loss:0.9567 lr:0.0003000000 time/step:184.98s
|
| 731 |
+
[2025-11-30 04:26:45,091] - step:717/900 train_loss:0.9389 lr:0.0003000000 time/step:187.32s
|
| 732 |
+
[2025-11-30 04:29:50,043] - step:718/900 train_loss:0.9477 lr:0.0003000000 time/step:184.87s
|
| 733 |
+
[2025-11-30 04:32:53,971] - step:719/900 train_loss:0.9619 lr:0.0003000000 time/step:183.92s
|
| 734 |
+
[2025-11-30 04:36:00,320] - step:720/900 train_loss:0.9533 lr:0.0003000000 time/step:186.34s
|
| 735 |
+
[2025-11-30 04:39:07,896] - step:721/900 train_loss:0.9650 lr:0.0003000000 time/step:187.55s
|
| 736 |
+
[2025-11-30 04:42:13,833] - step:722/900 train_loss:0.9603 lr:0.0003000000 time/step:185.91s
|
| 737 |
+
[2025-11-30 04:45:20,122] - step:723/900 train_loss:0.9604 lr:0.0003000000 time/step:186.28s
|
| 738 |
+
[2025-11-30 04:48:28,513] - step:724/900 train_loss:0.9635 lr:0.0003000000 time/step:188.38s
|
| 739 |
+
[2025-11-30 04:51:34,485] - step:725/900 train_loss:0.9550 lr:0.0003000000 time/step:185.94s
|
| 740 |
+
[2025-11-30 04:54:40,827] - step:726/900 train_loss:0.9679 lr:0.0003000000 time/step:186.34s
|
| 741 |
+
[2025-11-30 04:57:50,319] - step:727/900 train_loss:0.9607 lr:0.0003000000 time/step:189.46s
|
| 742 |
+
[2025-11-30 05:00:56,724] - step:728/900 train_loss:0.9880 lr:0.0003000000 time/step:186.35s
|
| 743 |
+
[2025-11-30 05:04:02,482] - step:729/900 train_loss:0.9358 lr:0.0003000000 time/step:185.75s
|
| 744 |
+
[2025-11-30 05:07:10,367] - step:730/900 train_loss:0.9521 lr:0.0003000000 time/step:187.88s
|
| 745 |
+
[2025-11-30 05:10:16,528] - step:731/900 train_loss:0.9466 lr:0.0003000000 time/step:186.13s
|
| 746 |
+
[2025-11-30 05:13:22,743] - step:732/900 train_loss:0.9481 lr:0.0003000000 time/step:186.21s
|
| 747 |
+
[2025-11-30 05:16:29,572] - step:733/900 train_loss:0.9613 lr:0.0003000000 time/step:186.81s
|
| 748 |
+
[2025-11-30 05:19:37,538] - step:734/900 train_loss:0.9525 lr:0.0003000000 time/step:187.96s
|
| 749 |
+
[2025-11-30 05:22:40,037] - step:735/900 train_loss:0.9457 lr:0.0003000000 time/step:182.48s
|
| 750 |
+
[2025-11-30 05:25:46,040] - step:736/900 train_loss:0.9572 lr:0.0003000000 time/step:185.97s
|
| 751 |
+
[2025-11-30 05:29:04,236] - step:737/900 train_loss:0.9545 lr:0.0003000000 time/step:196.30s
|
| 752 |
+
[2025-11-30 05:32:09,010] - step:738/900 train_loss:0.9633 lr:0.0003000000 time/step:184.76s
|
| 753 |
+
[2025-11-30 05:35:14,741] - step:739/900 train_loss:0.9598 lr:0.0003000000 time/step:185.72s
|
| 754 |
+
[2025-11-30 05:38:21,357] - step:740/900 train_loss:0.9342 lr:0.0003000000 time/step:186.60s
|
| 755 |
+
[2025-11-30 05:41:26,232] - step:741/900 train_loss:0.9550 lr:0.0003000000 time/step:184.84s
|
| 756 |
+
[2025-11-30 05:44:31,223] - step:742/900 train_loss:0.9696 lr:0.0003000000 time/step:184.98s
|
| 757 |
+
[2025-11-30 05:47:42,033] - step:743/900 train_loss:0.9468 lr:0.0003000000 time/step:190.80s
|
| 758 |
+
[2025-11-30 05:50:47,075] - step:744/900 train_loss:0.9588 lr:0.0003000000 time/step:184.98s
|
| 759 |
+
[2025-11-30 05:53:51,033] - step:745/900 train_loss:0.9498 lr:0.0003000000 time/step:183.94s
|
| 760 |
+
[2025-11-30 05:56:57,016] - step:746/900 train_loss:0.9529 lr:0.0003000000 time/step:185.97s
|
| 761 |
+
[2025-11-30 06:00:01,884] - step:747/900 train_loss:0.9376 lr:0.0003000000 time/step:184.84s
|
| 762 |
+
[2025-11-30 06:03:06,392] - step:748/900 train_loss:0.9415 lr:0.0003000000 time/step:184.49s
|
| 763 |
+
[2025-11-30 06:06:13,954] - step:749/900 train_loss:0.9581 lr:0.0003000000 time/step:187.55s
|
| 764 |
+
[2025-11-30 06:09:18,747] - step:750/900 train_loss:0.9494 lr:0.0003000000 time/step:184.77s
|
| 765 |
+
[2025-11-30 06:12:24,279] - step:751/900 train_loss:0.9586 lr:0.0003000000 time/step:185.52s
|
| 766 |
+
[2025-11-30 06:15:30,040] - step:752/900 train_loss:0.9491 lr:0.0003000000 time/step:185.75s
|
| 767 |
+
[2025-11-30 06:18:37,170] - step:753/900 train_loss:0.9585 lr:0.0003000000 time/step:187.12s
|
| 768 |
+
[2025-11-30 06:21:42,398] - step:754/900 train_loss:0.9441 lr:0.0003000000 time/step:185.20s
|
| 769 |
+
[2025-11-30 06:24:48,671] - step:755/900 train_loss:0.9533 lr:0.0003000000 time/step:186.25s
|
| 770 |
+
[2025-11-30 06:27:56,633] - step:756/900 train_loss:0.9433 lr:0.0003000000 time/step:187.94s
|
| 771 |
+
[2025-11-30 06:31:02,691] - step:757/900 train_loss:0.9368 lr:0.0003000000 time/step:186.01s
|
| 772 |
+
[2025-11-30 06:34:08,615] - step:758/900 train_loss:0.9504 lr:0.0003000000 time/step:185.91s
|
| 773 |
+
[2025-11-30 06:37:15,950] - step:759/900 train_loss:0.9412 lr:0.0003000000 time/step:187.31s
|
| 774 |
+
[2025-11-30 06:40:22,539] - step:760/900 train_loss:0.9330 lr:0.0003000000 time/step:186.51s
|
| 775 |
+
[2025-11-30 06:43:28,876] - step:761/900 train_loss:0.9342 lr:0.0003000000 time/step:186.33s
|
| 776 |
+
[2025-11-30 06:46:36,580] - step:762/900 train_loss:0.9329 lr:0.0003000000 time/step:187.68s
|
| 777 |
+
[2025-11-30 06:49:43,404] - step:763/900 train_loss:0.9465 lr:0.0003000000 time/step:186.79s
|
| 778 |
+
[2025-11-30 06:52:49,437] - step:764/900 train_loss:0.9507 lr:0.0003000000 time/step:186.01s
|
| 779 |
+
[2025-11-30 06:55:55,801] - step:765/900 train_loss:0.9754 lr:0.0003000000 time/step:186.35s
|
| 780 |
+
[2025-11-30 06:59:04,165] - step:766/900 train_loss:0.9323 lr:0.0003000000 time/step:188.35s
|
| 781 |
+
[2025-11-30 07:02:09,611] - step:767/900 train_loss:0.9398 lr:0.0003000000 time/step:185.37s
|
| 782 |
+
[2025-11-30 07:05:15,543] - step:768/900 train_loss:0.9773 lr:0.0003000000 time/step:185.92s
|
| 783 |
+
[2025-11-30 07:08:23,040] - step:769/900 train_loss:0.9300 lr:0.0003000000 time/step:187.49s
|
| 784 |
+
[2025-11-30 07:11:27,989] - step:770/900 train_loss:0.9565 lr:0.0003000000 time/step:184.93s
|
| 785 |
+
[2025-11-30 07:14:34,166] - step:771/900 train_loss:0.9791 lr:0.0003000000 time/step:186.17s
|
| 786 |
+
[2025-11-30 07:17:41,334] - step:772/900 train_loss:0.9323 lr:0.0003000000 time/step:187.15s
|
| 787 |
+
[2025-11-30 07:20:48,245] - step:773/900 train_loss:0.9384 lr:0.0003000000 time/step:186.89s
|
| 788 |
+
[2025-11-30 07:23:55,000] - step:774/900 train_loss:0.9620 lr:0.0003000000 time/step:186.75s
|
| 789 |
+
[2025-11-30 07:27:04,805] - step:775/900 train_loss:0.9535 lr:0.0003000000 time/step:189.79s
|
| 790 |
+
[2025-11-30 07:30:11,933] - step:776/900 train_loss:0.9500 lr:0.0003000000 time/step:187.11s
|
| 791 |
+
[2025-11-30 07:33:18,809] - step:777/900 train_loss:0.9556 lr:0.0003000000 time/step:186.84s
|
| 792 |
+
[2025-11-30 07:36:25,083] - step:778/900 train_loss:0.9280 lr:0.0003000000 time/step:186.27s
|
| 793 |
+
[2025-11-30 07:39:34,387] - step:779/900 train_loss:0.9373 lr:0.0003000000 time/step:189.30s
|
| 794 |
+
[2025-11-30 07:42:40,512] - step:780/900 train_loss:0.9556 lr:0.0003000000 time/step:186.10s
|
| 795 |
+
[2025-11-30 07:45:46,330] - step:781/900 train_loss:0.9568 lr:0.0003000000 time/step:185.80s
|
| 796 |
+
[2025-11-30 07:48:53,453] - step:782/900 train_loss:0.9737 lr:0.0003000000 time/step:187.11s
|
| 797 |
+
[2025-11-30 07:51:59,918] - step:783/900 train_loss:0.9267 lr:0.0003000000 time/step:186.44s
|
| 798 |
+
[2025-11-30 07:55:06,653] - step:784/900 train_loss:0.9683 lr:0.0003000000 time/step:186.73s
|
| 799 |
+
[2025-11-30 07:58:14,271] - step:785/900 train_loss:0.9249 lr:0.0003000000 time/step:187.60s
|
| 800 |
+
[2025-11-30 08:01:21,688] - step:786/900 train_loss:0.9586 lr:0.0003000000 time/step:187.32s
|
| 801 |
+
[2025-11-30 08:04:28,087] - step:787/900 train_loss:0.9470 lr:0.0003000000 time/step:186.39s
|
| 802 |
+
[2025-11-30 08:07:34,899] - step:788/900 train_loss:0.9591 lr:0.0003000000 time/step:186.79s
|
| 803 |
+
[2025-11-30 08:10:41,723] - step:789/900 train_loss:0.9433 lr:0.0003000000 time/step:186.81s
|
| 804 |
+
[2025-11-30 08:13:47,670] - step:790/900 train_loss:0.9496 lr:0.0003000000 time/step:185.92s
|
| 805 |
+
[2025-11-30 08:16:53,831] - step:791/900 train_loss:0.9459 lr:0.0003000000 time/step:186.15s
|
| 806 |
+
[2025-11-30 08:20:01,102] - step:792/900 train_loss:0.9601 lr:0.0003000000 time/step:187.25s
|
| 807 |
+
[2025-11-30 08:23:06,763] - step:793/900 train_loss:0.9408 lr:0.0003000000 time/step:185.64s
|
| 808 |
+
[2025-11-30 08:26:12,187] - step:794/900 train_loss:0.9571 lr:0.0003000000 time/step:185.41s
|
| 809 |
+
[2025-11-30 08:29:18,964] - step:795/900 train_loss:0.9670 lr:0.0003000000 time/step:186.77s
|
| 810 |
+
[2025-11-30 08:32:24,852] - step:796/900 train_loss:0.9432 lr:0.0003000000 time/step:185.86s
|
| 811 |
+
[2025-11-30 08:35:30,345] - step:797/900 train_loss:0.9347 lr:0.0003000000 time/step:185.49s
|
| 812 |
+
[2025-11-30 08:38:37,005] - step:798/900 train_loss:0.9431 lr:0.0003000000 time/step:186.65s
|
| 813 |
+
[2025-11-30 08:41:44,291] - step:799/900 train_loss:0.9548 lr:0.0003000000 time/step:187.24s
|
| 814 |
+
[2025-11-30 08:44:52,246] - step:800/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@800.pt...
|
| 815 |
+
[2025-11-30 08:44:52,247] - step:800/900 train_loss:0.9472 lr:0.0003000000 time/step:186.19s
|
| 816 |
+
[2025-11-30 08:47:59,432] - step:801/900 train_loss:0.9580 lr:0.0003000000 time/step:187.17s
|
| 817 |
+
[2025-11-30 08:51:07,677] - step:802/900 train_loss:0.9347 lr:0.0003000000 time/step:188.21s
|
| 818 |
+
[2025-11-30 08:54:13,610] - step:803/900 train_loss:0.9552 lr:0.0003000000 time/step:185.91s
|
| 819 |
+
[2025-11-30 08:57:20,822] - step:804/900 train_loss:0.9433 lr:0.0003000000 time/step:187.20s
|
| 820 |
+
[2025-11-30 09:00:27,788] - step:805/900 train_loss:0.9725 lr:0.0003000000 time/step:186.95s
|
| 821 |
+
[2025-11-30 09:03:33,451] - step:806/900 train_loss:0.9319 lr:0.0003000000 time/step:185.63s
|
| 822 |
+
[2025-11-30 09:06:38,728] - step:807/900 train_loss:0.9416 lr:0.0003000000 time/step:185.26s
|
| 823 |
+
[2025-11-30 09:09:45,243] - step:808/900 train_loss:0.9305 lr:0.0003000000 time/step:186.49s
|
| 824 |
+
[2025-11-30 09:12:51,277] - step:809/900 train_loss:0.9611 lr:0.0003000000 time/step:186.01s
|
| 825 |
+
[2025-11-30 09:15:57,601] - step:810/900 train_loss:0.9333 lr:0.0003000000 time/step:186.32s
|
| 826 |
+
[2025-11-30 09:19:05,758] - step:811/900 train_loss:0.9224 lr:0.0003000000 time/step:188.14s
|
| 827 |
+
[2025-11-30 09:22:13,002] - step:812/900 train_loss:0.9311 lr:0.0003000000 time/step:187.20s
|
| 828 |
+
[2025-11-30 09:25:19,895] - step:813/900 train_loss:0.9344 lr:0.0003000000 time/step:186.89s
|
| 829 |
+
[2025-11-30 09:28:27,220] - step:814/900 train_loss:0.9558 lr:0.0003000000 time/step:187.31s
|
| 830 |
+
[2025-11-30 09:31:34,977] - step:815/900 train_loss:0.9603 lr:0.0003000000 time/step:187.65s
|
| 831 |
+
[2025-11-30 09:34:41,403] - step:816/900 train_loss:0.9405 lr:0.0003000000 time/step:186.42s
|
| 832 |
+
[2025-11-30 09:37:48,995] - step:817/900 train_loss:0.9620 lr:0.0003000000 time/step:187.58s
|
| 833 |
+
[2025-11-30 09:40:56,468] - step:818/900 train_loss:0.9415 lr:0.0003000000 time/step:187.44s
|
| 834 |
+
[2025-11-30 09:44:02,893] - step:819/900 train_loss:0.9391 lr:0.0003000000 time/step:186.41s
|
| 835 |
+
[2025-11-30 09:47:08,977] - step:820/900 train_loss:0.9551 lr:0.0003000000 time/step:186.08s
|
| 836 |
+
[2025-11-30 09:50:18,823] - step:821/900 train_loss:0.9585 lr:0.0003000000 time/step:189.84s
|
| 837 |
+
[2025-11-30 09:53:24,812] - step:822/900 train_loss:0.9449 lr:0.0003000000 time/step:185.94s
|
| 838 |
+
[2025-11-30 09:56:30,627] - step:823/900 train_loss:0.9446 lr:0.0003000000 time/step:185.81s
|
| 839 |
+
[2025-11-30 09:59:38,002] - step:824/900 train_loss:0.9589 lr:0.0003000000 time/step:187.37s
|
| 840 |
+
[2025-11-30 10:02:45,105] - step:825/900 train_loss:0.9660 lr:0.0003000000 time/step:187.08s
|
| 841 |
+
[2025-11-30 10:05:50,874] - step:826/900 train_loss:0.9403 lr:0.0003000000 time/step:185.75s
|
| 842 |
+
[2025-11-30 10:08:57,938] - step:827/900 train_loss:0.9435 lr:0.0003000000 time/step:187.06s
|
| 843 |
+
[2025-11-30 10:12:06,575] - step:828/900 train_loss:0.9462 lr:0.0003000000 time/step:188.61s
|
| 844 |
+
[2025-11-30 10:15:12,880] - step:829/900 train_loss:0.9383 lr:0.0003000000 time/step:186.30s
|
| 845 |
+
[2025-11-30 10:18:19,887] - step:830/900 train_loss:0.9513 lr:0.0003000000 time/step:187.00s
|
| 846 |
+
[2025-11-30 10:21:26,611] - step:831/900 train_loss:0.9434 lr:0.0003000000 time/step:186.71s
|
| 847 |
+
[2025-11-30 10:24:32,173] - step:832/900 train_loss:0.9277 lr:0.0003000000 time/step:185.55s
|
| 848 |
+
[2025-11-30 10:27:38,320] - step:833/900 train_loss:0.9638 lr:0.0003000000 time/step:186.13s
|
| 849 |
+
[2025-11-30 10:30:45,530] - step:834/900 train_loss:0.9344 lr:0.0003000000 time/step:187.17s
|
| 850 |
+
[2025-11-30 10:33:51,305] - step:835/900 train_loss:0.9318 lr:0.0003000000 time/step:185.76s
|
| 851 |
+
[2025-11-30 10:36:56,815] - step:836/900 train_loss:0.9660 lr:0.0003000000 time/step:185.50s
|
| 852 |
+
[2025-11-30 10:40:03,772] - step:837/900 train_loss:0.9189 lr:0.0003000000 time/step:186.95s
|
| 853 |
+
[2025-11-30 10:43:10,385] - step:838/900 train_loss:0.9294 lr:0.0003000000 time/step:186.60s
|
| 854 |
+
[2025-11-30 10:46:16,106] - step:839/900 train_loss:0.9562 lr:0.0003000000 time/step:185.71s
|
| 855 |
+
[2025-11-30 10:49:23,701] - step:840/900 train_loss:0.9308 lr:0.0003000000 time/step:187.59s
|
| 856 |
+
[2025-11-30 10:52:31,109] - step:841/900 train_loss:0.9446 lr:0.0003000000 time/step:187.37s
|
| 857 |
+
[2025-11-30 10:55:36,766] - step:842/900 train_loss:0.9646 lr:0.0003000000 time/step:185.64s
|
| 858 |
+
[2025-11-30 10:58:43,991] - step:843/900 train_loss:0.9662 lr:0.0003000000 time/step:187.22s
|
| 859 |
+
[2025-11-30 11:01:51,907] - step:844/900 train_loss:0.9557 lr:0.0003000000 time/step:187.91s
|
| 860 |
+
[2025-11-30 11:04:58,972] - step:845/900 train_loss:0.9409 lr:0.0003000000 time/step:187.04s
|
| 861 |
+
[2025-11-30 11:08:05,417] - step:846/900 train_loss:0.9277 lr:0.0003000000 time/step:186.44s
|
| 862 |
+
[2025-11-30 11:11:12,807] - step:847/900 train_loss:0.9310 lr:0.0003000000 time/step:187.37s
|
| 863 |
+
[2025-11-30 11:14:18,599] - step:848/900 train_loss:0.9528 lr:0.0003000000 time/step:185.78s
|
| 864 |
+
[2025-11-30 11:17:24,283] - step:849/900 train_loss:0.9435 lr:0.0003000000 time/step:185.67s
|
| 865 |
+
[2025-11-30 11:20:29,958] - step:850/900 train_loss:0.9328 lr:0.0003000000 time/step:185.67s
|
| 866 |
+
[2025-11-30 11:23:37,238] - step:851/900 train_loss:0.9586 lr:0.0003000000 time/step:187.25s
|
| 867 |
+
[2025-11-30 11:26:43,509] - step:852/900 train_loss:0.9788 lr:0.0003000000 time/step:186.26s
|
| 868 |
+
[2025-11-30 11:29:50,577] - step:853/900 train_loss:0.9598 lr:0.0003000000 time/step:187.04s
|
| 869 |
+
[2025-11-30 11:32:58,592] - step:854/900 train_loss:0.9314 lr:0.0003000000 time/step:187.98s
|
| 870 |
+
[2025-11-30 11:36:05,368] - step:855/900 train_loss:0.9431 lr:0.0003000000 time/step:186.76s
|
| 871 |
+
[2025-11-30 11:39:14,068] - step:856/900 train_loss:0.9402 lr:0.0003000000 time/step:188.69s
|
| 872 |
+
[2025-11-30 11:42:21,149] - step:857/900 train_loss:0.9406 lr:0.0003000000 time/step:187.03s
|
| 873 |
+
[2025-11-30 11:45:27,269] - step:858/900 train_loss:0.9517 lr:0.0003000000 time/step:186.10s
|
| 874 |
+
[2025-11-30 11:48:33,589] - step:859/900 train_loss:0.9288 lr:0.0003000000 time/step:186.29s
|
| 875 |
+
[2025-11-30 11:51:41,526] - step:860/900 train_loss:0.9489 lr:0.0003000000 time/step:187.92s
|
| 876 |
+
[2025-11-30 11:54:48,310] - step:861/900 train_loss:0.9242 lr:0.0003000000 time/step:186.76s
|
| 877 |
+
[2025-11-30 11:57:55,433] - step:862/900 train_loss:0.9465 lr:0.0003000000 time/step:187.12s
|
| 878 |
+
[2025-11-30 12:01:02,214] - step:863/900 train_loss:0.9319 lr:0.0003000000 time/step:186.77s
|
| 879 |
+
[2025-11-30 12:04:10,157] - step:864/900 train_loss:0.9561 lr:0.0003000000 time/step:187.93s
|
| 880 |
+
[2025-11-30 12:07:16,580] - step:865/900 train_loss:0.9531 lr:0.0003000000 time/step:186.41s
|
| 881 |
+
[2025-11-30 12:10:24,225] - step:866/900 train_loss:0.9716 lr:0.0003000000 time/step:187.64s
|
| 882 |
+
[2025-11-30 12:13:32,116] - step:867/900 train_loss:0.9523 lr:0.0003000000 time/step:187.86s
|
| 883 |
+
[2025-11-30 12:16:38,751] - step:868/900 train_loss:0.9485 lr:0.0003000000 time/step:186.63s
|
| 884 |
+
[2025-11-30 12:19:46,482] - step:869/900 train_loss:0.9338 lr:0.0003000000 time/step:187.71s
|
| 885 |
+
[2025-11-30 12:22:55,768] - step:870/900 train_loss:0.9071 lr:0.0003000000 time/step:189.25s
|
| 886 |
+
[2025-11-30 12:26:03,017] - step:871/900 train_loss:0.9349 lr:0.0003000000 time/step:187.24s
|
| 887 |
+
[2025-11-30 12:29:11,277] - step:872/900 train_loss:0.9171 lr:0.0003000000 time/step:188.25s
|
| 888 |
+
[2025-11-30 12:32:17,975] - step:873/900 train_loss:0.9291 lr:0.0003000000 time/step:186.69s
|
| 889 |
+
[2025-11-30 12:35:23,652] - step:874/900 train_loss:0.9496 lr:0.0003000000 time/step:185.66s
|
| 890 |
+
[2025-11-30 12:38:30,634] - step:875/900 train_loss:0.9004 lr:0.0003000000 time/step:186.96s
|
| 891 |
+
[2025-11-30 12:41:36,335] - step:876/900 train_loss:0.9638 lr:0.0003000000 time/step:185.69s
|
| 892 |
+
[2025-11-30 12:44:43,698] - step:877/900 train_loss:0.9303 lr:0.0003000000 time/step:187.35s
|
| 893 |
+
[2025-11-30 12:47:49,883] - step:878/900 train_loss:0.9308 lr:0.0003000000 time/step:186.17s
|
| 894 |
+
[2025-11-30 12:50:57,119] - step:879/900 train_loss:0.9567 lr:0.0003000000 time/step:187.23s
|
| 895 |
+
[2025-11-30 12:54:05,570] - step:880/900 train_loss:0.9294 lr:0.0003000000 time/step:188.35s
|
| 896 |
+
[2025-11-30 12:57:11,908] - step:881/900 train_loss:0.9243 lr:0.0003000000 time/step:186.32s
|
| 897 |
+
[2025-11-30 13:00:18,512] - step:882/900 train_loss:0.9372 lr:0.0003000000 time/step:186.59s
|
| 898 |
+
[2025-11-30 13:03:25,956] - step:883/900 train_loss:0.9677 lr:0.0003000000 time/step:187.41s
|
| 899 |
+
[2025-11-30 13:06:32,030] - step:884/900 train_loss:0.9502 lr:0.0003000000 time/step:186.05s
|
| 900 |
+
[2025-11-30 13:09:38,746] - step:885/900 train_loss:0.9309 lr:0.0003000000 time/step:186.69s
|
| 901 |
+
[2025-11-30 13:12:45,777] - step:886/900 train_loss:0.9468 lr:0.0003000000 time/step:186.96s
|
| 902 |
+
[2025-11-30 13:15:51,552] - step:887/900 train_loss:0.9319 lr:0.0003000000 time/step:185.76s
|
| 903 |
+
[2025-11-30 13:18:58,191] - step:888/900 train_loss:0.9400 lr:0.0003000000 time/step:186.63s
|
| 904 |
+
[2025-11-30 13:22:04,951] - step:889/900 train_loss:0.9518 lr:0.0003000000 time/step:186.75s
|
| 905 |
+
[2025-11-30 13:25:13,370] - step:890/900 train_loss:0.9375 lr:0.0003000000 time/step:188.37s
|
| 906 |
+
[2025-11-30 13:28:19,675] - step:891/900 train_loss:0.9699 lr:0.0003000000 time/step:186.29s
|
| 907 |
+
[2025-11-30 13:31:27,143] - step:892/900 train_loss:0.9479 lr:0.0003000000 time/step:187.46s
|
| 908 |
+
[2025-11-30 13:34:34,338] - step:893/900 train_loss:0.9351 lr:0.0003000000 time/step:187.14s
|
| 909 |
+
[2025-11-30 13:37:40,472] - step:894/900 train_loss:0.9767 lr:0.0003000000 time/step:186.13s
|
| 910 |
+
[2025-11-30 13:40:48,083] - step:895/900 train_loss:0.9475 lr:0.0003000000 time/step:187.60s
|
| 911 |
+
[2025-11-30 13:43:55,818] - step:896/900 train_loss:0.9617 lr:0.0003000000 time/step:187.71s
|
| 912 |
+
[2025-11-30 13:47:01,872] - step:897/900 train_loss:0.9549 lr:0.0003000000 time/step:186.04s
|
| 913 |
+
[2025-11-30 13:50:09,170] - step:898/900 train_loss:0.9324 lr:0.0003000000 time/step:187.29s
|
| 914 |
+
[2025-11-30 13:53:18,377] - step:899/900 train_loss:0.9573 lr:0.0003000000 time/step:189.17s
|
| 915 |
+
[2025-11-30 13:56:26,957] - step:900/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@900.pt...
|
| 916 |
+
[2025-11-30 13:56:26,959] - step:900/900 train_loss:0.9387 lr:0.0003000000 time/step:186.63s
|
wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-11-28T15:19:48.477578816Z","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/run-20251128_151948-j8dmy8fe/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-11-28T15:19:48.699009069Z","level":"INFO","msg":"created new stream","id":"j8dmy8fe"}
|
| 3 |
+
{"time":"2025-11-28T15:19:48.699097403Z","level":"INFO","msg":"stream: started","id":"j8dmy8fe"}
|
| 4 |
+
{"time":"2025-11-28T15:19:48.699172779Z","level":"INFO","msg":"writer: Do: started","stream_id":"j8dmy8fe"}
|
| 5 |
+
{"time":"2025-11-28T15:19:48.699248275Z","level":"INFO","msg":"handler: started","stream_id":"j8dmy8fe"}
|
| 6 |
+
{"time":"2025-11-28T15:19:48.699271222Z","level":"INFO","msg":"sender: started","stream_id":"j8dmy8fe"}
|
| 7 |
+
{"time":"2025-11-28T15:19:49.160995748Z","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-11-28T20:38:35.352286792Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
|
| 9 |
+
{"time":"2025-11-30T13:56:28.774510068Z","level":"INFO","msg":"stream: closing","id":"j8dmy8fe"}
|
| 10 |
+
{"time":"2025-11-30T13:56:28.784684257Z","level":"INFO","msg":"Stopping system monitor"}
|
| 11 |
+
{"time":"2025-11-30T13:56:29.045992888Z","level":"INFO","msg":"Stopped system monitor"}
|
| 12 |
+
{"time":"2025-11-30T13:56:30.296345529Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 13 |
+
{"time":"2025-11-30T13:56:30.566218828Z","level":"INFO","msg":"handler: closed","stream_id":"j8dmy8fe"}
|
| 14 |
+
{"time":"2025-11-30T13:56:30.566263739Z","level":"INFO","msg":"writer: Close: closed","stream_id":"j8dmy8fe"}
|
| 15 |
+
{"time":"2025-11-30T13:56:30.573657463Z","level":"INFO","msg":"sender: closed","stream_id":"j8dmy8fe"}
|
| 16 |
+
{"time":"2025-11-30T13:56:30.573732716Z","level":"INFO","msg":"stream: closed","id":"j8dmy8fe"}
|
wandb/debug.log
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
|
| 2 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Configure stats pid to 3738330
|
| 3 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Loading settings from /home/agrivas/.config/wandb/settings
|
| 4 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Loading settings from /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/settings
|
| 5 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Loading settings from environment variables
|
| 6 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/run-20251128_151948-j8dmy8fe/logs/debug.log
|
| 7 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/run-20251128_151948-j8dmy8fe/logs/debug-internal.log
|
| 8 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:init():761] calling init triggers
|
| 9 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'compile': True, 'device': 'cuda', 'from_checkpoint': None, 'load_mtp_head_from_model': None, 'name': 'nanogpt', 'training': {'random_seed': 13, 'batch_size': 256, 'device_batch_size': 1, 'sequence_length': 8192, 'num_iterations': 900, 'learning_rate': 0.0003, 'use_scheduler': False, 'save_model': True, 'save_optimizer': True, 'save_model_every': 100, 'val_loss_every': 100, 'val_tokens': 4194304, 'expname': 'llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1'}, 'model': {'name': 'mtp', 'beta': 0.0, 'gamma': 1, 'kl_algorithm': 'full', 'kl_type': 'forward', 'model': {'_target_': 'mtp.models.mtp.MultiTokenLM', 'lm': '${lm.model}', 'circuit': '${circuit.model}', 'mt_head_kwargs': '${mt_head.hyperparameters}', 'init_from_lm_head': True, 'kl_type': '${model.kl_type}', 'kl_algorithm': '${model.kl_algorithm}', 'beta': 0, 'gamma': 0.9}}, 'circuit': {'name': 'btree', 'n_token': 8, 'n_component': 32, 'n_repetition': 1, 'model': {'_target_': 'mtp.models.circuits.CircuitModel', 'vocab_size': 268, 'n_token': 8, 'n_component': 32, 'n_repetition': 1, 'kind': 'btree'}}, 'mt_head': {'name': 'linear-evabyte', 'hyperparameters': {'type': 'evabyte', 'n_embd': 3072, 'transformer_n_head': 24, 'transformer_n_layer': 0, 'expander_type': 'linear', 'expander_n_layer': 1, 'freeze_vocab_unembedding': False, 'share_sum_weights': False, 'contextual_hmm_weights': True, 'init_hmm_identity': True}}, 'adaptor': {'name': 'none', 'hyperparameters': None}, 'lm': {'name': 'llama3-2-3b-byte', 'n_embd': 3072, 'n_head': 24, 'model': {'_target_': 'mtp.models.lm.LM', 'lm': None, 'encoder_only': True, 'from_checkpoint': None, 'from_huggingface': 'benjamin/Llama3-2-3B-IT-Byte', 'adaptor_kwargs': None, 'ref_enc': 'model', 'ref_head': 'lm_head', 'freeze': True}}, 'data': {'name': 'tulu3-llama3', 'train_bin': 'agrv/tulu-v3-sft-llama3-packed-seq-len-8192', 'val_bin': None, 'vocab_size': 268}, 'generate': {'speculative': False}, '_wandb': {}}
|
| 11 |
+
2025-11-28 15:19:48,445 INFO MainThread:3738330 [wandb_init.py:init():784] starting backend
|
| 12 |
+
2025-11-28 15:19:48,445 INFO MainThread:3738330 [wandb_init.py:init():788] sending inform_init request
|
| 13 |
+
2025-11-28 15:19:48,469 INFO MainThread:3738330 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-11-28 15:19:48,469 INFO MainThread:3738330 [wandb_init.py:init():798] backend started and connected
|
| 15 |
+
2025-11-28 15:19:48,474 INFO MainThread:3738330 [wandb_init.py:init():891] updated telemetry
|
| 16 |
+
2025-11-28 15:19:48,496 INFO MainThread:3738330 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-11-28 15:19:49,156 INFO MainThread:3738330 [wandb_init.py:init():990] starting run threads in backend
|
| 18 |
+
2025-11-28 15:19:49,910 INFO MainThread:3738330 [wandb_run.py:_console_start():2375] atexit reg
|
| 19 |
+
2025-11-28 15:19:49,910 INFO MainThread:3738330 [wandb_run.py:_redirect():2227] redirect: wrap_raw
|
| 20 |
+
2025-11-28 15:19:49,915 INFO MainThread:3738330 [wandb_run.py:_redirect():2292] Wrapping output streams.
|
| 21 |
+
2025-11-28 15:19:49,915 INFO MainThread:3738330 [wandb_run.py:_redirect():2315] Redirects installed.
|
| 22 |
+
2025-11-28 15:19:49,940 INFO MainThread:3738330 [wandb_init.py:init():1032] run started, returning control to user process
|
| 23 |
+
2025-11-30 13:56:28,347 INFO MsgRouterThr:3738330 [mailbox.py:close():129] Closing mailbox, abandoning 1 handles.
|
wandb/run-20251128_151948-j8dmy8fe/files/config.yaml
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.8
|
| 4 |
+
m:
|
| 5 |
+
- "1": train/ce_loss_at_7
|
| 6 |
+
"5": 2
|
| 7 |
+
"6":
|
| 8 |
+
- 1
|
| 9 |
+
- 3
|
| 10 |
+
"7": []
|
| 11 |
+
- "1": global_step
|
| 12 |
+
"5": 2
|
| 13 |
+
"6":
|
| 14 |
+
- 1
|
| 15 |
+
- 3
|
| 16 |
+
"7": []
|
| 17 |
+
- "1": train/ce_loss_at_8
|
| 18 |
+
"5": 2
|
| 19 |
+
"6":
|
| 20 |
+
- 1
|
| 21 |
+
- 3
|
| 22 |
+
"7": []
|
| 23 |
+
- "1": train/ce_loss_at_4
|
| 24 |
+
"5": 2
|
| 25 |
+
"6":
|
| 26 |
+
- 1
|
| 27 |
+
- 3
|
| 28 |
+
"7": []
|
| 29 |
+
- "1": train/ce_loss_at_5
|
| 30 |
+
"5": 2
|
| 31 |
+
"6":
|
| 32 |
+
- 1
|
| 33 |
+
- 3
|
| 34 |
+
"7": []
|
| 35 |
+
- "1": train/ce_loss_at_6
|
| 36 |
+
"5": 2
|
| 37 |
+
"6":
|
| 38 |
+
- 1
|
| 39 |
+
- 3
|
| 40 |
+
"7": []
|
| 41 |
+
- "1": train/loss
|
| 42 |
+
"5": 2
|
| 43 |
+
"6":
|
| 44 |
+
- 1
|
| 45 |
+
- 3
|
| 46 |
+
"7": []
|
| 47 |
+
- "1": train/ce_loss_at_1
|
| 48 |
+
"5": 2
|
| 49 |
+
"6":
|
| 50 |
+
- 1
|
| 51 |
+
- 3
|
| 52 |
+
"7": []
|
| 53 |
+
- "1": train/ce_loss_at_2
|
| 54 |
+
"5": 2
|
| 55 |
+
"6":
|
| 56 |
+
- 1
|
| 57 |
+
- 3
|
| 58 |
+
"7": []
|
| 59 |
+
- "1": train/ce_loss_at_3
|
| 60 |
+
"5": 2
|
| 61 |
+
"6":
|
| 62 |
+
- 1
|
| 63 |
+
- 3
|
| 64 |
+
"7": []
|
| 65 |
+
python_version: 3.10.16
|
| 66 |
+
t:
|
| 67 |
+
"1":
|
| 68 |
+
- 1
|
| 69 |
+
- 11
|
| 70 |
+
- 41
|
| 71 |
+
- 49
|
| 72 |
+
- 50
|
| 73 |
+
- 51
|
| 74 |
+
- 55
|
| 75 |
+
- 71
|
| 76 |
+
- 84
|
| 77 |
+
- 98
|
| 78 |
+
"2":
|
| 79 |
+
- 1
|
| 80 |
+
- 11
|
| 81 |
+
- 41
|
| 82 |
+
- 49
|
| 83 |
+
- 50
|
| 84 |
+
- 51
|
| 85 |
+
- 55
|
| 86 |
+
- 71
|
| 87 |
+
- 84
|
| 88 |
+
- 98
|
| 89 |
+
"3":
|
| 90 |
+
- 7
|
| 91 |
+
- 13
|
| 92 |
+
- 15
|
| 93 |
+
- 16
|
| 94 |
+
- 23
|
| 95 |
+
- 55
|
| 96 |
+
"4": 3.10.16
|
| 97 |
+
"5": 0.19.8
|
| 98 |
+
"6": 4.49.0
|
| 99 |
+
"8":
|
| 100 |
+
- 5
|
| 101 |
+
"12": 0.19.8
|
| 102 |
+
"13": linux-x86_64
|
| 103 |
+
adaptor:
|
| 104 |
+
value:
|
| 105 |
+
hyperparameters: null
|
| 106 |
+
name: none
|
| 107 |
+
circuit:
|
| 108 |
+
value:
|
| 109 |
+
model:
|
| 110 |
+
_target_: mtp.models.circuits.CircuitModel
|
| 111 |
+
kind: btree
|
| 112 |
+
n_component: 32
|
| 113 |
+
n_repetition: 1
|
| 114 |
+
n_token: 8
|
| 115 |
+
vocab_size: 268
|
| 116 |
+
n_component: 32
|
| 117 |
+
n_repetition: 1
|
| 118 |
+
n_token: 8
|
| 119 |
+
name: btree
|
| 120 |
+
compile:
|
| 121 |
+
value: true
|
| 122 |
+
data:
|
| 123 |
+
value:
|
| 124 |
+
name: tulu3-llama3
|
| 125 |
+
train_bin: agrv/tulu-v3-sft-llama3-packed-seq-len-8192
|
| 126 |
+
val_bin: null
|
| 127 |
+
vocab_size: 268
|
| 128 |
+
device:
|
| 129 |
+
value: cuda
|
| 130 |
+
from_checkpoint:
|
| 131 |
+
value: null
|
| 132 |
+
generate:
|
| 133 |
+
value:
|
| 134 |
+
speculative: false
|
| 135 |
+
lm:
|
| 136 |
+
value:
|
| 137 |
+
model:
|
| 138 |
+
_target_: mtp.models.lm.LM
|
| 139 |
+
adaptor_kwargs: null
|
| 140 |
+
encoder_only: true
|
| 141 |
+
freeze: true
|
| 142 |
+
from_checkpoint: null
|
| 143 |
+
from_huggingface: benjamin/Llama3-2-3B-IT-Byte
|
| 144 |
+
lm: null
|
| 145 |
+
ref_enc: model
|
| 146 |
+
ref_head: lm_head
|
| 147 |
+
n_embd: 3072
|
| 148 |
+
n_head: 24
|
| 149 |
+
name: llama3-2-3b-byte
|
| 150 |
+
load_mtp_head_from_model:
|
| 151 |
+
value: null
|
| 152 |
+
model:
|
| 153 |
+
value:
|
| 154 |
+
beta: 0
|
| 155 |
+
gamma: 1
|
| 156 |
+
kl_algorithm: full
|
| 157 |
+
kl_type: forward
|
| 158 |
+
model:
|
| 159 |
+
_target_: mtp.models.mtp.MultiTokenLM
|
| 160 |
+
beta: 0
|
| 161 |
+
circuit: ${circuit.model}
|
| 162 |
+
gamma: 0.9
|
| 163 |
+
init_from_lm_head: true
|
| 164 |
+
kl_algorithm: ${model.kl_algorithm}
|
| 165 |
+
kl_type: ${model.kl_type}
|
| 166 |
+
lm: ${lm.model}
|
| 167 |
+
mt_head_kwargs: ${mt_head.hyperparameters}
|
| 168 |
+
name: mtp
|
| 169 |
+
mt_head:
|
| 170 |
+
value:
|
| 171 |
+
hyperparameters:
|
| 172 |
+
contextual_hmm_weights: true
|
| 173 |
+
expander_n_layer: 1
|
| 174 |
+
expander_type: linear
|
| 175 |
+
freeze_vocab_unembedding: false
|
| 176 |
+
init_hmm_identity: true
|
| 177 |
+
n_embd: 3072
|
| 178 |
+
share_sum_weights: false
|
| 179 |
+
transformer_n_head: 24
|
| 180 |
+
transformer_n_layer: 0
|
| 181 |
+
type: evabyte
|
| 182 |
+
name: linear-evabyte
|
| 183 |
+
name:
|
| 184 |
+
value: nanogpt
|
| 185 |
+
training:
|
| 186 |
+
value:
|
| 187 |
+
batch_size: 256
|
| 188 |
+
device_batch_size: 1
|
| 189 |
+
expname: llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1
|
| 190 |
+
learning_rate: 0.0003
|
| 191 |
+
num_iterations: 900
|
| 192 |
+
random_seed: 13
|
| 193 |
+
save_model: true
|
| 194 |
+
save_model_every: 100
|
| 195 |
+
save_optimizer: true
|
| 196 |
+
sequence_length: 8192
|
| 197 |
+
use_scheduler: false
|
| 198 |
+
val_loss_every: 100
|
| 199 |
+
val_tokens: 4194304
|
wandb/run-20251128_151948-j8dmy8fe/files/output.log
ADDED
|
@@ -0,0 +1,951 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2025-11-28 15:19:49,942] - Saving config and checkpoints to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34...
|
| 2 |
+
[2025-11-28 15:19:49,942] - Save model: True...
|
| 3 |
+
[2025-11-28 15:19:49,943] - Save optimizer: True...
|
| 4 |
+
[2025-11-28 15:19:49,950] - Training on agrv/tulu-v3-sft-llama3-packed-seq-len-8192...
|
| 5 |
+
Generating train split: 100%|██████████| 237482/237482 [00:08<00:00, 28424.55 examples/s]
|
| 6 |
+
Setting num_proc from 20 back to 1 for the valid split to disable multiprocessing as it only contains one shard.
|
| 7 |
+
[2025-11-28 15:20:29,738] - Setting num_proc from 20 back to 1 for the valid split to disable multiprocessing as it only contains one shard.
|
| 8 |
+
Generating valid split: 100%|██████████| 2399/2399 [00:00<00:00, 5296.71 examples/s]
|
| 9 |
+
[2025-11-28 15:20:31,881] - step:0/900 Saving model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@0.pt...
|
| 10 |
+
/home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:90: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
|
| 11 |
+
return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
|
| 12 |
+
[rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] Graph break from `Tensor.item()`, consider setting:
|
| 13 |
+
[rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] torch._dynamo.config.capture_scalar_outputs = True
|
| 14 |
+
[rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] or:
|
| 15 |
+
[rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
|
| 16 |
+
[rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] to include these operations in the captured graph.
|
| 17 |
+
[rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0]
|
| 18 |
+
[rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] Graph break: from user code at:
|
| 19 |
+
[rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] File "/disk/scratch/agrivas/nanoGPT/mtp/models/mtp.py", line 212, in torch_dynamo_resume_in_forward_at_204
|
| 20 |
+
[rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] xxd = self.lm.encoder(
|
| 21 |
+
[rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] File "/disk/scratch/agrivas/nanoGPT/data/modules/transformers_modules/benjamin/Llama3-2-3B-IT-Byte/19d951e04213250d844131bce370ae9c752eb7e9/modelling_tpu_llama.py", line 939, in forward
|
| 22 |
+
[rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] expanded_input_ids = torch_expand_input_ids(
|
| 23 |
+
[rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] File "/disk/scratch/agrivas/nanoGPT/data/modules/transformers_modules/benjamin/Llama3-2-3B-IT-Byte/19d951e04213250d844131bce370ae9c752eb7e9/modelling_tpu_llama.py", line 71, in torch_expand_input_ids
|
| 24 |
+
[rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0] last_maxlen_ids.insert(0, int(input_ids[example_idx][i] + 1))
|
| 25 |
+
[rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0]
|
| 26 |
+
[rank0]:W1128 15:20:33.883000 3738330 torch/_dynamo/variables/tensor.py:869] [1/0]
|
| 27 |
+
/home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/torch/_inductor/compile_fx.py:194: UserWarning: TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance.
|
| 28 |
+
warnings.warn(
|
| 29 |
+
[rank0]:W1128 15:21:06.801000 3738330 torch/_dynamo/exc.py:304] [11/0] Backend compiler failed with a fake tensor exception at
|
| 30 |
+
[rank0]:W1128 15:21:06.801000 3738330 torch/_dynamo/exc.py:304] [11/0] File "/disk/scratch/agrivas/nanoGPT/mtp/utils/packing.py", line 37, in torch_dynamo_resume_in_packed_targets_to_target_windows_at_32
|
| 31 |
+
[rank0]:W1128 15:21:06.801000 3738330 torch/_dynamo/exc.py:304] [11/0] return torch.concat(parts, dim=0).reshape(B, S, n)
|
| 32 |
+
[rank0]:W1128 15:21:06.801000 3738330 torch/_dynamo/exc.py:304] [11/0] Adding a graph break.
|
| 33 |
+
[rank0]:W1128 15:21:06.894000 3738330 torch/_dynamo/exc.py:304] [11/0_1] Backend compiler failed with a fake tensor exception at
|
| 34 |
+
[rank0]:W1128 15:21:06.894000 3738330 torch/_dynamo/exc.py:304] [11/0_1] File "/disk/scratch/agrivas/nanoGPT/mtp/utils/packing.py", line 37, in torch_dynamo_resume_in_packed_targets_to_target_windows_at_32
|
| 35 |
+
[rank0]:W1128 15:21:06.894000 3738330 torch/_dynamo/exc.py:304] [11/0_1] return torch.concat(parts, dim=0).reshape(B, S, n)
|
| 36 |
+
[rank0]:W1128 15:21:06.894000 3738330 torch/_dynamo/exc.py:304] [11/0_1] Adding a graph break.
|
| 37 |
+
/home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:90: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
|
| 38 |
+
return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
|
| 39 |
+
/home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:90: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
|
| 40 |
+
return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
|
| 41 |
+
[2025-11-28 15:24:47,442] - step:1/900 train_loss:4.6532 lr:0.0003000000 time/step:254.93s
|
| 42 |
+
[2025-11-28 15:27:51,454] - step:2/900 train_loss:4.2966 lr:0.0003000000 time/step:184.01s
|
| 43 |
+
/home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:90: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
|
| 44 |
+
return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
|
| 45 |
+
[2025-11-28 15:30:57,835] - step:3/900 train_loss:3.9828 lr:0.0003000000 time/step:186.37s
|
| 46 |
+
[2025-11-28 15:34:08,017] - step:4/900 train_loss:3.6910 lr:0.0003000000 time/step:190.16s
|
| 47 |
+
[2025-11-28 15:37:15,451] - step:5/900 train_loss:3.4752 lr:0.0003000000 time/step:187.40s
|
| 48 |
+
[2025-11-28 15:40:23,150] - step:6/900 train_loss:3.3047 lr:0.0003000000 time/step:187.69s
|
| 49 |
+
[2025-11-28 15:43:30,606] - step:7/900 train_loss:3.1140 lr:0.0003000000 time/step:187.45s
|
| 50 |
+
[2025-11-28 15:46:34,854] - step:8/900 train_loss:2.9731 lr:0.0003000000 time/step:184.24s
|
| 51 |
+
[2025-11-28 15:49:37,899] - step:9/900 train_loss:2.8709 lr:0.0003000000 time/step:183.04s
|
| 52 |
+
[2025-11-28 15:52:42,161] - step:10/900 train_loss:2.7582 lr:0.0003000000 time/step:184.25s
|
| 53 |
+
[2025-11-28 15:55:49,659] - step:11/900 train_loss:2.6474 lr:0.0003000000 time/step:187.49s
|
| 54 |
+
[2025-11-28 15:58:56,730] - step:12/900 train_loss:2.5890 lr:0.0003000000 time/step:187.06s
|
| 55 |
+
[2025-11-28 16:02:04,893] - step:13/900 train_loss:2.5418 lr:0.0003000000 time/step:188.16s
|
| 56 |
+
[2025-11-28 16:05:11,642] - step:14/900 train_loss:2.4586 lr:0.0003000000 time/step:186.74s
|
| 57 |
+
[2025-11-28 16:08:18,599] - step:15/900 train_loss:2.3908 lr:0.0003000000 time/step:186.94s
|
| 58 |
+
[2025-11-28 16:11:24,210] - step:16/900 train_loss:2.3323 lr:0.0003000000 time/step:185.60s
|
| 59 |
+
[2025-11-28 16:14:28,421] - step:17/900 train_loss:2.2802 lr:0.0003000000 time/step:184.20s
|
| 60 |
+
[2025-11-28 16:17:31,315] - step:18/900 train_loss:2.2268 lr:0.0003000000 time/step:182.88s
|
| 61 |
+
[2025-11-28 16:20:33,212] - step:19/900 train_loss:2.2212 lr:0.0003000000 time/step:181.88s
|
| 62 |
+
[2025-11-28 16:23:39,339] - step:20/900 train_loss:2.1965 lr:0.0003000000 time/step:186.12s
|
| 63 |
+
[2025-11-28 16:26:46,366] - step:21/900 train_loss:2.1549 lr:0.0003000000 time/step:187.01s
|
| 64 |
+
[2025-11-28 16:29:53,452] - step:22/900 train_loss:2.0844 lr:0.0003000000 time/step:187.07s
|
| 65 |
+
[2025-11-28 16:33:01,636] - step:23/900 train_loss:2.0673 lr:0.0003000000 time/step:188.18s
|
| 66 |
+
[2025-11-28 16:36:09,091] - step:24/900 train_loss:2.0375 lr:0.0003000000 time/step:187.44s
|
| 67 |
+
[2025-11-28 16:39:16,657] - step:25/900 train_loss:2.0299 lr:0.0003000000 time/step:187.55s
|
| 68 |
+
[2025-11-28 16:42:25,561] - step:26/900 train_loss:1.9910 lr:0.0003000000 time/step:188.90s
|
| 69 |
+
[2025-11-28 16:45:33,283] - step:27/900 train_loss:1.9708 lr:0.0003000000 time/step:187.68s
|
| 70 |
+
[2025-11-28 16:48:40,284] - step:28/900 train_loss:1.9105 lr:0.0003000000 time/step:186.98s
|
| 71 |
+
[2025-11-28 16:51:47,205] - step:29/900 train_loss:1.9014 lr:0.0003000000 time/step:186.92s
|
| 72 |
+
[2025-11-28 16:54:56,592] - step:30/900 train_loss:1.8643 lr:0.0003000000 time/step:189.38s
|
| 73 |
+
[2025-11-28 16:58:04,452] - step:31/900 train_loss:1.8593 lr:0.0003000000 time/step:187.84s
|
| 74 |
+
[2025-11-28 17:01:11,681] - step:32/900 train_loss:1.8733 lr:0.0003000000 time/step:187.21s
|
| 75 |
+
[2025-11-28 17:04:19,862] - step:33/900 train_loss:1.7975 lr:0.0003000000 time/step:188.17s
|
| 76 |
+
[2025-11-28 17:07:27,610] - step:34/900 train_loss:1.8307 lr:0.0003000000 time/step:187.74s
|
| 77 |
+
[2025-11-28 17:10:35,249] - step:35/900 train_loss:1.8018 lr:0.0003000000 time/step:187.63s
|
| 78 |
+
[2025-11-28 17:13:46,950] - step:36/900 train_loss:1.8066 lr:0.0003000000 time/step:191.69s
|
| 79 |
+
[2025-11-28 17:16:53,853] - step:37/900 train_loss:1.7636 lr:0.0003000000 time/step:186.82s
|
| 80 |
+
[2025-11-28 17:20:00,571] - step:38/900 train_loss:1.7714 lr:0.0003000000 time/step:186.54s
|
| 81 |
+
[2025-11-28 17:23:13,248] - step:39/900 train_loss:1.7096 lr:0.0003000000 time/step:192.65s
|
| 82 |
+
[2025-11-28 17:26:19,575] - step:40/900 train_loss:1.7411 lr:0.0003000000 time/step:186.29s
|
| 83 |
+
[2025-11-28 17:29:25,904] - step:41/900 train_loss:1.6913 lr:0.0003000000 time/step:186.27s
|
| 84 |
+
[2025-11-28 17:32:44,977] - step:42/900 train_loss:1.7001 lr:0.0003000000 time/step:199.05s
|
| 85 |
+
[2025-11-28 17:35:51,243] - step:43/900 train_loss:1.6629 lr:0.0003000000 time/step:186.21s
|
| 86 |
+
[2025-11-28 17:38:57,339] - step:44/900 train_loss:1.6610 lr:0.0003000000 time/step:185.79s
|
| 87 |
+
[2025-11-28 17:42:05,062] - step:45/900 train_loss:1.6524 lr:0.0003000000 time/step:187.68s
|
| 88 |
+
[2025-11-28 17:45:20,648] - step:46/900 train_loss:1.6555 lr:0.0003000000 time/step:195.50s
|
| 89 |
+
[2025-11-28 17:48:26,366] - step:47/900 train_loss:1.6223 lr:0.0003000000 time/step:185.70s
|
| 90 |
+
[2025-11-28 17:51:34,666] - step:48/900 train_loss:1.6481 lr:0.0003000000 time/step:188.12s
|
| 91 |
+
[2025-11-28 17:54:51,245] - step:49/900 train_loss:1.6112 lr:0.0003000000 time/step:196.52s
|
| 92 |
+
[2025-11-28 17:57:57,507] - step:50/900 train_loss:1.6013 lr:0.0003000000 time/step:186.19s
|
| 93 |
+
[2025-11-28 18:01:05,674] - step:51/900 train_loss:1.5772 lr:0.0003000000 time/step:187.99s
|
| 94 |
+
[2025-11-28 18:04:21,278] - step:52/900 train_loss:1.5660 lr:0.0003000000 time/step:195.58s
|
| 95 |
+
[2025-11-28 18:07:27,447] - step:53/900 train_loss:1.5702 lr:0.0003000000 time/step:186.11s
|
| 96 |
+
[2025-11-28 18:10:33,793] - step:54/900 train_loss:1.5665 lr:0.0003000000 time/step:186.26s
|
| 97 |
+
[2025-11-28 18:13:53,962] - step:55/900 train_loss:1.5804 lr:0.0003000000 time/step:200.15s
|
| 98 |
+
[2025-11-28 18:17:00,673] - step:56/900 train_loss:1.5645 lr:0.0003000000 time/step:186.66s
|
| 99 |
+
[2025-11-28 18:20:06,961] - step:57/900 train_loss:1.5609 lr:0.0003000000 time/step:186.23s
|
| 100 |
+
[2025-11-28 18:23:24,919] - step:58/900 train_loss:1.5356 lr:0.0003000000 time/step:197.90s
|
| 101 |
+
[2025-11-28 18:26:31,137] - step:59/900 train_loss:1.5277 lr:0.0003000000 time/step:186.18s
|
| 102 |
+
[2025-11-28 18:29:37,442] - step:60/900 train_loss:1.5330 lr:0.0003000000 time/step:186.22s
|
| 103 |
+
[2025-11-28 18:32:45,572] - step:61/900 train_loss:1.5127 lr:0.0003000000 time/step:188.07s
|
| 104 |
+
[2025-11-28 18:36:01,349] - step:62/900 train_loss:1.5127 lr:0.0003000000 time/step:195.75s
|
| 105 |
+
[2025-11-28 18:39:08,044] - step:63/900 train_loss:1.5255 lr:0.0003000000 time/step:186.63s
|
| 106 |
+
[2025-11-28 18:42:16,514] - step:64/900 train_loss:1.4881 lr:0.0003000000 time/step:188.39s
|
| 107 |
+
[2025-11-28 18:45:32,575] - step:65/900 train_loss:1.4746 lr:0.0003000000 time/step:196.00s
|
| 108 |
+
[2025-11-28 18:48:39,543] - step:66/900 train_loss:1.5017 lr:0.0003000000 time/step:186.89s
|
| 109 |
+
[2025-11-28 18:51:47,768] - step:67/900 train_loss:1.4805 lr:0.0003000000 time/step:188.07s
|
| 110 |
+
[2025-11-28 18:55:03,564] - step:68/900 train_loss:1.4929 lr:0.0003000000 time/step:195.75s
|
| 111 |
+
[2025-11-28 18:58:10,293] - step:69/900 train_loss:1.4550 lr:0.0003000000 time/step:186.67s
|
| 112 |
+
[2025-11-28 19:01:16,800] - step:70/900 train_loss:1.4532 lr:0.0003000000 time/step:186.44s
|
| 113 |
+
[2025-11-28 19:04:26,999] - step:71/900 train_loss:1.4520 lr:0.0003000000 time/step:190.18s
|
| 114 |
+
[2025-11-28 19:07:33,259] - step:72/900 train_loss:1.4301 lr:0.0003000000 time/step:186.22s
|
| 115 |
+
[2025-11-28 19:10:39,471] - step:73/900 train_loss:1.4337 lr:0.0003000000 time/step:186.20s
|
| 116 |
+
[2025-11-28 19:13:47,822] - step:74/900 train_loss:1.4296 lr:0.0003000000 time/step:188.33s
|
| 117 |
+
[2025-11-28 19:16:53,884] - step:75/900 train_loss:1.4294 lr:0.0003000000 time/step:186.04s
|
| 118 |
+
[2025-11-28 19:19:59,845] - step:76/900 train_loss:1.4367 lr:0.0003000000 time/step:185.94s
|
| 119 |
+
[2025-11-28 19:23:05,617] - step:77/900 train_loss:1.4359 lr:0.0003000000 time/step:185.76s
|
| 120 |
+
[2025-11-28 19:26:13,471] - step:78/900 train_loss:1.3907 lr:0.0003000000 time/step:187.84s
|
| 121 |
+
[2025-11-28 19:29:19,325] - step:79/900 train_loss:1.4074 lr:0.0003000000 time/step:185.83s
|
| 122 |
+
[2025-11-28 19:32:24,915] - step:80/900 train_loss:1.3818 lr:0.0003000000 time/step:185.57s
|
| 123 |
+
[2025-11-28 19:35:32,821] - step:81/900 train_loss:1.3966 lr:0.0003000000 time/step:187.89s
|
| 124 |
+
[2025-11-28 19:38:38,468] - step:82/900 train_loss:1.3767 lr:0.0003000000 time/step:185.62s
|
| 125 |
+
[2025-11-28 19:41:44,296] - step:83/900 train_loss:1.3772 lr:0.0003000000 time/step:185.82s
|
| 126 |
+
[2025-11-28 19:44:52,361] - step:84/900 train_loss:1.3639 lr:0.0003000000 time/step:188.06s
|
| 127 |
+
[2025-11-28 19:47:59,370] - step:85/900 train_loss:1.3910 lr:0.0003000000 time/step:186.99s
|
| 128 |
+
[2025-11-28 19:51:05,447] - step:86/900 train_loss:1.4013 lr:0.0003000000 time/step:186.07s
|
| 129 |
+
[2025-11-28 19:54:13,032] - step:87/900 train_loss:1.3883 lr:0.0003000000 time/step:187.58s
|
| 130 |
+
[2025-11-28 19:57:19,138] - step:88/900 train_loss:1.3712 lr:0.0003000000 time/step:186.09s
|
| 131 |
+
[2025-11-28 20:00:25,142] - step:89/900 train_loss:1.3749 lr:0.0003000000 time/step:185.98s
|
| 132 |
+
[2025-11-28 20:03:30,825] - step:90/900 train_loss:1.3630 lr:0.0003000000 time/step:185.67s
|
| 133 |
+
[2025-11-28 20:06:38,585] - step:91/900 train_loss:1.3713 lr:0.0003000000 time/step:187.75s
|
| 134 |
+
[2025-11-28 20:09:44,867] - step:92/900 train_loss:1.3503 lr:0.0003000000 time/step:186.27s
|
| 135 |
+
[2025-11-28 20:12:50,830] - step:93/900 train_loss:1.3537 lr:0.0003000000 time/step:185.94s
|
| 136 |
+
[2025-11-28 20:15:58,624] - step:94/900 train_loss:1.3468 lr:0.0003000000 time/step:187.79s
|
| 137 |
+
[2025-11-28 20:19:04,543] - step:95/900 train_loss:1.3603 lr:0.0003000000 time/step:185.91s
|
| 138 |
+
[2025-11-28 20:22:10,848] - step:96/900 train_loss:1.3216 lr:0.0003000000 time/step:186.29s
|
| 139 |
+
[2025-11-28 20:25:17,756] - step:97/900 train_loss:1.3276 lr:0.0003000000 time/step:186.90s
|
| 140 |
+
[2025-11-28 20:28:22,895] - step:98/900 train_loss:1.3128 lr:0.0003000000 time/step:185.09s
|
| 141 |
+
[2025-11-28 20:31:28,093] - step:99/900 train_loss:1.3014 lr:0.0003000000 time/step:185.13s
|
| 142 |
+
[2025-11-28 20:34:37,788] - step:100/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@100.pt...
|
| 143 |
+
[2025-11-28 20:34:37,813] - step:100/900 train_loss:1.3411 lr:0.0003000000 time/step:187.79s
|
| 144 |
+
[2025-11-28 20:37:43,371] - step:101/900 train_loss:1.3414 lr:0.0003000000 time/step:185.55s
|
| 145 |
+
[2025-11-28 20:40:49,246] - step:102/900 train_loss:1.3098 lr:0.0003000000 time/step:185.84s
|
| 146 |
+
[2025-11-28 20:43:55,147] - step:103/900 train_loss:1.3077 lr:0.0003000000 time/step:185.90s
|
| 147 |
+
[2025-11-28 20:47:03,589] - step:104/900 train_loss:1.3283 lr:0.0003000000 time/step:188.43s
|
| 148 |
+
[2025-11-28 20:50:09,456] - step:105/900 train_loss:1.3107 lr:0.0003000000 time/step:185.85s
|
| 149 |
+
[2025-11-28 20:53:15,133] - step:106/900 train_loss:1.3116 lr:0.0003000000 time/step:185.65s
|
| 150 |
+
[2025-11-28 20:56:23,079] - step:107/900 train_loss:1.3076 lr:0.0003000000 time/step:187.94s
|
| 151 |
+
[2025-11-28 20:59:29,309] - step:108/900 train_loss:1.2576 lr:0.0003000000 time/step:186.19s
|
| 152 |
+
[2025-11-28 21:02:34,096] - step:109/900 train_loss:1.3163 lr:0.0003000000 time/step:184.77s
|
| 153 |
+
[2025-11-28 21:05:38,534] - step:110/900 train_loss:1.2836 lr:0.0003000000 time/step:184.43s
|
| 154 |
+
[2025-11-28 21:08:41,909] - step:111/900 train_loss:1.2887 lr:0.0003000000 time/step:183.34s
|
| 155 |
+
[2025-11-28 21:11:44,652] - step:112/900 train_loss:1.2900 lr:0.0003000000 time/step:182.72s
|
| 156 |
+
[2025-11-28 21:14:49,050] - step:113/900 train_loss:1.3032 lr:0.0003000000 time/step:184.39s
|
| 157 |
+
[2025-11-28 21:17:51,714] - step:114/900 train_loss:1.2715 lr:0.0003000000 time/step:182.65s
|
| 158 |
+
[2025-11-28 21:20:54,366] - step:115/900 train_loss:1.2553 lr:0.0003000000 time/step:182.64s
|
| 159 |
+
[2025-11-28 21:23:58,585] - step:116/900 train_loss:1.2608 lr:0.0003000000 time/step:184.21s
|
| 160 |
+
[2025-11-28 21:27:05,711] - step:117/900 train_loss:1.2750 lr:0.0003000000 time/step:187.12s
|
| 161 |
+
[2025-11-28 21:30:10,632] - step:118/900 train_loss:1.2610 lr:0.0003000000 time/step:184.91s
|
| 162 |
+
[2025-11-28 21:33:15,980] - step:119/900 train_loss:1.2728 lr:0.0003000000 time/step:185.32s
|
| 163 |
+
[2025-11-28 21:36:22,993] - step:120/900 train_loss:1.2367 lr:0.0003000000 time/step:187.01s
|
| 164 |
+
[2025-11-28 21:39:27,798] - step:121/900 train_loss:1.2436 lr:0.0003000000 time/step:184.79s
|
| 165 |
+
[2025-11-28 21:42:32,716] - step:122/900 train_loss:1.2680 lr:0.0003000000 time/step:184.90s
|
| 166 |
+
[2025-11-28 21:45:39,837] - step:123/900 train_loss:1.2459 lr:0.0003000000 time/step:187.11s
|
| 167 |
+
[2025-11-28 21:48:44,604] - step:124/900 train_loss:1.2356 lr:0.0003000000 time/step:184.76s
|
| 168 |
+
[2025-11-28 21:51:49,462] - step:125/900 train_loss:1.2116 lr:0.0003000000 time/step:184.84s
|
| 169 |
+
[2025-11-28 21:54:56,152] - step:126/900 train_loss:1.2271 lr:0.0003000000 time/step:186.68s
|
| 170 |
+
[2025-11-28 21:58:02,698] - step:127/900 train_loss:1.2747 lr:0.0003000000 time/step:186.53s
|
| 171 |
+
[2025-11-28 22:01:07,919] - step:128/900 train_loss:1.2662 lr:0.0003000000 time/step:185.21s
|
| 172 |
+
[2025-11-28 22:04:13,473] - step:129/900 train_loss:1.2508 lr:0.0003000000 time/step:185.54s
|
| 173 |
+
[2025-11-28 22:07:19,897] - step:130/900 train_loss:1.2417 lr:0.0003000000 time/step:186.41s
|
| 174 |
+
[2025-11-28 22:10:24,163] - step:131/900 train_loss:1.2469 lr:0.0003000000 time/step:184.26s
|
| 175 |
+
[2025-11-28 22:13:29,588] - step:132/900 train_loss:1.2212 lr:0.0003000000 time/step:185.42s
|
| 176 |
+
[2025-11-28 22:16:36,724] - step:133/900 train_loss:1.2154 lr:0.0003000000 time/step:187.11s
|
| 177 |
+
[2025-11-28 22:19:41,361] - step:134/900 train_loss:1.1905 lr:0.0003000000 time/step:184.62s
|
| 178 |
+
[2025-11-28 22:22:46,426] - step:135/900 train_loss:1.2090 lr:0.0003000000 time/step:185.04s
|
| 179 |
+
[2025-11-28 22:25:53,482] - step:136/900 train_loss:1.2180 lr:0.0003000000 time/step:187.04s
|
| 180 |
+
[2025-11-28 22:28:58,396] - step:137/900 train_loss:1.2309 lr:0.0003000000 time/step:184.90s
|
| 181 |
+
[2025-11-28 22:32:02,953] - step:138/900 train_loss:1.2127 lr:0.0003000000 time/step:184.53s
|
| 182 |
+
[2025-11-28 22:35:08,685] - step:139/900 train_loss:1.2126 lr:0.0003000000 time/step:185.71s
|
| 183 |
+
[2025-11-28 22:38:15,825] - step:140/900 train_loss:1.2117 lr:0.0003000000 time/step:187.09s
|
| 184 |
+
[2025-11-28 22:41:20,366] - step:141/900 train_loss:1.2301 lr:0.0003000000 time/step:184.53s
|
| 185 |
+
[2025-11-28 22:44:24,896] - step:142/900 train_loss:1.2388 lr:0.0003000000 time/step:184.52s
|
| 186 |
+
[2025-11-28 22:47:31,624] - step:143/900 train_loss:1.1987 lr:0.0003000000 time/step:186.71s
|
| 187 |
+
[2025-11-28 22:50:37,358] - step:144/900 train_loss:1.2210 lr:0.0003000000 time/step:185.73s
|
| 188 |
+
[2025-11-28 22:53:43,613] - step:145/900 train_loss:1.2170 lr:0.0003000000 time/step:186.22s
|
| 189 |
+
[2025-11-28 22:57:06,629] - step:146/900 train_loss:1.2236 lr:0.0003000000 time/step:203.01s
|
| 190 |
+
[2025-11-28 23:00:09,814] - step:147/900 train_loss:1.2255 lr:0.0003000000 time/step:183.18s
|
| 191 |
+
[2025-11-28 23:03:14,149] - step:148/900 train_loss:1.1806 lr:0.0003000000 time/step:184.31s
|
| 192 |
+
[2025-11-28 23:06:23,397] - step:149/900 train_loss:1.2233 lr:0.0003000000 time/step:189.23s
|
| 193 |
+
[2025-11-28 23:09:30,162] - step:150/900 train_loss:1.1677 lr:0.0003000000 time/step:186.75s
|
| 194 |
+
[2025-11-28 23:12:34,786] - step:151/900 train_loss:1.2155 lr:0.0003000000 time/step:184.59s
|
| 195 |
+
[2025-11-28 23:15:41,431] - step:152/900 train_loss:1.1948 lr:0.0003000000 time/step:186.63s
|
| 196 |
+
[2025-11-28 23:18:47,806] - step:153/900 train_loss:1.1950 lr:0.0003000000 time/step:186.35s
|
| 197 |
+
[2025-11-28 23:21:52,115] - step:154/900 train_loss:1.2133 lr:0.0003000000 time/step:184.28s
|
| 198 |
+
[2025-11-28 23:24:56,981] - step:155/900 train_loss:1.1862 lr:0.0003000000 time/step:184.85s
|
| 199 |
+
[2025-11-28 23:28:03,290] - step:156/900 train_loss:1.1699 lr:0.0003000000 time/step:186.29s
|
| 200 |
+
[2025-11-28 23:31:07,306] - step:157/900 train_loss:1.1773 lr:0.0003000000 time/step:184.00s
|
| 201 |
+
[2025-11-28 23:34:12,414] - step:158/900 train_loss:1.1680 lr:0.0003000000 time/step:185.10s
|
| 202 |
+
[2025-11-28 23:37:19,900] - step:159/900 train_loss:1.1806 lr:0.0003000000 time/step:187.45s
|
| 203 |
+
[2025-11-28 23:40:24,615] - step:160/900 train_loss:1.1865 lr:0.0003000000 time/step:184.70s
|
| 204 |
+
[2025-11-28 23:43:29,245] - step:161/900 train_loss:1.1872 lr:0.0003000000 time/step:184.61s
|
| 205 |
+
[2025-11-28 23:46:36,811] - step:162/900 train_loss:1.1806 lr:0.0003000000 time/step:187.56s
|
| 206 |
+
[2025-11-28 23:49:41,637] - step:163/900 train_loss:1.1750 lr:0.0003000000 time/step:184.79s
|
| 207 |
+
[2025-11-28 23:52:45,829] - step:164/900 train_loss:1.1828 lr:0.0003000000 time/step:184.16s
|
| 208 |
+
[2025-11-28 23:55:50,721] - step:165/900 train_loss:1.1742 lr:0.0003000000 time/step:184.88s
|
| 209 |
+
[2025-11-28 23:58:57,667] - step:166/900 train_loss:1.1655 lr:0.0003000000 time/step:186.93s
|
| 210 |
+
[2025-11-29 00:02:02,656] - step:167/900 train_loss:1.1631 lr:0.0003000000 time/step:184.97s
|
| 211 |
+
[2025-11-29 00:05:08,306] - step:168/900 train_loss:1.1614 lr:0.0003000000 time/step:185.63s
|
| 212 |
+
[2025-11-29 00:08:15,208] - step:169/900 train_loss:1.1613 lr:0.0003000000 time/step:186.89s
|
| 213 |
+
[2025-11-29 00:11:19,829] - step:170/900 train_loss:1.1623 lr:0.0003000000 time/step:184.60s
|
| 214 |
+
[2025-11-29 00:14:25,137] - step:171/900 train_loss:1.1538 lr:0.0003000000 time/step:185.30s
|
| 215 |
+
[2025-11-29 00:17:32,364] - step:172/900 train_loss:1.1782 lr:0.0003000000 time/step:187.22s
|
| 216 |
+
[2025-11-29 00:20:37,216] - step:173/900 train_loss:1.1596 lr:0.0003000000 time/step:184.84s
|
| 217 |
+
[2025-11-29 00:23:42,361] - step:174/900 train_loss:1.1381 lr:0.0003000000 time/step:185.12s
|
| 218 |
+
[2025-11-29 00:26:49,327] - step:175/900 train_loss:1.1305 lr:0.0003000000 time/step:186.96s
|
| 219 |
+
[2025-11-29 00:29:54,460] - step:176/900 train_loss:1.1603 lr:0.0003000000 time/step:185.12s
|
| 220 |
+
[2025-11-29 00:32:59,491] - step:177/900 train_loss:1.1435 lr:0.0003000000 time/step:185.01s
|
| 221 |
+
[2025-11-29 00:36:04,756] - step:178/900 train_loss:1.1653 lr:0.0003000000 time/step:185.25s
|
| 222 |
+
[2025-11-29 00:39:11,804] - step:179/900 train_loss:1.1443 lr:0.0003000000 time/step:187.04s
|
| 223 |
+
[2025-11-29 00:42:16,834] - step:180/900 train_loss:1.1554 lr:0.0003000000 time/step:185.01s
|
| 224 |
+
[2025-11-29 00:45:22,795] - step:181/900 train_loss:1.1495 lr:0.0003000000 time/step:185.95s
|
| 225 |
+
[2025-11-29 00:48:30,739] - step:182/900 train_loss:1.1251 lr:0.0003000000 time/step:187.94s
|
| 226 |
+
[2025-11-29 00:51:34,795] - step:183/900 train_loss:1.1323 lr:0.0003000000 time/step:184.04s
|
| 227 |
+
[2025-11-29 00:54:39,599] - step:184/900 train_loss:1.1293 lr:0.0003000000 time/step:184.80s
|
| 228 |
+
[2025-11-29 00:57:45,600] - step:185/900 train_loss:1.1500 lr:0.0003000000 time/step:185.99s
|
| 229 |
+
[2025-11-29 01:00:49,413] - step:186/900 train_loss:1.1429 lr:0.0003000000 time/step:183.79s
|
| 230 |
+
[2025-11-29 01:03:54,362] - step:187/900 train_loss:1.1384 lr:0.0003000000 time/step:184.93s
|
| 231 |
+
[2025-11-29 01:07:01,673] - step:188/900 train_loss:1.1665 lr:0.0003000000 time/step:187.31s
|
| 232 |
+
[2025-11-29 01:10:06,793] - step:189/900 train_loss:1.1470 lr:0.0003000000 time/step:185.10s
|
| 233 |
+
[2025-11-29 01:13:11,822] - step:190/900 train_loss:1.1562 lr:0.0003000000 time/step:185.00s
|
| 234 |
+
[2025-11-29 01:16:16,209] - step:191/900 train_loss:1.1811 lr:0.0003000000 time/step:184.37s
|
| 235 |
+
[2025-11-29 01:19:22,340] - step:192/900 train_loss:1.1471 lr:0.0003000000 time/step:186.13s
|
| 236 |
+
[2025-11-29 01:22:26,519] - step:193/900 train_loss:1.1428 lr:0.0003000000 time/step:184.15s
|
| 237 |
+
[2025-11-29 01:25:31,429] - step:194/900 train_loss:1.1208 lr:0.0003000000 time/step:184.89s
|
| 238 |
+
[2025-11-29 01:28:36,974] - step:195/900 train_loss:1.1308 lr:0.0003000000 time/step:185.54s
|
| 239 |
+
[2025-11-29 01:31:40,544] - step:196/900 train_loss:1.1228 lr:0.0003000000 time/step:183.54s
|
| 240 |
+
[2025-11-29 01:34:45,938] - step:197/900 train_loss:1.1161 lr:0.0003000000 time/step:185.38s
|
| 241 |
+
[2025-11-29 01:37:53,156] - step:198/900 train_loss:1.1478 lr:0.0003000000 time/step:187.21s
|
| 242 |
+
[2025-11-29 01:40:58,171] - step:199/900 train_loss:1.1103 lr:0.0003000000 time/step:184.99s
|
| 243 |
+
[2025-11-29 01:44:05,489] - step:200/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@200.pt...
|
| 244 |
+
[2025-11-29 01:44:05,525] - step:200/900 train_loss:1.1274 lr:0.0003000000 time/step:185.55s
|
| 245 |
+
[2025-11-29 01:47:14,488] - step:201/900 train_loss:1.1234 lr:0.0003000000 time/step:188.94s
|
| 246 |
+
[2025-11-29 01:50:21,411] - step:202/900 train_loss:1.1199 lr:0.0003000000 time/step:186.91s
|
| 247 |
+
[2025-11-29 01:53:26,628] - step:203/900 train_loss:1.0972 lr:0.0003000000 time/step:185.20s
|
| 248 |
+
[2025-11-29 01:56:32,570] - step:204/900 train_loss:1.1371 lr:0.0003000000 time/step:185.92s
|
| 249 |
+
[2025-11-29 01:59:41,140] - step:205/900 train_loss:1.1408 lr:0.0003000000 time/step:188.56s
|
| 250 |
+
[2025-11-29 02:02:45,379] - step:206/900 train_loss:1.0997 lr:0.0003000000 time/step:184.22s
|
| 251 |
+
[2025-11-29 02:05:50,066] - step:207/900 train_loss:1.1332 lr:0.0003000000 time/step:184.67s
|
| 252 |
+
[2025-11-29 02:08:56,311] - step:208/900 train_loss:1.1209 lr:0.0003000000 time/step:186.24s
|
| 253 |
+
[2025-11-29 02:12:00,567] - step:209/900 train_loss:1.0919 lr:0.0003000000 time/step:184.22s
|
| 254 |
+
[2025-11-29 02:15:04,792] - step:210/900 train_loss:1.1005 lr:0.0003000000 time/step:184.22s
|
| 255 |
+
[2025-11-29 02:18:10,654] - step:211/900 train_loss:1.1036 lr:0.0003000000 time/step:185.86s
|
| 256 |
+
[2025-11-29 02:21:14,585] - step:212/900 train_loss:1.1229 lr:0.0003000000 time/step:183.92s
|
| 257 |
+
[2025-11-29 02:24:19,368] - step:213/900 train_loss:1.1051 lr:0.0003000000 time/step:184.77s
|
| 258 |
+
[2025-11-29 02:27:26,145] - step:214/900 train_loss:1.1085 lr:0.0003000000 time/step:186.77s
|
| 259 |
+
[2025-11-29 02:30:30,712] - step:215/900 train_loss:1.0930 lr:0.0003000000 time/step:184.56s
|
| 260 |
+
[2025-11-29 02:33:34,774] - step:216/900 train_loss:1.0977 lr:0.0003000000 time/step:184.05s
|
| 261 |
+
[2025-11-29 02:36:40,292] - step:217/900 train_loss:1.1187 lr:0.0003000000 time/step:185.51s
|
| 262 |
+
[2025-11-29 02:39:49,043] - step:218/900 train_loss:1.0909 lr:0.0003000000 time/step:188.73s
|
| 263 |
+
[2025-11-29 02:42:54,991] - step:219/900 train_loss:1.1056 lr:0.0003000000 time/step:185.90s
|
| 264 |
+
[2025-11-29 02:46:00,394] - step:220/900 train_loss:1.1048 lr:0.0003000000 time/step:185.40s
|
| 265 |
+
[2025-11-29 02:49:07,579] - step:221/900 train_loss:1.1078 lr:0.0003000000 time/step:187.17s
|
| 266 |
+
[2025-11-29 02:52:12,146] - step:222/900 train_loss:1.1114 lr:0.0003000000 time/step:184.54s
|
| 267 |
+
[2025-11-29 02:55:16,480] - step:223/900 train_loss:1.1062 lr:0.0003000000 time/step:184.32s
|
| 268 |
+
[2025-11-29 02:58:22,768] - step:224/900 train_loss:1.1142 lr:0.0003000000 time/step:186.28s
|
| 269 |
+
[2025-11-29 03:01:26,953] - step:225/900 train_loss:1.0961 lr:0.0003000000 time/step:184.17s
|
| 270 |
+
[2025-11-29 03:04:31,749] - step:226/900 train_loss:1.0917 lr:0.0003000000 time/step:184.78s
|
| 271 |
+
[2025-11-29 03:07:38,529] - step:227/900 train_loss:1.0934 lr:0.0003000000 time/step:186.77s
|
| 272 |
+
[2025-11-29 03:10:43,271] - step:228/900 train_loss:1.1069 lr:0.0003000000 time/step:184.70s
|
| 273 |
+
[2025-11-29 03:13:48,167] - step:229/900 train_loss:1.0734 lr:0.0003000000 time/step:184.88s
|
| 274 |
+
[2025-11-29 03:16:52,812] - step:230/900 train_loss:1.0957 lr:0.0003000000 time/step:184.63s
|
| 275 |
+
[2025-11-29 03:19:58,801] - step:231/900 train_loss:1.0775 lr:0.0003000000 time/step:185.98s
|
| 276 |
+
[2025-11-29 03:23:02,987] - step:232/900 train_loss:1.0926 lr:0.0003000000 time/step:184.16s
|
| 277 |
+
[2025-11-29 03:26:08,180] - step:233/900 train_loss:1.1314 lr:0.0003000000 time/step:185.19s
|
| 278 |
+
[2025-11-29 03:29:14,462] - step:234/900 train_loss:1.0868 lr:0.0003000000 time/step:186.28s
|
| 279 |
+
[2025-11-29 03:32:19,081] - step:235/900 train_loss:1.0808 lr:0.0003000000 time/step:184.59s
|
| 280 |
+
[2025-11-29 03:35:24,243] - step:236/900 train_loss:1.0749 lr:0.0003000000 time/step:185.16s
|
| 281 |
+
[2025-11-29 03:38:31,254] - step:237/900 train_loss:1.1269 lr:0.0003000000 time/step:187.01s
|
| 282 |
+
[2025-11-29 03:41:35,966] - step:238/900 train_loss:1.0924 lr:0.0003000000 time/step:184.69s
|
| 283 |
+
[2025-11-29 03:44:41,260] - step:239/900 train_loss:1.0906 lr:0.0003000000 time/step:185.27s
|
| 284 |
+
[2025-11-29 03:47:49,206] - step:240/900 train_loss:1.0918 lr:0.0003000000 time/step:187.94s
|
| 285 |
+
[2025-11-29 03:50:54,694] - step:241/900 train_loss:1.0946 lr:0.0003000000 time/step:185.46s
|
| 286 |
+
[2025-11-29 03:53:59,535] - step:242/900 train_loss:1.1074 lr:0.0003000000 time/step:184.80s
|
| 287 |
+
[2025-11-29 03:57:04,220] - step:243/900 train_loss:1.0943 lr:0.0003000000 time/step:184.67s
|
| 288 |
+
[2025-11-29 04:00:10,432] - step:244/900 train_loss:1.0711 lr:0.0003000000 time/step:186.21s
|
| 289 |
+
[2025-11-29 04:03:15,729] - step:245/900 train_loss:1.1061 lr:0.0003000000 time/step:185.26s
|
| 290 |
+
[2025-11-29 04:06:20,984] - step:246/900 train_loss:1.0789 lr:0.0003000000 time/step:185.24s
|
| 291 |
+
[2025-11-29 04:09:27,749] - step:247/900 train_loss:1.0778 lr:0.0003000000 time/step:186.76s
|
| 292 |
+
[2025-11-29 04:12:33,149] - step:248/900 train_loss:1.0830 lr:0.0003000000 time/step:185.36s
|
| 293 |
+
[2025-11-29 04:15:37,995] - step:249/900 train_loss:1.0921 lr:0.0003000000 time/step:184.84s
|
| 294 |
+
[2025-11-29 04:18:44,391] - step:250/900 train_loss:1.0980 lr:0.0003000000 time/step:186.39s
|
| 295 |
+
[2025-11-29 04:21:49,000] - step:251/900 train_loss:1.0761 lr:0.0003000000 time/step:184.59s
|
| 296 |
+
[2025-11-29 04:24:54,274] - step:252/900 train_loss:1.0901 lr:0.0003000000 time/step:185.25s
|
| 297 |
+
[2025-11-29 04:28:02,058] - step:253/900 train_loss:1.0735 lr:0.0003000000 time/step:187.78s
|
| 298 |
+
[2025-11-29 04:31:09,878] - step:254/900 train_loss:1.0600 lr:0.0003000000 time/step:187.80s
|
| 299 |
+
[2025-11-29 04:34:17,142] - step:255/900 train_loss:1.0544 lr:0.0003000000 time/step:187.23s
|
| 300 |
+
[2025-11-29 04:37:23,181] - step:256/900 train_loss:1.0961 lr:0.0003000000 time/step:186.03s
|
| 301 |
+
[2025-11-29 04:40:31,175] - step:257/900 train_loss:1.0838 lr:0.0003000000 time/step:187.99s
|
| 302 |
+
[2025-11-29 04:43:37,155] - step:258/900 train_loss:1.1142 lr:0.0003000000 time/step:185.74s
|
| 303 |
+
[2025-11-29 04:46:41,531] - step:259/900 train_loss:1.0784 lr:0.0003000000 time/step:184.36s
|
| 304 |
+
[2025-11-29 04:49:47,139] - step:260/900 train_loss:1.0548 lr:0.0003000000 time/step:185.61s
|
| 305 |
+
[2025-11-29 04:52:51,373] - step:261/900 train_loss:1.0670 lr:0.0003000000 time/step:184.18s
|
| 306 |
+
[2025-11-29 04:55:56,540] - step:262/900 train_loss:1.0790 lr:0.0003000000 time/step:185.16s
|
| 307 |
+
[2025-11-29 04:59:03,662] - step:263/900 train_loss:1.0758 lr:0.0003000000 time/step:187.12s
|
| 308 |
+
[2025-11-29 05:02:08,811] - step:264/900 train_loss:1.0945 lr:0.0003000000 time/step:185.14s
|
| 309 |
+
[2025-11-29 05:05:13,852] - step:265/900 train_loss:1.0733 lr:0.0003000000 time/step:185.03s
|
| 310 |
+
[2025-11-29 05:08:20,825] - step:266/900 train_loss:1.0854 lr:0.0003000000 time/step:186.97s
|
| 311 |
+
[2025-11-29 05:11:25,639] - step:267/900 train_loss:1.0816 lr:0.0003000000 time/step:184.80s
|
| 312 |
+
[2025-11-29 05:14:31,022] - step:268/900 train_loss:1.0670 lr:0.0003000000 time/step:185.35s
|
| 313 |
+
[2025-11-29 05:17:35,585] - step:269/900 train_loss:1.0892 lr:0.0003000000 time/step:184.33s
|
| 314 |
+
[2025-11-29 05:20:42,015] - step:270/900 train_loss:1.0245 lr:0.0003000000 time/step:186.43s
|
| 315 |
+
[2025-11-29 05:23:46,422] - step:271/900 train_loss:1.0735 lr:0.0003000000 time/step:184.37s
|
| 316 |
+
[2025-11-29 05:26:50,452] - step:272/900 train_loss:1.0714 lr:0.0003000000 time/step:184.01s
|
| 317 |
+
[2025-11-29 05:29:56,149] - step:273/900 train_loss:1.0769 lr:0.0003000000 time/step:185.68s
|
| 318 |
+
[2025-11-29 05:32:59,582] - step:274/900 train_loss:1.0265 lr:0.0003000000 time/step:183.40s
|
| 319 |
+
[2025-11-29 05:36:04,909] - step:275/900 train_loss:1.0510 lr:0.0003000000 time/step:185.31s
|
| 320 |
+
[2025-11-29 05:39:12,005] - step:276/900 train_loss:1.0753 lr:0.0003000000 time/step:187.07s
|
| 321 |
+
[2025-11-29 05:42:16,993] - step:277/900 train_loss:1.0582 lr:0.0003000000 time/step:184.93s
|
| 322 |
+
[2025-11-29 05:45:22,003] - step:278/900 train_loss:1.0717 lr:0.0003000000 time/step:185.00s
|
| 323 |
+
[2025-11-29 05:48:28,179] - step:279/900 train_loss:1.0676 lr:0.0003000000 time/step:186.16s
|
| 324 |
+
[2025-11-29 05:51:33,621] - step:280/900 train_loss:1.0595 lr:0.0003000000 time/step:185.43s
|
| 325 |
+
[2025-11-29 05:54:38,325] - step:281/900 train_loss:1.0585 lr:0.0003000000 time/step:184.68s
|
| 326 |
+
[2025-11-29 05:57:43,757] - step:282/900 train_loss:1.0949 lr:0.0003000000 time/step:185.43s
|
| 327 |
+
[2025-11-29 06:00:50,769] - step:283/900 train_loss:1.0682 lr:0.0003000000 time/step:187.01s
|
| 328 |
+
[2025-11-29 06:03:55,483] - step:284/900 train_loss:1.0756 lr:0.0003000000 time/step:184.69s
|
| 329 |
+
[2025-11-29 06:07:00,263] - step:285/900 train_loss:1.0693 lr:0.0003000000 time/step:184.77s
|
| 330 |
+
[2025-11-29 06:10:07,073] - step:286/900 train_loss:1.0734 lr:0.0003000000 time/step:186.81s
|
| 331 |
+
[2025-11-29 06:13:12,527] - step:287/900 train_loss:1.0729 lr:0.0003000000 time/step:185.42s
|
| 332 |
+
[2025-11-29 06:16:17,678] - step:288/900 train_loss:1.0483 lr:0.0003000000 time/step:185.12s
|
| 333 |
+
[2025-11-29 06:19:24,289] - step:289/900 train_loss:1.0590 lr:0.0003000000 time/step:186.60s
|
| 334 |
+
[2025-11-29 06:22:30,122] - step:290/900 train_loss:1.0687 lr:0.0003000000 time/step:185.81s
|
| 335 |
+
[2025-11-29 06:25:35,642] - step:291/900 train_loss:1.0612 lr:0.0003000000 time/step:185.50s
|
| 336 |
+
[2025-11-29 06:28:42,491] - step:292/900 train_loss:1.0357 lr:0.0003000000 time/step:186.85s
|
| 337 |
+
[2025-11-29 06:31:49,725] - step:293/900 train_loss:1.0708 lr:0.0003000000 time/step:187.22s
|
| 338 |
+
[2025-11-29 06:34:55,796] - step:294/900 train_loss:1.0707 lr:0.0003000000 time/step:186.05s
|
| 339 |
+
[2025-11-29 06:38:00,778] - step:295/900 train_loss:1.0776 lr:0.0003000000 time/step:184.98s
|
| 340 |
+
[2025-11-29 06:41:07,189] - step:296/900 train_loss:1.0576 lr:0.0003000000 time/step:186.41s
|
| 341 |
+
[2025-11-29 06:44:11,733] - step:297/900 train_loss:1.0260 lr:0.0003000000 time/step:184.49s
|
| 342 |
+
[2025-11-29 06:47:15,871] - step:298/900 train_loss:1.0749 lr:0.0003000000 time/step:184.12s
|
| 343 |
+
[2025-11-29 06:50:21,808] - step:299/900 train_loss:1.0567 lr:0.0003000000 time/step:185.93s
|
| 344 |
+
[2025-11-29 06:53:27,979] - step:300/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@300.pt...
|
| 345 |
+
[2025-11-29 06:53:27,992] - step:300/900 train_loss:1.0667 lr:0.0003000000 time/step:184.41s
|
| 346 |
+
[2025-11-29 06:56:33,223] - step:301/900 train_loss:1.0596 lr:0.0003000000 time/step:185.22s
|
| 347 |
+
[2025-11-29 06:59:39,570] - step:302/900 train_loss:1.0485 lr:0.0003000000 time/step:186.34s
|
| 348 |
+
[2025-11-29 07:02:44,495] - step:303/900 train_loss:1.0444 lr:0.0003000000 time/step:184.92s
|
| 349 |
+
[2025-11-29 07:05:48,939] - step:304/900 train_loss:1.0787 lr:0.0003000000 time/step:184.42s
|
| 350 |
+
[2025-11-29 07:08:55,065] - step:305/900 train_loss:1.0548 lr:0.0003000000 time/step:186.12s
|
| 351 |
+
[2025-11-29 07:12:03,934] - step:306/900 train_loss:1.0604 lr:0.0003000000 time/step:188.86s
|
| 352 |
+
[2025-11-29 07:15:10,343] - step:307/900 train_loss:1.0368 lr:0.0003000000 time/step:186.37s
|
| 353 |
+
[2025-11-29 07:18:15,509] - step:308/900 train_loss:1.0500 lr:0.0003000000 time/step:185.15s
|
| 354 |
+
[2025-11-29 07:21:22,280] - step:309/900 train_loss:1.0519 lr:0.0003000000 time/step:186.76s
|
| 355 |
+
[2025-11-29 07:24:27,690] - step:310/900 train_loss:1.0396 lr:0.0003000000 time/step:185.39s
|
| 356 |
+
[2025-11-29 07:27:32,558] - step:311/900 train_loss:1.0199 lr:0.0003000000 time/step:184.86s
|
| 357 |
+
[2025-11-29 07:30:39,398] - step:312/900 train_loss:1.0318 lr:0.0003000000 time/step:186.83s
|
| 358 |
+
[2025-11-29 07:33:43,716] - step:313/900 train_loss:1.0245 lr:0.0003000000 time/step:184.27s
|
| 359 |
+
[2025-11-29 07:36:48,934] - step:314/900 train_loss:1.0550 lr:0.0003000000 time/step:185.21s
|
| 360 |
+
[2025-11-29 07:39:55,818] - step:315/900 train_loss:1.0384 lr:0.0003000000 time/step:186.88s
|
| 361 |
+
[2025-11-29 07:43:00,630] - step:316/900 train_loss:1.0352 lr:0.0003000000 time/step:184.79s
|
| 362 |
+
[2025-11-29 07:46:05,765] - step:317/900 train_loss:1.0406 lr:0.0003000000 time/step:185.12s
|
| 363 |
+
[2025-11-29 07:49:10,421] - step:318/900 train_loss:1.0438 lr:0.0003000000 time/step:184.64s
|
| 364 |
+
[2025-11-29 07:52:16,687] - step:319/900 train_loss:1.0463 lr:0.0003000000 time/step:186.26s
|
| 365 |
+
[2025-11-29 07:55:21,390] - step:320/900 train_loss:1.0608 lr:0.0003000000 time/step:184.68s
|
| 366 |
+
[2025-11-29 07:58:26,121] - step:321/900 train_loss:1.0704 lr:0.0003000000 time/step:184.70s
|
| 367 |
+
[2025-11-29 08:01:33,062] - step:322/900 train_loss:1.0459 lr:0.0003000000 time/step:186.94s
|
| 368 |
+
[2025-11-29 08:04:39,120] - step:323/900 train_loss:1.0463 lr:0.0003000000 time/step:185.86s
|
| 369 |
+
[2025-11-29 08:07:44,864] - step:324/900 train_loss:1.0497 lr:0.0003000000 time/step:185.73s
|
| 370 |
+
[2025-11-29 08:10:51,704] - step:325/900 train_loss:1.0295 lr:0.0003000000 time/step:186.82s
|
| 371 |
+
[2025-11-29 08:13:56,466] - step:326/900 train_loss:1.0555 lr:0.0003000000 time/step:184.73s
|
| 372 |
+
[2025-11-29 08:17:01,957] - step:327/900 train_loss:1.0380 lr:0.0003000000 time/step:185.49s
|
| 373 |
+
[2025-11-29 08:20:08,723] - step:328/900 train_loss:1.0256 lr:0.0003000000 time/step:186.75s
|
| 374 |
+
[2025-11-29 08:23:14,378] - step:329/900 train_loss:1.0418 lr:0.0003000000 time/step:185.64s
|
| 375 |
+
[2025-11-29 08:26:20,417] - step:330/900 train_loss:1.0660 lr:0.0003000000 time/step:186.01s
|
| 376 |
+
[2025-11-29 08:29:26,557] - step:331/900 train_loss:1.0481 lr:0.0003000000 time/step:186.12s
|
| 377 |
+
[2025-11-29 08:32:43,853] - step:332/900 train_loss:1.0370 lr:0.0003000000 time/step:197.25s
|
| 378 |
+
[2025-11-29 08:35:47,902] - step:333/900 train_loss:1.0556 lr:0.0003000000 time/step:184.02s
|
| 379 |
+
[2025-11-29 08:38:52,834] - step:334/900 train_loss:1.0512 lr:0.0003000000 time/step:184.93s
|
| 380 |
+
[2025-11-29 08:41:58,814] - step:335/900 train_loss:1.0432 lr:0.0003000000 time/step:185.95s
|
| 381 |
+
[2025-11-29 08:45:02,340] - step:336/900 train_loss:1.0165 lr:0.0003000000 time/step:183.51s
|
| 382 |
+
[2025-11-29 08:48:06,475] - step:337/900 train_loss:1.0600 lr:0.0003000000 time/step:184.12s
|
| 383 |
+
[2025-11-29 08:51:14,075] - step:338/900 train_loss:1.0304 lr:0.0003000000 time/step:187.60s
|
| 384 |
+
[2025-11-29 08:54:18,478] - step:339/900 train_loss:1.0187 lr:0.0003000000 time/step:184.37s
|
| 385 |
+
[2025-11-29 08:57:23,665] - step:340/900 train_loss:1.0326 lr:0.0003000000 time/step:185.18s
|
| 386 |
+
[2025-11-29 09:00:30,230] - step:341/900 train_loss:1.0415 lr:0.0003000000 time/step:186.56s
|
| 387 |
+
[2025-11-29 09:03:35,007] - step:342/900 train_loss:1.0413 lr:0.0003000000 time/step:184.75s
|
| 388 |
+
[2025-11-29 09:06:39,120] - step:343/900 train_loss:1.0377 lr:0.0003000000 time/step:184.10s
|
| 389 |
+
[2025-11-29 09:09:43,682] - step:344/900 train_loss:1.0266 lr:0.0003000000 time/step:184.56s
|
| 390 |
+
[2025-11-29 09:12:50,738] - step:345/900 train_loss:1.0305 lr:0.0003000000 time/step:187.04s
|
| 391 |
+
[2025-11-29 09:15:54,975] - step:346/900 train_loss:1.0238 lr:0.0003000000 time/step:184.22s
|
| 392 |
+
[2025-11-29 09:18:59,184] - step:347/900 train_loss:1.0470 lr:0.0003000000 time/step:184.20s
|
| 393 |
+
[2025-11-29 09:22:05,583] - step:348/900 train_loss:1.0343 lr:0.0003000000 time/step:186.39s
|
| 394 |
+
[2025-11-29 09:25:09,502] - step:349/900 train_loss:1.0429 lr:0.0003000000 time/step:183.90s
|
| 395 |
+
[2025-11-29 09:28:14,785] - step:350/900 train_loss:1.0173 lr:0.0003000000 time/step:185.28s
|
| 396 |
+
[2025-11-29 09:31:22,664] - step:351/900 train_loss:1.0260 lr:0.0003000000 time/step:187.87s
|
| 397 |
+
[2025-11-29 09:34:27,994] - step:352/900 train_loss:1.0412 lr:0.0003000000 time/step:185.27s
|
| 398 |
+
[2025-11-29 09:37:33,386] - step:353/900 train_loss:1.0051 lr:0.0003000000 time/step:185.37s
|
| 399 |
+
[2025-11-29 09:40:39,936] - step:354/900 train_loss:1.0386 lr:0.0003000000 time/step:186.55s
|
| 400 |
+
[2025-11-29 09:43:45,796] - step:355/900 train_loss:1.0317 lr:0.0003000000 time/step:185.85s
|
| 401 |
+
[2025-11-29 09:46:51,082] - step:356/900 train_loss:1.0060 lr:0.0003000000 time/step:185.26s
|
| 402 |
+
[2025-11-29 09:49:56,919] - step:357/900 train_loss:1.0267 lr:0.0003000000 time/step:185.82s
|
| 403 |
+
[2025-11-29 09:53:05,845] - step:358/900 train_loss:1.0586 lr:0.0003000000 time/step:188.92s
|
| 404 |
+
[2025-11-29 09:56:13,021] - step:359/900 train_loss:1.0340 lr:0.0003000000 time/step:187.15s
|
| 405 |
+
[2025-11-29 09:59:19,033] - step:360/900 train_loss:1.0385 lr:0.0003000000 time/step:186.00s
|
| 406 |
+
[2025-11-29 10:02:25,949] - step:361/900 train_loss:1.0036 lr:0.0003000000 time/step:186.84s
|
| 407 |
+
[2025-11-29 10:05:30,167] - step:362/900 train_loss:1.0181 lr:0.0003000000 time/step:184.18s
|
| 408 |
+
[2025-11-29 10:08:34,860] - step:363/900 train_loss:1.0245 lr:0.0003000000 time/step:184.69s
|
| 409 |
+
[2025-11-29 10:11:40,819] - step:364/900 train_loss:1.0310 lr:0.0003000000 time/step:185.92s
|
| 410 |
+
[2025-11-29 10:14:44,430] - step:365/900 train_loss:1.0431 lr:0.0003000000 time/step:183.59s
|
| 411 |
+
[2025-11-29 10:17:49,210] - step:366/900 train_loss:1.0010 lr:0.0003000000 time/step:184.77s
|
| 412 |
+
[2025-11-29 10:20:56,812] - step:367/900 train_loss:1.0278 lr:0.0003000000 time/step:187.59s
|
| 413 |
+
[2025-11-29 10:24:03,874] - step:368/900 train_loss:1.0450 lr:0.0003000000 time/step:187.04s
|
| 414 |
+
[2025-11-29 10:27:08,644] - step:369/900 train_loss:1.0187 lr:0.0003000000 time/step:184.76s
|
| 415 |
+
[2025-11-29 10:30:12,932] - step:370/900 train_loss:1.0198 lr:0.0003000000 time/step:184.28s
|
| 416 |
+
[2025-11-29 10:33:19,131] - step:371/900 train_loss:1.0267 lr:0.0003000000 time/step:186.19s
|
| 417 |
+
[2025-11-29 10:36:23,611] - step:372/900 train_loss:1.0050 lr:0.0003000000 time/step:184.44s
|
| 418 |
+
[2025-11-29 10:39:27,504] - step:373/900 train_loss:1.0285 lr:0.0003000000 time/step:183.89s
|
| 419 |
+
[2025-11-29 10:42:34,817] - step:374/900 train_loss:1.0273 lr:0.0003000000 time/step:187.31s
|
| 420 |
+
[2025-11-29 10:45:39,564] - step:375/900 train_loss:1.0304 lr:0.0003000000 time/step:184.73s
|
| 421 |
+
[2025-11-29 10:48:44,710] - step:376/900 train_loss:1.0118 lr:0.0003000000 time/step:185.13s
|
| 422 |
+
[2025-11-29 10:51:52,055] - step:377/900 train_loss:1.0109 lr:0.0003000000 time/step:187.34s
|
| 423 |
+
[2025-11-29 10:54:57,418] - step:378/900 train_loss:1.0240 lr:0.0003000000 time/step:185.34s
|
| 424 |
+
[2025-11-29 10:58:02,656] - step:379/900 train_loss:0.9999 lr:0.0003000000 time/step:185.22s
|
| 425 |
+
[2025-11-29 11:01:08,733] - step:380/900 train_loss:1.0321 lr:0.0003000000 time/step:186.07s
|
| 426 |
+
[2025-11-29 11:04:14,600] - step:381/900 train_loss:1.0227 lr:0.0003000000 time/step:185.85s
|
| 427 |
+
[2025-11-29 11:07:19,868] - step:382/900 train_loss:1.0266 lr:0.0003000000 time/step:185.24s
|
| 428 |
+
[2025-11-29 11:10:25,377] - step:383/900 train_loss:1.0351 lr:0.0003000000 time/step:185.51s
|
| 429 |
+
[2025-11-29 11:13:31,325] - step:384/900 train_loss:1.0345 lr:0.0003000000 time/step:185.94s
|
| 430 |
+
[2025-11-29 11:16:37,117] - step:385/900 train_loss:1.0095 lr:0.0003000000 time/step:185.74s
|
| 431 |
+
[2025-11-29 11:19:42,673] - step:386/900 train_loss:1.0084 lr:0.0003000000 time/step:185.53s
|
| 432 |
+
[2025-11-29 11:22:49,798] - step:387/900 train_loss:1.0363 lr:0.0003000000 time/step:187.11s
|
| 433 |
+
[2025-11-29 11:25:54,407] - step:388/900 train_loss:1.0115 lr:0.0003000000 time/step:184.58s
|
| 434 |
+
[2025-11-29 11:29:01,044] - step:389/900 train_loss:1.0391 lr:0.0003000000 time/step:186.63s
|
| 435 |
+
[2025-11-29 11:32:08,861] - step:390/900 train_loss:1.0325 lr:0.0003000000 time/step:187.81s
|
| 436 |
+
[2025-11-29 11:35:13,748] - step:391/900 train_loss:1.0275 lr:0.0003000000 time/step:184.87s
|
| 437 |
+
[2025-11-29 11:38:19,164] - step:392/900 train_loss:1.0071 lr:0.0003000000 time/step:185.41s
|
| 438 |
+
[2025-11-29 11:41:26,071] - step:393/900 train_loss:1.0140 lr:0.0003000000 time/step:186.89s
|
| 439 |
+
[2025-11-29 11:44:30,887] - step:394/900 train_loss:1.0238 lr:0.0003000000 time/step:184.80s
|
| 440 |
+
[2025-11-29 11:47:36,554] - step:395/900 train_loss:1.0223 lr:0.0003000000 time/step:185.63s
|
| 441 |
+
[2025-11-29 11:50:42,929] - step:396/900 train_loss:1.0248 lr:0.0003000000 time/step:186.36s
|
| 442 |
+
[2025-11-29 11:53:49,516] - step:397/900 train_loss:1.0155 lr:0.0003000000 time/step:186.58s
|
| 443 |
+
[2025-11-29 11:56:55,065] - step:398/900 train_loss:1.0266 lr:0.0003000000 time/step:185.52s
|
| 444 |
+
[2025-11-29 12:00:00,180] - step:399/900 train_loss:0.9997 lr:0.0003000000 time/step:185.11s
|
| 445 |
+
[2025-11-29 12:03:08,327] - step:400/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@400.pt...
|
| 446 |
+
[2025-11-29 12:03:08,330] - step:400/900 train_loss:1.0379 lr:0.0003000000 time/step:186.52s
|
| 447 |
+
[2025-11-29 12:06:13,170] - step:401/900 train_loss:1.0278 lr:0.0003000000 time/step:184.80s
|
| 448 |
+
[2025-11-29 12:09:18,331] - step:402/900 train_loss:0.9898 lr:0.0003000000 time/step:185.15s
|
| 449 |
+
[2025-11-29 12:12:24,948] - step:403/900 train_loss:0.9872 lr:0.0003000000 time/step:186.60s
|
| 450 |
+
[2025-11-29 12:15:30,939] - step:404/900 train_loss:1.0125 lr:0.0003000000 time/step:185.98s
|
| 451 |
+
[2025-11-29 12:18:37,715] - step:405/900 train_loss:1.0320 lr:0.0003000000 time/step:186.75s
|
| 452 |
+
[2025-11-29 12:21:43,437] - step:406/900 train_loss:1.0104 lr:0.0003000000 time/step:185.67s
|
| 453 |
+
[2025-11-29 12:24:46,661] - step:407/900 train_loss:1.0245 lr:0.0003000000 time/step:183.20s
|
| 454 |
+
[2025-11-29 12:27:52,506] - step:408/900 train_loss:1.0147 lr:0.0003000000 time/step:185.84s
|
| 455 |
+
[2025-11-29 12:30:57,358] - step:409/900 train_loss:1.0103 lr:0.0003000000 time/step:184.84s
|
| 456 |
+
[2025-11-29 12:34:03,720] - step:410/900 train_loss:0.9781 lr:0.0003000000 time/step:186.36s
|
| 457 |
+
[2025-11-29 12:37:13,118] - step:411/900 train_loss:0.9906 lr:0.0003000000 time/step:189.35s
|
| 458 |
+
[2025-11-29 12:40:19,894] - step:412/900 train_loss:1.0237 lr:0.0003000000 time/step:186.75s
|
| 459 |
+
[2025-11-29 12:43:25,422] - step:413/900 train_loss:1.0114 lr:0.0003000000 time/step:185.52s
|
| 460 |
+
[2025-11-29 12:46:30,579] - step:414/900 train_loss:1.0147 lr:0.0003000000 time/step:184.99s
|
| 461 |
+
[2025-11-29 12:49:38,876] - step:415/900 train_loss:1.0150 lr:0.0003000000 time/step:188.29s
|
| 462 |
+
[2025-11-29 12:52:43,633] - step:416/900 train_loss:1.0239 lr:0.0003000000 time/step:184.73s
|
| 463 |
+
[2025-11-29 12:55:48,060] - step:417/900 train_loss:1.0036 lr:0.0003000000 time/step:184.39s
|
| 464 |
+
[2025-11-29 12:58:55,753] - step:418/900 train_loss:1.0140 lr:0.0003000000 time/step:187.68s
|
| 465 |
+
[2025-11-29 13:02:00,431] - step:419/900 train_loss:1.0039 lr:0.0003000000 time/step:184.66s
|
| 466 |
+
[2025-11-29 13:05:05,089] - step:420/900 train_loss:1.0203 lr:0.0003000000 time/step:184.64s
|
| 467 |
+
[2025-11-29 13:08:12,316] - step:421/900 train_loss:1.0304 lr:0.0003000000 time/step:187.22s
|
| 468 |
+
[2025-11-29 13:11:17,410] - step:422/900 train_loss:1.0034 lr:0.0003000000 time/step:185.08s
|
| 469 |
+
[2025-11-29 13:14:22,416] - step:423/900 train_loss:1.0279 lr:0.0003000000 time/step:185.00s
|
| 470 |
+
[2025-11-29 13:17:27,732] - step:424/900 train_loss:1.0213 lr:0.0003000000 time/step:185.29s
|
| 471 |
+
[2025-11-29 13:20:34,573] - step:425/900 train_loss:0.9987 lr:0.0003000000 time/step:186.70s
|
| 472 |
+
[2025-11-29 13:23:39,421] - step:426/900 train_loss:0.9673 lr:0.0003000000 time/step:184.84s
|
| 473 |
+
[2025-11-29 13:26:44,176] - step:427/900 train_loss:1.0108 lr:0.0003000000 time/step:184.74s
|
| 474 |
+
[2025-11-29 13:29:52,082] - step:428/900 train_loss:1.0243 lr:0.0003000000 time/step:187.87s
|
| 475 |
+
[2025-11-29 13:32:56,984] - step:429/900 train_loss:0.9843 lr:0.0003000000 time/step:184.88s
|
| 476 |
+
[2025-11-29 13:36:01,659] - step:430/900 train_loss:1.0269 lr:0.0003000000 time/step:184.66s
|
| 477 |
+
[2025-11-29 13:39:09,363] - step:431/900 train_loss:1.0047 lr:0.0003000000 time/step:187.70s
|
| 478 |
+
[2025-11-29 13:42:14,007] - step:432/900 train_loss:0.9957 lr:0.0003000000 time/step:184.63s
|
| 479 |
+
[2025-11-29 13:45:17,936] - step:433/900 train_loss:1.0006 lr:0.0003000000 time/step:183.92s
|
| 480 |
+
[2025-11-29 13:48:23,683] - step:434/900 train_loss:1.0080 lr:0.0003000000 time/step:185.74s
|
| 481 |
+
[2025-11-29 13:51:28,718] - step:435/900 train_loss:1.0033 lr:0.0003000000 time/step:185.01s
|
| 482 |
+
[2025-11-29 13:54:33,479] - step:436/900 train_loss:1.0077 lr:0.0003000000 time/step:184.74s
|
| 483 |
+
[2025-11-29 13:57:38,454] - step:437/900 train_loss:0.9913 lr:0.0003000000 time/step:184.96s
|
| 484 |
+
[2025-11-29 14:00:45,973] - step:438/900 train_loss:1.0221 lr:0.0003000000 time/step:187.50s
|
| 485 |
+
[2025-11-29 14:03:50,970] - step:439/900 train_loss:1.0017 lr:0.0003000000 time/step:184.98s
|
| 486 |
+
[2025-11-29 14:06:56,103] - step:440/900 train_loss:0.9966 lr:0.0003000000 time/step:185.11s
|
| 487 |
+
[2025-11-29 14:10:03,916] - step:441/900 train_loss:1.0023 lr:0.0003000000 time/step:187.81s
|
| 488 |
+
[2025-11-29 14:13:09,854] - step:442/900 train_loss:1.0154 lr:0.0003000000 time/step:185.93s
|
| 489 |
+
[2025-11-29 14:16:13,300] - step:443/900 train_loss:0.9993 lr:0.0003000000 time/step:183.43s
|
| 490 |
+
[2025-11-29 14:19:19,989] - step:444/900 train_loss:1.0085 lr:0.0003000000 time/step:186.68s
|
| 491 |
+
[2025-11-29 14:22:23,752] - step:445/900 train_loss:0.9978 lr:0.0003000000 time/step:183.75s
|
| 492 |
+
[2025-11-29 14:25:27,620] - step:446/900 train_loss:1.0148 lr:0.0003000000 time/step:183.84s
|
| 493 |
+
[2025-11-29 14:28:33,765] - step:447/900 train_loss:0.9874 lr:0.0003000000 time/step:186.14s
|
| 494 |
+
[2025-11-29 14:31:37,881] - step:448/900 train_loss:1.0202 lr:0.0003000000 time/step:184.10s
|
| 495 |
+
[2025-11-29 14:34:41,135] - step:449/900 train_loss:0.9902 lr:0.0003000000 time/step:183.23s
|
| 496 |
+
[2025-11-29 14:37:45,361] - step:450/900 train_loss:1.0036 lr:0.0003000000 time/step:184.22s
|
| 497 |
+
[2025-11-29 14:40:53,203] - step:451/900 train_loss:1.0127 lr:0.0003000000 time/step:187.83s
|
| 498 |
+
[2025-11-29 14:43:58,011] - step:452/900 train_loss:1.0339 lr:0.0003000000 time/step:184.77s
|
| 499 |
+
[2025-11-29 14:47:02,348] - step:453/900 train_loss:0.9934 lr:0.0003000000 time/step:184.30s
|
| 500 |
+
[2025-11-29 14:50:10,497] - step:454/900 train_loss:1.0175 lr:0.0003000000 time/step:188.14s
|
| 501 |
+
[2025-11-29 14:53:14,572] - step:455/900 train_loss:1.0011 lr:0.0003000000 time/step:184.06s
|
| 502 |
+
[2025-11-29 14:56:19,257] - step:456/900 train_loss:1.0329 lr:0.0003000000 time/step:184.66s
|
| 503 |
+
[2025-11-29 14:59:26,311] - step:457/900 train_loss:0.9970 lr:0.0003000000 time/step:187.05s
|
| 504 |
+
[2025-11-29 15:02:31,228] - step:458/900 train_loss:0.9849 lr:0.0003000000 time/step:184.91s
|
| 505 |
+
[2025-11-29 15:05:35,912] - step:459/900 train_loss:1.0443 lr:0.0003000000 time/step:184.67s
|
| 506 |
+
[2025-11-29 15:08:44,234] - step:460/900 train_loss:1.0166 lr:0.0003000000 time/step:188.30s
|
| 507 |
+
[2025-11-29 15:11:49,196] - step:461/900 train_loss:0.9857 lr:0.0003000000 time/step:184.94s
|
| 508 |
+
[2025-11-29 15:14:54,073] - step:462/900 train_loss:0.9887 lr:0.0003000000 time/step:184.87s
|
| 509 |
+
[2025-11-29 15:18:01,015] - step:463/900 train_loss:1.0142 lr:0.0003000000 time/step:186.91s
|
| 510 |
+
[2025-11-29 15:21:10,436] - step:464/900 train_loss:1.0084 lr:0.0003000000 time/step:189.42s
|
| 511 |
+
[2025-11-29 15:24:17,825] - step:465/900 train_loss:1.0079 lr:0.0003000000 time/step:187.37s
|
| 512 |
+
[2025-11-29 15:27:21,991] - step:466/900 train_loss:0.9989 lr:0.0003000000 time/step:184.15s
|
| 513 |
+
[2025-11-29 15:30:29,430] - step:467/900 train_loss:1.0027 lr:0.0003000000 time/step:187.42s
|
| 514 |
+
[2025-11-29 15:33:34,038] - step:468/900 train_loss:0.9864 lr:0.0003000000 time/step:184.56s
|
| 515 |
+
[2025-11-29 15:36:38,606] - step:469/900 train_loss:0.9922 lr:0.0003000000 time/step:184.56s
|
| 516 |
+
[2025-11-29 15:39:50,010] - step:470/900 train_loss:1.0046 lr:0.0003000000 time/step:191.39s
|
| 517 |
+
[2025-11-29 15:42:54,426] - step:471/900 train_loss:0.9947 lr:0.0003000000 time/step:184.39s
|
| 518 |
+
[2025-11-29 15:45:58,386] - step:472/900 train_loss:0.9856 lr:0.0003000000 time/step:183.94s
|
| 519 |
+
[2025-11-29 15:49:06,443] - step:473/900 train_loss:1.0102 lr:0.0003000000 time/step:188.03s
|
| 520 |
+
[2025-11-29 15:52:11,651] - step:474/900 train_loss:0.9815 lr:0.0003000000 time/step:185.17s
|
| 521 |
+
[2025-11-29 15:55:16,024] - step:475/900 train_loss:0.9870 lr:0.0003000000 time/step:184.37s
|
| 522 |
+
[2025-11-29 15:58:20,934] - step:476/900 train_loss:0.9902 lr:0.0003000000 time/step:184.90s
|
| 523 |
+
[2025-11-29 16:01:34,770] - step:477/900 train_loss:1.0044 lr:0.0003000000 time/step:193.83s
|
| 524 |
+
[2025-11-29 16:04:40,969] - step:478/900 train_loss:0.9706 lr:0.0003000000 time/step:186.18s
|
| 525 |
+
[2025-11-29 16:07:46,966] - step:479/900 train_loss:0.9861 lr:0.0003000000 time/step:185.98s
|
| 526 |
+
[2025-11-29 16:11:03,013] - step:480/900 train_loss:1.0035 lr:0.0003000000 time/step:196.03s
|
| 527 |
+
[2025-11-29 16:14:06,891] - step:481/900 train_loss:0.9746 lr:0.0003000000 time/step:183.84s
|
| 528 |
+
[2025-11-29 16:17:10,864] - step:482/900 train_loss:0.9883 lr:0.0003000000 time/step:183.95s
|
| 529 |
+
[2025-11-29 16:20:17,244] - step:483/900 train_loss:1.0245 lr:0.0003000000 time/step:186.37s
|
| 530 |
+
[2025-11-29 16:23:21,291] - step:484/900 train_loss:1.0193 lr:0.0003000000 time/step:184.03s
|
| 531 |
+
[2025-11-29 16:26:24,937] - step:485/900 train_loss:0.9953 lr:0.0003000000 time/step:183.63s
|
| 532 |
+
[2025-11-29 16:29:32,575] - step:486/900 train_loss:0.9787 lr:0.0003000000 time/step:187.63s
|
| 533 |
+
[2025-11-29 16:32:37,230] - step:487/900 train_loss:0.9812 lr:0.0003000000 time/step:184.64s
|
| 534 |
+
[2025-11-29 16:35:41,884] - step:488/900 train_loss:0.9911 lr:0.0003000000 time/step:184.65s
|
| 535 |
+
[2025-11-29 16:38:47,753] - step:489/900 train_loss:0.9665 lr:0.0003000000 time/step:185.84s
|
| 536 |
+
[2025-11-29 16:41:53,739] - step:490/900 train_loss:0.9663 lr:0.0003000000 time/step:185.97s
|
| 537 |
+
[2025-11-29 16:44:58,613] - step:491/900 train_loss:1.0147 lr:0.0003000000 time/step:184.87s
|
| 538 |
+
[2025-11-29 16:48:03,370] - step:492/900 train_loss:1.0107 lr:0.0003000000 time/step:184.74s
|
| 539 |
+
[2025-11-29 16:51:11,045] - step:493/900 train_loss:0.9999 lr:0.0003000000 time/step:187.65s
|
| 540 |
+
[2025-11-29 16:54:15,696] - step:494/900 train_loss:0.9875 lr:0.0003000000 time/step:184.64s
|
| 541 |
+
[2025-11-29 16:57:20,229] - step:495/900 train_loss:0.9990 lr:0.0003000000 time/step:184.53s
|
| 542 |
+
[2025-11-29 17:00:26,562] - step:496/900 train_loss:0.9889 lr:0.0003000000 time/step:186.31s
|
| 543 |
+
[2025-11-29 17:03:30,547] - step:497/900 train_loss:0.9835 lr:0.0003000000 time/step:183.97s
|
| 544 |
+
[2025-11-29 17:06:34,456] - step:498/900 train_loss:1.0062 lr:0.0003000000 time/step:183.89s
|
| 545 |
+
[2025-11-29 17:09:40,945] - step:499/900 train_loss:0.9785 lr:0.0003000000 time/step:186.48s
|
| 546 |
+
[2025-11-29 17:12:49,048] - step:500/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@500.pt...
|
| 547 |
+
[2025-11-29 17:12:49,051] - step:500/900 train_loss:1.0054 lr:0.0003000000 time/step:186.42s
|
| 548 |
+
[2025-11-29 17:15:53,725] - step:501/900 train_loss:0.9961 lr:0.0003000000 time/step:184.67s
|
| 549 |
+
[2025-11-29 17:18:59,020] - step:502/900 train_loss:1.0013 lr:0.0003000000 time/step:185.28s
|
| 550 |
+
[2025-11-29 17:22:06,405] - step:503/900 train_loss:0.9746 lr:0.0003000000 time/step:187.35s
|
| 551 |
+
[2025-11-29 17:25:11,190] - step:504/900 train_loss:0.9977 lr:0.0003000000 time/step:184.76s
|
| 552 |
+
[2025-11-29 17:28:16,344] - step:505/900 train_loss:0.9737 lr:0.0003000000 time/step:185.15s
|
| 553 |
+
[2025-11-29 17:31:24,506] - step:506/900 train_loss:1.0010 lr:0.0003000000 time/step:188.14s
|
| 554 |
+
[2025-11-29 17:34:29,492] - step:507/900 train_loss:0.9852 lr:0.0003000000 time/step:184.96s
|
| 555 |
+
[2025-11-29 17:37:34,522] - step:508/900 train_loss:0.9887 lr:0.0003000000 time/step:185.00s
|
| 556 |
+
[2025-11-29 17:40:41,236] - step:509/900 train_loss:0.9830 lr:0.0003000000 time/step:186.70s
|
| 557 |
+
[2025-11-29 17:43:46,307] - step:510/900 train_loss:0.9844 lr:0.0003000000 time/step:185.05s
|
| 558 |
+
[2025-11-29 17:46:49,527] - step:511/900 train_loss:0.9718 lr:0.0003000000 time/step:183.20s
|
| 559 |
+
[2025-11-29 17:49:52,170] - step:512/900 train_loss:0.9866 lr:0.0003000000 time/step:182.64s
|
| 560 |
+
[2025-11-29 17:52:55,148] - step:513/900 train_loss:1.0106 lr:0.0003000000 time/step:182.96s
|
| 561 |
+
[2025-11-29 17:55:59,252] - step:514/900 train_loss:0.9629 lr:0.0003000000 time/step:184.09s
|
| 562 |
+
[2025-11-29 17:59:04,675] - step:515/900 train_loss:1.0048 lr:0.0003000000 time/step:185.41s
|
| 563 |
+
[2025-11-29 18:02:13,006] - step:516/900 train_loss:0.9964 lr:0.0003000000 time/step:188.32s
|
| 564 |
+
[2025-11-29 18:05:16,855] - step:517/900 train_loss:1.0057 lr:0.0003000000 time/step:183.84s
|
| 565 |
+
[2025-11-29 18:08:20,622] - step:518/900 train_loss:0.9859 lr:0.0003000000 time/step:183.75s
|
| 566 |
+
[2025-11-29 18:11:26,793] - step:519/900 train_loss:0.9714 lr:0.0003000000 time/step:186.16s
|
| 567 |
+
[2025-11-29 18:14:29,889] - step:520/900 train_loss:0.9652 lr:0.0003000000 time/step:183.08s
|
| 568 |
+
[2025-11-29 18:17:33,349] - step:521/900 train_loss:0.9786 lr:0.0003000000 time/step:183.43s
|
| 569 |
+
[2025-11-29 18:20:39,275] - step:522/900 train_loss:0.9721 lr:0.0003000000 time/step:185.92s
|
| 570 |
+
[2025-11-29 18:23:43,034] - step:523/900 train_loss:0.9862 lr:0.0003000000 time/step:183.75s
|
| 571 |
+
[2025-11-29 18:26:46,732] - step:524/900 train_loss:0.9942 lr:0.0003000000 time/step:183.66s
|
| 572 |
+
[2025-11-29 18:29:50,749] - step:525/900 train_loss:0.9850 lr:0.0003000000 time/step:184.01s
|
| 573 |
+
[2025-11-29 18:32:55,277] - step:526/900 train_loss:0.9804 lr:0.0003000000 time/step:184.51s
|
| 574 |
+
[2025-11-29 18:36:00,371] - step:527/900 train_loss:0.9845 lr:0.0003000000 time/step:185.08s
|
| 575 |
+
[2025-11-29 18:39:06,966] - step:528/900 train_loss:0.9832 lr:0.0003000000 time/step:186.57s
|
| 576 |
+
[2025-11-29 18:42:15,798] - step:529/900 train_loss:0.9967 lr:0.0003000000 time/step:188.82s
|
| 577 |
+
[2025-11-29 18:45:22,445] - step:530/900 train_loss:0.9910 lr:0.0003000000 time/step:186.63s
|
| 578 |
+
[2025-11-29 18:48:29,542] - step:531/900 train_loss:0.9714 lr:0.0003000000 time/step:187.07s
|
| 579 |
+
[2025-11-29 18:51:38,501] - step:532/900 train_loss:0.9868 lr:0.0003000000 time/step:188.95s
|
| 580 |
+
[2025-11-29 18:54:45,821] - step:533/900 train_loss:0.9929 lr:0.0003000000 time/step:187.30s
|
| 581 |
+
[2025-11-29 18:57:53,314] - step:534/900 train_loss:0.9879 lr:0.0003000000 time/step:187.47s
|
| 582 |
+
[2025-11-29 19:01:01,583] - step:535/900 train_loss:1.0067 lr:0.0003000000 time/step:188.26s
|
| 583 |
+
[2025-11-29 19:04:09,493] - step:536/900 train_loss:0.9836 lr:0.0003000000 time/step:187.89s
|
| 584 |
+
[2025-11-29 19:07:16,734] - step:537/900 train_loss:0.9868 lr:0.0003000000 time/step:187.21s
|
| 585 |
+
[2025-11-29 19:10:24,993] - step:538/900 train_loss:0.9951 lr:0.0003000000 time/step:188.24s
|
| 586 |
+
[2025-11-29 19:13:31,431] - step:539/900 train_loss:0.9761 lr:0.0003000000 time/step:186.41s
|
| 587 |
+
[2025-11-29 19:16:36,819] - step:540/900 train_loss:0.9742 lr:0.0003000000 time/step:185.38s
|
| 588 |
+
[2025-11-29 19:19:43,799] - step:541/900 train_loss:0.9745 lr:0.0003000000 time/step:186.95s
|
| 589 |
+
[2025-11-29 19:22:52,532] - step:542/900 train_loss:0.9817 lr:0.0003000000 time/step:188.73s
|
| 590 |
+
[2025-11-29 19:25:59,216] - step:543/900 train_loss:0.9777 lr:0.0003000000 time/step:186.67s
|
| 591 |
+
[2025-11-29 19:29:05,849] - step:544/900 train_loss:0.9960 lr:0.0003000000 time/step:186.61s
|
| 592 |
+
[2025-11-29 19:32:14,558] - step:545/900 train_loss:0.9811 lr:0.0003000000 time/step:188.70s
|
| 593 |
+
[2025-11-29 19:35:21,607] - step:546/900 train_loss:0.9882 lr:0.0003000000 time/step:187.02s
|
| 594 |
+
[2025-11-29 19:38:28,518] - step:547/900 train_loss:0.9938 lr:0.0003000000 time/step:186.88s
|
| 595 |
+
[2025-11-29 19:41:37,225] - step:548/900 train_loss:0.9407 lr:0.0003000000 time/step:188.70s
|
| 596 |
+
[2025-11-29 19:44:44,063] - step:549/900 train_loss:0.9774 lr:0.0003000000 time/step:186.81s
|
| 597 |
+
[2025-11-29 19:47:50,816] - step:550/900 train_loss:0.9913 lr:0.0003000000 time/step:186.73s
|
| 598 |
+
[2025-11-29 19:50:58,844] - step:551/900 train_loss:0.9948 lr:0.0003000000 time/step:188.02s
|
| 599 |
+
[2025-11-29 19:54:06,212] - step:552/900 train_loss:0.9696 lr:0.0003000000 time/step:187.35s
|
| 600 |
+
[2025-11-29 19:57:12,084] - step:553/900 train_loss:0.9706 lr:0.0003000000 time/step:185.85s
|
| 601 |
+
[2025-11-29 20:00:18,128] - step:554/900 train_loss:0.9871 lr:0.0003000000 time/step:186.03s
|
| 602 |
+
[2025-11-29 20:03:26,623] - step:555/900 train_loss:0.9930 lr:0.0003000000 time/step:188.48s
|
| 603 |
+
[2025-11-29 20:06:33,230] - step:556/900 train_loss:0.9752 lr:0.0003000000 time/step:186.55s
|
| 604 |
+
[2025-11-29 20:09:39,696] - step:557/900 train_loss:0.9850 lr:0.0003000000 time/step:186.45s
|
| 605 |
+
[2025-11-29 20:12:48,229] - step:558/900 train_loss:0.9720 lr:0.0003000000 time/step:188.52s
|
| 606 |
+
[2025-11-29 20:15:53,987] - step:559/900 train_loss:0.9962 lr:0.0003000000 time/step:185.74s
|
| 607 |
+
[2025-11-29 20:19:00,484] - step:560/900 train_loss:0.9922 lr:0.0003000000 time/step:186.48s
|
| 608 |
+
[2025-11-29 20:22:09,247] - step:561/900 train_loss:0.9740 lr:0.0003000000 time/step:188.74s
|
| 609 |
+
[2025-11-29 20:25:16,473] - step:562/900 train_loss:0.9712 lr:0.0003000000 time/step:187.21s
|
| 610 |
+
[2025-11-29 20:28:23,403] - step:563/900 train_loss:0.9612 lr:0.0003000000 time/step:186.92s
|
| 611 |
+
[2025-11-29 20:31:30,909] - step:564/900 train_loss:0.9914 lr:0.0003000000 time/step:187.50s
|
| 612 |
+
[2025-11-29 20:34:38,710] - step:565/900 train_loss:0.9836 lr:0.0003000000 time/step:187.78s
|
| 613 |
+
[2025-11-29 20:37:45,056] - step:566/900 train_loss:0.9814 lr:0.0003000000 time/step:186.33s
|
| 614 |
+
[2025-11-29 20:40:51,873] - step:567/900 train_loss:0.9865 lr:0.0003000000 time/step:186.81s
|
| 615 |
+
[2025-11-29 20:44:00,559] - step:568/900 train_loss:0.9917 lr:0.0003000000 time/step:188.68s
|
| 616 |
+
[2025-11-29 20:47:07,062] - step:569/900 train_loss:0.9644 lr:0.0003000000 time/step:186.48s
|
| 617 |
+
[2025-11-29 20:50:13,303] - step:570/900 train_loss:0.9759 lr:0.0003000000 time/step:186.19s
|
| 618 |
+
[2025-11-29 20:53:21,695] - step:571/900 train_loss:0.9703 lr:0.0003000000 time/step:188.39s
|
| 619 |
+
[2025-11-29 20:56:29,148] - step:572/900 train_loss:0.9713 lr:0.0003000000 time/step:187.43s
|
| 620 |
+
[2025-11-29 20:59:35,993] - step:573/900 train_loss:0.9549 lr:0.0003000000 time/step:186.82s
|
| 621 |
+
[2025-11-29 21:02:44,463] - step:574/900 train_loss:0.9696 lr:0.0003000000 time/step:188.47s
|
| 622 |
+
[2025-11-29 21:05:51,247] - step:575/900 train_loss:0.9648 lr:0.0003000000 time/step:186.77s
|
| 623 |
+
[2025-11-29 21:08:57,001] - step:576/900 train_loss:0.9695 lr:0.0003000000 time/step:185.74s
|
| 624 |
+
[2025-11-29 21:12:03,873] - step:577/900 train_loss:0.9728 lr:0.0003000000 time/step:186.86s
|
| 625 |
+
[2025-11-29 21:15:10,900] - step:578/900 train_loss:0.9767 lr:0.0003000000 time/step:187.02s
|
| 626 |
+
[2025-11-29 21:18:14,501] - step:579/900 train_loss:0.9643 lr:0.0003000000 time/step:183.56s
|
| 627 |
+
[2025-11-29 21:21:16,045] - step:580/900 train_loss:0.9826 lr:0.0003000000 time/step:181.53s
|
| 628 |
+
[2025-11-29 21:24:19,527] - step:581/900 train_loss:0.9792 lr:0.0003000000 time/step:183.48s
|
| 629 |
+
[2025-11-29 21:27:25,340] - step:582/900 train_loss:0.9852 lr:0.0003000000 time/step:185.73s
|
| 630 |
+
[2025-11-29 21:30:32,498] - step:583/900 train_loss:0.9699 lr:0.0003000000 time/step:187.15s
|
| 631 |
+
[2025-11-29 21:33:40,663] - step:584/900 train_loss:0.9709 lr:0.0003000000 time/step:188.14s
|
| 632 |
+
[2025-11-29 21:36:47,891] - step:585/900 train_loss:0.9673 lr:0.0003000000 time/step:187.21s
|
| 633 |
+
[2025-11-29 21:39:54,798] - step:586/900 train_loss:0.9792 lr:0.0003000000 time/step:186.90s
|
| 634 |
+
[2025-11-29 21:43:04,568] - step:587/900 train_loss:0.9784 lr:0.0003000000 time/step:189.77s
|
| 635 |
+
[2025-11-29 21:46:11,882] - step:588/900 train_loss:0.9719 lr:0.0003000000 time/step:187.29s
|
| 636 |
+
[2025-11-29 21:49:18,906] - step:589/900 train_loss:0.9834 lr:0.0003000000 time/step:187.01s
|
| 637 |
+
[2025-11-29 21:52:25,621] - step:590/900 train_loss:0.9659 lr:0.0003000000 time/step:186.70s
|
| 638 |
+
[2025-11-29 21:55:31,655] - step:591/900 train_loss:0.9658 lr:0.0003000000 time/step:185.94s
|
| 639 |
+
[2025-11-29 21:58:38,212] - step:592/900 train_loss:0.9855 lr:0.0003000000 time/step:186.53s
|
| 640 |
+
[2025-11-29 22:01:44,812] - step:593/900 train_loss:0.9691 lr:0.0003000000 time/step:186.59s
|
| 641 |
+
[2025-11-29 22:04:51,951] - step:594/900 train_loss:0.9781 lr:0.0003000000 time/step:187.13s
|
| 642 |
+
[2025-11-29 22:07:57,915] - step:595/900 train_loss:0.9579 lr:0.0003000000 time/step:185.94s
|
| 643 |
+
[2025-11-29 22:11:04,854] - step:596/900 train_loss:0.9731 lr:0.0003000000 time/step:186.91s
|
| 644 |
+
[2025-11-29 22:14:13,434] - step:597/900 train_loss:0.9715 lr:0.0003000000 time/step:188.57s
|
| 645 |
+
[2025-11-29 22:17:20,910] - step:598/900 train_loss:0.9886 lr:0.0003000000 time/step:187.46s
|
| 646 |
+
[2025-11-29 22:20:27,176] - step:599/900 train_loss:0.9657 lr:0.0003000000 time/step:186.24s
|
| 647 |
+
[2025-11-29 22:23:34,717] - step:600/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@600.pt...
|
| 648 |
+
[2025-11-29 22:23:34,723] - step:600/900 train_loss:0.9532 lr:0.0003000000 time/step:185.95s
|
| 649 |
+
[2025-11-29 22:26:38,518] - step:601/900 train_loss:0.9535 lr:0.0003000000 time/step:183.79s
|
| 650 |
+
[2025-11-29 22:29:41,900] - step:602/900 train_loss:0.9374 lr:0.0003000000 time/step:183.35s
|
| 651 |
+
[2025-11-29 22:32:47,468] - step:603/900 train_loss:0.9662 lr:0.0003000000 time/step:185.52s
|
| 652 |
+
[2025-11-29 22:35:53,752] - step:604/900 train_loss:0.9587 lr:0.0003000000 time/step:186.16s
|
| 653 |
+
[2025-11-29 22:38:58,466] - step:605/900 train_loss:0.9739 lr:0.0003000000 time/step:184.70s
|
| 654 |
+
[2025-11-29 22:42:03,657] - step:606/900 train_loss:0.9563 lr:0.0003000000 time/step:185.17s
|
| 655 |
+
[2025-11-29 22:45:12,058] - step:607/900 train_loss:0.9584 lr:0.0003000000 time/step:188.39s
|
| 656 |
+
[2025-11-29 22:48:18,310] - step:608/900 train_loss:0.9694 lr:0.0003000000 time/step:186.23s
|
| 657 |
+
[2025-11-29 22:51:24,367] - step:609/900 train_loss:0.9681 lr:0.0003000000 time/step:186.05s
|
| 658 |
+
[2025-11-29 22:54:31,573] - step:610/900 train_loss:0.9582 lr:0.0003000000 time/step:187.20s
|
| 659 |
+
[2025-11-29 22:57:36,240] - step:611/900 train_loss:0.9781 lr:0.0003000000 time/step:184.66s
|
| 660 |
+
[2025-11-29 23:00:39,793] - step:612/900 train_loss:0.9707 lr:0.0003000000 time/step:183.54s
|
| 661 |
+
[2025-11-29 23:03:48,177] - step:613/900 train_loss:0.9626 lr:0.0003000000 time/step:188.38s
|
| 662 |
+
[2025-11-29 23:06:54,527] - step:614/900 train_loss:0.9525 lr:0.0003000000 time/step:186.34s
|
| 663 |
+
[2025-11-29 23:10:00,576] - step:615/900 train_loss:0.9825 lr:0.0003000000 time/step:186.03s
|
| 664 |
+
[2025-11-29 23:13:06,944] - step:616/900 train_loss:0.9648 lr:0.0003000000 time/step:186.35s
|
| 665 |
+
[2025-11-29 23:16:13,313] - step:617/900 train_loss:0.9833 lr:0.0003000000 time/step:186.36s
|
| 666 |
+
[2025-11-29 23:19:18,008] - step:618/900 train_loss:0.9619 lr:0.0003000000 time/step:184.67s
|
| 667 |
+
[2025-11-29 23:22:23,418] - step:619/900 train_loss:0.9681 lr:0.0003000000 time/step:185.40s
|
| 668 |
+
[2025-11-29 23:25:30,799] - step:620/900 train_loss:0.9705 lr:0.0003000000 time/step:187.36s
|
| 669 |
+
[2025-11-29 23:28:36,096] - step:621/900 train_loss:0.9884 lr:0.0003000000 time/step:185.28s
|
| 670 |
+
[2025-11-29 23:31:40,935] - step:622/900 train_loss:0.9623 lr:0.0003000000 time/step:184.83s
|
| 671 |
+
[2025-11-29 23:34:49,164] - step:623/900 train_loss:0.9781 lr:0.0003000000 time/step:188.22s
|
| 672 |
+
[2025-11-29 23:37:55,808] - step:624/900 train_loss:0.9558 lr:0.0003000000 time/step:186.62s
|
| 673 |
+
[2025-11-29 23:41:02,902] - step:625/900 train_loss:0.9641 lr:0.0003000000 time/step:187.08s
|
| 674 |
+
[2025-11-29 23:44:12,190] - step:626/900 train_loss:0.9631 lr:0.0003000000 time/step:189.26s
|
| 675 |
+
[2025-11-29 23:47:18,211] - step:627/900 train_loss:0.9820 lr:0.0003000000 time/step:185.99s
|
| 676 |
+
[2025-11-29 23:50:22,907] - step:628/900 train_loss:0.9647 lr:0.0003000000 time/step:184.67s
|
| 677 |
+
[2025-11-29 23:53:29,293] - step:629/900 train_loss:0.9504 lr:0.0003000000 time/step:186.38s
|
| 678 |
+
[2025-11-29 23:56:35,007] - step:630/900 train_loss:0.9845 lr:0.0003000000 time/step:185.70s
|
| 679 |
+
[2025-11-29 23:59:41,063] - step:631/900 train_loss:0.9710 lr:0.0003000000 time/step:186.04s
|
| 680 |
+
[2025-11-30 00:02:47,384] - step:632/900 train_loss:0.9673 lr:0.0003000000 time/step:186.31s
|
| 681 |
+
[2025-11-30 00:05:54,675] - step:633/900 train_loss:0.9644 lr:0.0003000000 time/step:187.29s
|
| 682 |
+
[2025-11-30 00:09:00,681] - step:634/900 train_loss:0.9751 lr:0.0003000000 time/step:185.98s
|
| 683 |
+
[2025-11-30 00:12:07,170] - step:635/900 train_loss:0.9427 lr:0.0003000000 time/step:186.47s
|
| 684 |
+
[2025-11-30 00:15:16,394] - step:636/900 train_loss:0.9941 lr:0.0003000000 time/step:189.21s
|
| 685 |
+
[2025-11-30 00:18:21,885] - step:637/900 train_loss:0.9627 lr:0.0003000000 time/step:185.46s
|
| 686 |
+
[2025-11-30 00:21:26,909] - step:638/900 train_loss:0.9713 lr:0.0003000000 time/step:185.01s
|
| 687 |
+
[2025-11-30 00:24:34,518] - step:639/900 train_loss:0.9477 lr:0.0003000000 time/step:187.59s
|
| 688 |
+
[2025-11-30 00:27:39,860] - step:640/900 train_loss:0.9413 lr:0.0003000000 time/step:185.32s
|
| 689 |
+
[2025-11-30 00:30:46,082] - step:641/900 train_loss:0.9583 lr:0.0003000000 time/step:186.18s
|
| 690 |
+
[2025-11-30 00:33:53,085] - step:642/900 train_loss:0.9927 lr:0.0003000000 time/step:186.99s
|
| 691 |
+
[2025-11-30 00:37:00,236] - step:643/900 train_loss:0.9658 lr:0.0003000000 time/step:187.13s
|
| 692 |
+
[2025-11-30 00:40:06,191] - step:644/900 train_loss:0.9532 lr:0.0003000000 time/step:185.92s
|
| 693 |
+
[2025-11-30 00:43:11,626] - step:645/900 train_loss:0.9510 lr:0.0003000000 time/step:185.43s
|
| 694 |
+
[2025-11-30 00:46:16,854] - step:646/900 train_loss:0.9572 lr:0.0003000000 time/step:185.21s
|
| 695 |
+
[2025-11-30 00:49:20,350] - step:647/900 train_loss:0.9524 lr:0.0003000000 time/step:183.47s
|
| 696 |
+
[2025-11-30 00:52:23,936] - step:648/900 train_loss:0.9724 lr:0.0003000000 time/step:183.58s
|
| 697 |
+
[2025-11-30 00:55:32,534] - step:649/900 train_loss:1.0075 lr:0.0003000000 time/step:188.59s
|
| 698 |
+
[2025-11-30 00:58:37,981] - step:650/900 train_loss:0.9637 lr:0.0003000000 time/step:185.43s
|
| 699 |
+
[2025-11-30 01:01:43,633] - step:651/900 train_loss:0.9657 lr:0.0003000000 time/step:185.63s
|
| 700 |
+
[2025-11-30 01:04:53,089] - step:652/900 train_loss:0.9597 lr:0.0003000000 time/step:189.45s
|
| 701 |
+
[2025-11-30 01:08:00,352] - step:653/900 train_loss:0.9692 lr:0.0003000000 time/step:187.22s
|
| 702 |
+
[2025-11-30 01:11:07,645] - step:654/900 train_loss:0.9529 lr:0.0003000000 time/step:187.28s
|
| 703 |
+
[2025-11-30 01:14:14,239] - step:655/900 train_loss:0.9482 lr:0.0003000000 time/step:186.59s
|
| 704 |
+
[2025-11-30 01:17:20,123] - step:656/900 train_loss:0.9579 lr:0.0003000000 time/step:185.88s
|
| 705 |
+
[2025-11-30 01:20:25,496] - step:657/900 train_loss:0.9504 lr:0.0003000000 time/step:185.35s
|
| 706 |
+
[2025-11-30 01:23:31,180] - step:658/900 train_loss:0.9749 lr:0.0003000000 time/step:185.66s
|
| 707 |
+
[2025-11-30 01:26:37,555] - step:659/900 train_loss:0.9706 lr:0.0003000000 time/step:186.35s
|
| 708 |
+
[2025-11-30 01:29:43,411] - step:660/900 train_loss:0.9571 lr:0.0003000000 time/step:185.84s
|
| 709 |
+
[2025-11-30 01:32:49,562] - step:661/900 train_loss:0.9464 lr:0.0003000000 time/step:186.14s
|
| 710 |
+
[2025-11-30 01:35:57,969] - step:662/900 train_loss:0.9430 lr:0.0003000000 time/step:188.40s
|
| 711 |
+
[2025-11-30 01:39:04,057] - step:663/900 train_loss:0.9606 lr:0.0003000000 time/step:186.06s
|
| 712 |
+
[2025-11-30 01:42:08,918] - step:664/900 train_loss:0.9484 lr:0.0003000000 time/step:184.85s
|
| 713 |
+
[2025-11-30 01:45:15,790] - step:665/900 train_loss:0.9660 lr:0.0003000000 time/step:186.86s
|
| 714 |
+
[2025-11-30 01:48:21,042] - step:666/900 train_loss:0.9715 lr:0.0003000000 time/step:185.22s
|
| 715 |
+
[2025-11-30 01:51:25,399] - step:667/900 train_loss:0.9747 lr:0.0003000000 time/step:184.34s
|
| 716 |
+
[2025-11-30 01:54:31,595] - step:668/900 train_loss:0.9405 lr:0.0003000000 time/step:186.18s
|
| 717 |
+
[2025-11-30 01:57:37,951] - step:669/900 train_loss:0.9562 lr:0.0003000000 time/step:186.34s
|
| 718 |
+
[2025-11-30 02:00:44,059] - step:670/900 train_loss:0.9800 lr:0.0003000000 time/step:186.09s
|
| 719 |
+
[2025-11-30 02:03:49,586] - step:671/900 train_loss:0.9646 lr:0.0003000000 time/step:185.52s
|
| 720 |
+
[2025-11-30 02:06:57,124] - step:672/900 train_loss:0.9656 lr:0.0003000000 time/step:187.53s
|
| 721 |
+
[2025-11-30 02:10:03,956] - step:673/900 train_loss:0.9544 lr:0.0003000000 time/step:186.80s
|
| 722 |
+
[2025-11-30 02:13:09,941] - step:674/900 train_loss:0.9604 lr:0.0003000000 time/step:185.98s
|
| 723 |
+
[2025-11-30 02:16:17,892] - step:675/900 train_loss:0.9639 lr:0.0003000000 time/step:187.95s
|
| 724 |
+
[2025-11-30 02:19:23,974] - step:676/900 train_loss:0.9455 lr:0.0003000000 time/step:186.05s
|
| 725 |
+
[2025-11-30 02:22:30,221] - step:677/900 train_loss:0.9509 lr:0.0003000000 time/step:186.20s
|
| 726 |
+
[2025-11-30 02:25:37,961] - step:678/900 train_loss:0.9363 lr:0.0003000000 time/step:187.73s
|
| 727 |
+
[2025-11-30 02:28:44,267] - step:679/900 train_loss:0.9520 lr:0.0003000000 time/step:186.29s
|
| 728 |
+
[2025-11-30 02:31:50,617] - step:680/900 train_loss:0.9565 lr:0.0003000000 time/step:186.34s
|
| 729 |
+
[2025-11-30 02:34:58,672] - step:681/900 train_loss:0.9727 lr:0.0003000000 time/step:188.04s
|
| 730 |
+
[2025-11-30 02:38:05,140] - step:682/900 train_loss:0.9563 lr:0.0003000000 time/step:186.46s
|
| 731 |
+
[2025-11-30 02:41:09,992] - step:683/900 train_loss:0.9809 lr:0.0003000000 time/step:184.79s
|
| 732 |
+
[2025-11-30 02:44:15,338] - step:684/900 train_loss:0.9526 lr:0.0003000000 time/step:185.34s
|
| 733 |
+
[2025-11-30 02:47:21,385] - step:685/900 train_loss:0.9675 lr:0.0003000000 time/step:186.04s
|
| 734 |
+
[2025-11-30 02:50:25,872] - step:686/900 train_loss:0.9466 lr:0.0003000000 time/step:184.44s
|
| 735 |
+
[2025-11-30 02:53:31,333] - step:687/900 train_loss:0.9575 lr:0.0003000000 time/step:185.43s
|
| 736 |
+
[2025-11-30 02:56:38,782] - step:688/900 train_loss:0.9673 lr:0.0003000000 time/step:187.43s
|
| 737 |
+
[2025-11-30 02:59:44,702] - step:689/900 train_loss:0.9582 lr:0.0003000000 time/step:185.90s
|
| 738 |
+
[2025-11-30 03:02:50,929] - step:690/900 train_loss:0.9581 lr:0.0003000000 time/step:186.22s
|
| 739 |
+
[2025-11-30 03:05:57,119] - step:691/900 train_loss:0.9407 lr:0.0003000000 time/step:186.18s
|
| 740 |
+
[2025-11-30 03:09:02,467] - step:692/900 train_loss:0.9567 lr:0.0003000000 time/step:185.33s
|
| 741 |
+
[2025-11-30 03:12:07,335] - step:693/900 train_loss:0.9362 lr:0.0003000000 time/step:184.84s
|
| 742 |
+
[2025-11-30 03:15:14,078] - step:694/900 train_loss:0.9692 lr:0.0003000000 time/step:186.74s
|
| 743 |
+
[2025-11-30 03:18:20,680] - step:695/900 train_loss:0.9288 lr:0.0003000000 time/step:186.58s
|
| 744 |
+
[2025-11-30 03:21:26,753] - step:696/900 train_loss:0.9616 lr:0.0003000000 time/step:186.05s
|
| 745 |
+
[2025-11-30 03:24:32,423] - step:697/900 train_loss:0.9203 lr:0.0003000000 time/step:185.66s
|
| 746 |
+
[2025-11-30 03:27:41,442] - step:698/900 train_loss:0.9552 lr:0.0003000000 time/step:189.01s
|
| 747 |
+
[2025-11-30 03:30:45,196] - step:699/900 train_loss:0.9601 lr:0.0003000000 time/step:183.72s
|
| 748 |
+
[2025-11-30 03:33:51,669] - step:700/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@700.pt...
|
| 749 |
+
[2025-11-30 03:33:51,671] - step:700/900 train_loss:0.9515 lr:0.0003000000 time/step:184.76s
|
| 750 |
+
[2025-11-30 03:36:59,452] - step:701/900 train_loss:0.9587 lr:0.0003000000 time/step:187.77s
|
| 751 |
+
[2025-11-30 03:40:05,996] - step:702/900 train_loss:0.9688 lr:0.0003000000 time/step:186.44s
|
| 752 |
+
[2025-11-30 03:43:12,080] - step:703/900 train_loss:0.9386 lr:0.0003000000 time/step:186.06s
|
| 753 |
+
[2025-11-30 03:46:19,965] - step:704/900 train_loss:0.9925 lr:0.0003000000 time/step:187.88s
|
| 754 |
+
[2025-11-30 03:49:25,758] - step:705/900 train_loss:0.9425 lr:0.0003000000 time/step:185.77s
|
| 755 |
+
[2025-11-30 03:52:30,961] - step:706/900 train_loss:0.9720 lr:0.0003000000 time/step:185.19s
|
| 756 |
+
[2025-11-30 03:55:39,418] - step:707/900 train_loss:0.9434 lr:0.0003000000 time/step:188.44s
|
| 757 |
+
[2025-11-30 03:58:47,467] - step:708/900 train_loss:0.9549 lr:0.0003000000 time/step:188.03s
|
| 758 |
+
[2025-11-30 04:01:53,611] - step:709/900 train_loss:0.9511 lr:0.0003000000 time/step:186.12s
|
| 759 |
+
[2025-11-30 04:04:58,923] - step:710/900 train_loss:0.9714 lr:0.0003000000 time/step:185.31s
|
| 760 |
+
[2025-11-30 04:08:06,914] - step:711/900 train_loss:0.9647 lr:0.0003000000 time/step:187.98s
|
| 761 |
+
[2025-11-30 04:11:12,929] - step:712/900 train_loss:0.9789 lr:0.0003000000 time/step:185.96s
|
| 762 |
+
[2025-11-30 04:14:19,154] - step:713/900 train_loss:0.9418 lr:0.0003000000 time/step:186.22s
|
| 763 |
+
[2025-11-30 04:17:27,669] - step:714/900 train_loss:0.9417 lr:0.0003000000 time/step:188.50s
|
| 764 |
+
[2025-11-30 04:20:32,769] - step:715/900 train_loss:0.9507 lr:0.0003000000 time/step:185.08s
|
| 765 |
+
[2025-11-30 04:23:37,756] - step:716/900 train_loss:0.9567 lr:0.0003000000 time/step:184.98s
|
| 766 |
+
[2025-11-30 04:26:45,091] - step:717/900 train_loss:0.9389 lr:0.0003000000 time/step:187.32s
|
| 767 |
+
[2025-11-30 04:29:50,043] - step:718/900 train_loss:0.9477 lr:0.0003000000 time/step:184.87s
|
| 768 |
+
[2025-11-30 04:32:53,971] - step:719/900 train_loss:0.9619 lr:0.0003000000 time/step:183.92s
|
| 769 |
+
[2025-11-30 04:36:00,320] - step:720/900 train_loss:0.9533 lr:0.0003000000 time/step:186.34s
|
| 770 |
+
[2025-11-30 04:39:07,896] - step:721/900 train_loss:0.9650 lr:0.0003000000 time/step:187.55s
|
| 771 |
+
[2025-11-30 04:42:13,833] - step:722/900 train_loss:0.9603 lr:0.0003000000 time/step:185.91s
|
| 772 |
+
[2025-11-30 04:45:20,122] - step:723/900 train_loss:0.9604 lr:0.0003000000 time/step:186.28s
|
| 773 |
+
[2025-11-30 04:48:28,513] - step:724/900 train_loss:0.9635 lr:0.0003000000 time/step:188.38s
|
| 774 |
+
[2025-11-30 04:51:34,485] - step:725/900 train_loss:0.9550 lr:0.0003000000 time/step:185.94s
|
| 775 |
+
[2025-11-30 04:54:40,827] - step:726/900 train_loss:0.9679 lr:0.0003000000 time/step:186.34s
|
| 776 |
+
[2025-11-30 04:57:50,319] - step:727/900 train_loss:0.9607 lr:0.0003000000 time/step:189.46s
|
| 777 |
+
[2025-11-30 05:00:56,724] - step:728/900 train_loss:0.9880 lr:0.0003000000 time/step:186.35s
|
| 778 |
+
[2025-11-30 05:04:02,482] - step:729/900 train_loss:0.9358 lr:0.0003000000 time/step:185.75s
|
| 779 |
+
[2025-11-30 05:07:10,367] - step:730/900 train_loss:0.9521 lr:0.0003000000 time/step:187.88s
|
| 780 |
+
[2025-11-30 05:10:16,528] - step:731/900 train_loss:0.9466 lr:0.0003000000 time/step:186.13s
|
| 781 |
+
[2025-11-30 05:13:22,743] - step:732/900 train_loss:0.9481 lr:0.0003000000 time/step:186.21s
|
| 782 |
+
[2025-11-30 05:16:29,572] - step:733/900 train_loss:0.9613 lr:0.0003000000 time/step:186.81s
|
| 783 |
+
[2025-11-30 05:19:37,538] - step:734/900 train_loss:0.9525 lr:0.0003000000 time/step:187.96s
|
| 784 |
+
[2025-11-30 05:22:40,037] - step:735/900 train_loss:0.9457 lr:0.0003000000 time/step:182.48s
|
| 785 |
+
[2025-11-30 05:25:46,040] - step:736/900 train_loss:0.9572 lr:0.0003000000 time/step:185.97s
|
| 786 |
+
[2025-11-30 05:29:04,236] - step:737/900 train_loss:0.9545 lr:0.0003000000 time/step:196.30s
|
| 787 |
+
[2025-11-30 05:32:09,010] - step:738/900 train_loss:0.9633 lr:0.0003000000 time/step:184.76s
|
| 788 |
+
[2025-11-30 05:35:14,741] - step:739/900 train_loss:0.9598 lr:0.0003000000 time/step:185.72s
|
| 789 |
+
[2025-11-30 05:38:21,357] - step:740/900 train_loss:0.9342 lr:0.0003000000 time/step:186.60s
|
| 790 |
+
[2025-11-30 05:41:26,232] - step:741/900 train_loss:0.9550 lr:0.0003000000 time/step:184.84s
|
| 791 |
+
[2025-11-30 05:44:31,223] - step:742/900 train_loss:0.9696 lr:0.0003000000 time/step:184.98s
|
| 792 |
+
[2025-11-30 05:47:42,033] - step:743/900 train_loss:0.9468 lr:0.0003000000 time/step:190.80s
|
| 793 |
+
[2025-11-30 05:50:47,075] - step:744/900 train_loss:0.9588 lr:0.0003000000 time/step:184.98s
|
| 794 |
+
[2025-11-30 05:53:51,033] - step:745/900 train_loss:0.9498 lr:0.0003000000 time/step:183.94s
|
| 795 |
+
[2025-11-30 05:56:57,016] - step:746/900 train_loss:0.9529 lr:0.0003000000 time/step:185.97s
|
| 796 |
+
[2025-11-30 06:00:01,884] - step:747/900 train_loss:0.9376 lr:0.0003000000 time/step:184.84s
|
| 797 |
+
[2025-11-30 06:03:06,392] - step:748/900 train_loss:0.9415 lr:0.0003000000 time/step:184.49s
|
| 798 |
+
[2025-11-30 06:06:13,954] - step:749/900 train_loss:0.9581 lr:0.0003000000 time/step:187.55s
|
| 799 |
+
[2025-11-30 06:09:18,747] - step:750/900 train_loss:0.9494 lr:0.0003000000 time/step:184.77s
|
| 800 |
+
[2025-11-30 06:12:24,279] - step:751/900 train_loss:0.9586 lr:0.0003000000 time/step:185.52s
|
| 801 |
+
[2025-11-30 06:15:30,040] - step:752/900 train_loss:0.9491 lr:0.0003000000 time/step:185.75s
|
| 802 |
+
[2025-11-30 06:18:37,170] - step:753/900 train_loss:0.9585 lr:0.0003000000 time/step:187.12s
|
| 803 |
+
[2025-11-30 06:21:42,398] - step:754/900 train_loss:0.9441 lr:0.0003000000 time/step:185.20s
|
| 804 |
+
[2025-11-30 06:24:48,671] - step:755/900 train_loss:0.9533 lr:0.0003000000 time/step:186.25s
|
| 805 |
+
[2025-11-30 06:27:56,633] - step:756/900 train_loss:0.9433 lr:0.0003000000 time/step:187.94s
|
| 806 |
+
[2025-11-30 06:31:02,691] - step:757/900 train_loss:0.9368 lr:0.0003000000 time/step:186.01s
|
| 807 |
+
[2025-11-30 06:34:08,615] - step:758/900 train_loss:0.9504 lr:0.0003000000 time/step:185.91s
|
| 808 |
+
[2025-11-30 06:37:15,950] - step:759/900 train_loss:0.9412 lr:0.0003000000 time/step:187.31s
|
| 809 |
+
[2025-11-30 06:40:22,539] - step:760/900 train_loss:0.9330 lr:0.0003000000 time/step:186.51s
|
| 810 |
+
[2025-11-30 06:43:28,876] - step:761/900 train_loss:0.9342 lr:0.0003000000 time/step:186.33s
|
| 811 |
+
[2025-11-30 06:46:36,580] - step:762/900 train_loss:0.9329 lr:0.0003000000 time/step:187.68s
|
| 812 |
+
[2025-11-30 06:49:43,404] - step:763/900 train_loss:0.9465 lr:0.0003000000 time/step:186.79s
|
| 813 |
+
[2025-11-30 06:52:49,437] - step:764/900 train_loss:0.9507 lr:0.0003000000 time/step:186.01s
|
| 814 |
+
[2025-11-30 06:55:55,801] - step:765/900 train_loss:0.9754 lr:0.0003000000 time/step:186.35s
|
| 815 |
+
[2025-11-30 06:59:04,165] - step:766/900 train_loss:0.9323 lr:0.0003000000 time/step:188.35s
|
| 816 |
+
[2025-11-30 07:02:09,611] - step:767/900 train_loss:0.9398 lr:0.0003000000 time/step:185.37s
|
| 817 |
+
[2025-11-30 07:05:15,543] - step:768/900 train_loss:0.9773 lr:0.0003000000 time/step:185.92s
|
| 818 |
+
[2025-11-30 07:08:23,040] - step:769/900 train_loss:0.9300 lr:0.0003000000 time/step:187.49s
|
| 819 |
+
[2025-11-30 07:11:27,989] - step:770/900 train_loss:0.9565 lr:0.0003000000 time/step:184.93s
|
| 820 |
+
[2025-11-30 07:14:34,166] - step:771/900 train_loss:0.9791 lr:0.0003000000 time/step:186.17s
|
| 821 |
+
[2025-11-30 07:17:41,334] - step:772/900 train_loss:0.9323 lr:0.0003000000 time/step:187.15s
|
| 822 |
+
[2025-11-30 07:20:48,245] - step:773/900 train_loss:0.9384 lr:0.0003000000 time/step:186.89s
|
| 823 |
+
[2025-11-30 07:23:55,000] - step:774/900 train_loss:0.9620 lr:0.0003000000 time/step:186.75s
|
| 824 |
+
[2025-11-30 07:27:04,805] - step:775/900 train_loss:0.9535 lr:0.0003000000 time/step:189.79s
|
| 825 |
+
[2025-11-30 07:30:11,933] - step:776/900 train_loss:0.9500 lr:0.0003000000 time/step:187.11s
|
| 826 |
+
[2025-11-30 07:33:18,809] - step:777/900 train_loss:0.9556 lr:0.0003000000 time/step:186.84s
|
| 827 |
+
[2025-11-30 07:36:25,083] - step:778/900 train_loss:0.9280 lr:0.0003000000 time/step:186.27s
|
| 828 |
+
[2025-11-30 07:39:34,387] - step:779/900 train_loss:0.9373 lr:0.0003000000 time/step:189.30s
|
| 829 |
+
[2025-11-30 07:42:40,512] - step:780/900 train_loss:0.9556 lr:0.0003000000 time/step:186.10s
|
| 830 |
+
[2025-11-30 07:45:46,330] - step:781/900 train_loss:0.9568 lr:0.0003000000 time/step:185.80s
|
| 831 |
+
[2025-11-30 07:48:53,453] - step:782/900 train_loss:0.9737 lr:0.0003000000 time/step:187.11s
|
| 832 |
+
[2025-11-30 07:51:59,918] - step:783/900 train_loss:0.9267 lr:0.0003000000 time/step:186.44s
|
| 833 |
+
[2025-11-30 07:55:06,653] - step:784/900 train_loss:0.9683 lr:0.0003000000 time/step:186.73s
|
| 834 |
+
[2025-11-30 07:58:14,271] - step:785/900 train_loss:0.9249 lr:0.0003000000 time/step:187.60s
|
| 835 |
+
[2025-11-30 08:01:21,688] - step:786/900 train_loss:0.9586 lr:0.0003000000 time/step:187.32s
|
| 836 |
+
[2025-11-30 08:04:28,087] - step:787/900 train_loss:0.9470 lr:0.0003000000 time/step:186.39s
|
| 837 |
+
[2025-11-30 08:07:34,899] - step:788/900 train_loss:0.9591 lr:0.0003000000 time/step:186.79s
|
| 838 |
+
[2025-11-30 08:10:41,723] - step:789/900 train_loss:0.9433 lr:0.0003000000 time/step:186.81s
|
| 839 |
+
[2025-11-30 08:13:47,670] - step:790/900 train_loss:0.9496 lr:0.0003000000 time/step:185.92s
|
| 840 |
+
[2025-11-30 08:16:53,831] - step:791/900 train_loss:0.9459 lr:0.0003000000 time/step:186.15s
|
| 841 |
+
[2025-11-30 08:20:01,102] - step:792/900 train_loss:0.9601 lr:0.0003000000 time/step:187.25s
|
| 842 |
+
[2025-11-30 08:23:06,763] - step:793/900 train_loss:0.9408 lr:0.0003000000 time/step:185.64s
|
| 843 |
+
[2025-11-30 08:26:12,187] - step:794/900 train_loss:0.9571 lr:0.0003000000 time/step:185.41s
|
| 844 |
+
[2025-11-30 08:29:18,964] - step:795/900 train_loss:0.9670 lr:0.0003000000 time/step:186.77s
|
| 845 |
+
[2025-11-30 08:32:24,852] - step:796/900 train_loss:0.9432 lr:0.0003000000 time/step:185.86s
|
| 846 |
+
[2025-11-30 08:35:30,345] - step:797/900 train_loss:0.9347 lr:0.0003000000 time/step:185.49s
|
| 847 |
+
[2025-11-30 08:38:37,005] - step:798/900 train_loss:0.9431 lr:0.0003000000 time/step:186.65s
|
| 848 |
+
[2025-11-30 08:41:44,291] - step:799/900 train_loss:0.9548 lr:0.0003000000 time/step:187.24s
|
| 849 |
+
[2025-11-30 08:44:52,246] - step:800/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@800.pt...
|
| 850 |
+
[2025-11-30 08:44:52,247] - step:800/900 train_loss:0.9472 lr:0.0003000000 time/step:186.19s
|
| 851 |
+
[2025-11-30 08:47:59,432] - step:801/900 train_loss:0.9580 lr:0.0003000000 time/step:187.17s
|
| 852 |
+
[2025-11-30 08:51:07,677] - step:802/900 train_loss:0.9347 lr:0.0003000000 time/step:188.21s
|
| 853 |
+
[2025-11-30 08:54:13,610] - step:803/900 train_loss:0.9552 lr:0.0003000000 time/step:185.91s
|
| 854 |
+
[2025-11-30 08:57:20,822] - step:804/900 train_loss:0.9433 lr:0.0003000000 time/step:187.20s
|
| 855 |
+
[2025-11-30 09:00:27,788] - step:805/900 train_loss:0.9725 lr:0.0003000000 time/step:186.95s
|
| 856 |
+
[2025-11-30 09:03:33,451] - step:806/900 train_loss:0.9319 lr:0.0003000000 time/step:185.63s
|
| 857 |
+
[2025-11-30 09:06:38,728] - step:807/900 train_loss:0.9416 lr:0.0003000000 time/step:185.26s
|
| 858 |
+
[2025-11-30 09:09:45,243] - step:808/900 train_loss:0.9305 lr:0.0003000000 time/step:186.49s
|
| 859 |
+
[2025-11-30 09:12:51,277] - step:809/900 train_loss:0.9611 lr:0.0003000000 time/step:186.01s
|
| 860 |
+
[2025-11-30 09:15:57,601] - step:810/900 train_loss:0.9333 lr:0.0003000000 time/step:186.32s
|
| 861 |
+
[2025-11-30 09:19:05,758] - step:811/900 train_loss:0.9224 lr:0.0003000000 time/step:188.14s
|
| 862 |
+
[2025-11-30 09:22:13,002] - step:812/900 train_loss:0.9311 lr:0.0003000000 time/step:187.20s
|
| 863 |
+
[2025-11-30 09:25:19,895] - step:813/900 train_loss:0.9344 lr:0.0003000000 time/step:186.89s
|
| 864 |
+
[2025-11-30 09:28:27,220] - step:814/900 train_loss:0.9558 lr:0.0003000000 time/step:187.31s
|
| 865 |
+
[2025-11-30 09:31:34,977] - step:815/900 train_loss:0.9603 lr:0.0003000000 time/step:187.65s
|
| 866 |
+
[2025-11-30 09:34:41,403] - step:816/900 train_loss:0.9405 lr:0.0003000000 time/step:186.42s
|
| 867 |
+
[2025-11-30 09:37:48,995] - step:817/900 train_loss:0.9620 lr:0.0003000000 time/step:187.58s
|
| 868 |
+
[2025-11-30 09:40:56,468] - step:818/900 train_loss:0.9415 lr:0.0003000000 time/step:187.44s
|
| 869 |
+
[2025-11-30 09:44:02,893] - step:819/900 train_loss:0.9391 lr:0.0003000000 time/step:186.41s
|
| 870 |
+
[2025-11-30 09:47:08,977] - step:820/900 train_loss:0.9551 lr:0.0003000000 time/step:186.08s
|
| 871 |
+
[2025-11-30 09:50:18,823] - step:821/900 train_loss:0.9585 lr:0.0003000000 time/step:189.84s
|
| 872 |
+
[2025-11-30 09:53:24,812] - step:822/900 train_loss:0.9449 lr:0.0003000000 time/step:185.94s
|
| 873 |
+
[2025-11-30 09:56:30,627] - step:823/900 train_loss:0.9446 lr:0.0003000000 time/step:185.81s
|
| 874 |
+
[2025-11-30 09:59:38,002] - step:824/900 train_loss:0.9589 lr:0.0003000000 time/step:187.37s
|
| 875 |
+
[2025-11-30 10:02:45,105] - step:825/900 train_loss:0.9660 lr:0.0003000000 time/step:187.08s
|
| 876 |
+
[2025-11-30 10:05:50,874] - step:826/900 train_loss:0.9403 lr:0.0003000000 time/step:185.75s
|
| 877 |
+
[2025-11-30 10:08:57,938] - step:827/900 train_loss:0.9435 lr:0.0003000000 time/step:187.06s
|
| 878 |
+
[2025-11-30 10:12:06,575] - step:828/900 train_loss:0.9462 lr:0.0003000000 time/step:188.61s
|
| 879 |
+
[2025-11-30 10:15:12,880] - step:829/900 train_loss:0.9383 lr:0.0003000000 time/step:186.30s
|
| 880 |
+
[2025-11-30 10:18:19,887] - step:830/900 train_loss:0.9513 lr:0.0003000000 time/step:187.00s
|
| 881 |
+
[2025-11-30 10:21:26,611] - step:831/900 train_loss:0.9434 lr:0.0003000000 time/step:186.71s
|
| 882 |
+
[2025-11-30 10:24:32,173] - step:832/900 train_loss:0.9277 lr:0.0003000000 time/step:185.55s
|
| 883 |
+
[2025-11-30 10:27:38,320] - step:833/900 train_loss:0.9638 lr:0.0003000000 time/step:186.13s
|
| 884 |
+
[2025-11-30 10:30:45,530] - step:834/900 train_loss:0.9344 lr:0.0003000000 time/step:187.17s
|
| 885 |
+
[2025-11-30 10:33:51,305] - step:835/900 train_loss:0.9318 lr:0.0003000000 time/step:185.76s
|
| 886 |
+
[2025-11-30 10:36:56,815] - step:836/900 train_loss:0.9660 lr:0.0003000000 time/step:185.50s
|
| 887 |
+
[2025-11-30 10:40:03,772] - step:837/900 train_loss:0.9189 lr:0.0003000000 time/step:186.95s
|
| 888 |
+
[2025-11-30 10:43:10,385] - step:838/900 train_loss:0.9294 lr:0.0003000000 time/step:186.60s
|
| 889 |
+
[2025-11-30 10:46:16,106] - step:839/900 train_loss:0.9562 lr:0.0003000000 time/step:185.71s
|
| 890 |
+
[2025-11-30 10:49:23,701] - step:840/900 train_loss:0.9308 lr:0.0003000000 time/step:187.59s
|
| 891 |
+
[2025-11-30 10:52:31,109] - step:841/900 train_loss:0.9446 lr:0.0003000000 time/step:187.37s
|
| 892 |
+
[2025-11-30 10:55:36,766] - step:842/900 train_loss:0.9646 lr:0.0003000000 time/step:185.64s
|
| 893 |
+
[2025-11-30 10:58:43,991] - step:843/900 train_loss:0.9662 lr:0.0003000000 time/step:187.22s
|
| 894 |
+
[2025-11-30 11:01:51,907] - step:844/900 train_loss:0.9557 lr:0.0003000000 time/step:187.91s
|
| 895 |
+
[2025-11-30 11:04:58,972] - step:845/900 train_loss:0.9409 lr:0.0003000000 time/step:187.04s
|
| 896 |
+
[2025-11-30 11:08:05,417] - step:846/900 train_loss:0.9277 lr:0.0003000000 time/step:186.44s
|
| 897 |
+
[2025-11-30 11:11:12,807] - step:847/900 train_loss:0.9310 lr:0.0003000000 time/step:187.37s
|
| 898 |
+
[2025-11-30 11:14:18,599] - step:848/900 train_loss:0.9528 lr:0.0003000000 time/step:185.78s
|
| 899 |
+
[2025-11-30 11:17:24,283] - step:849/900 train_loss:0.9435 lr:0.0003000000 time/step:185.67s
|
| 900 |
+
[2025-11-30 11:20:29,958] - step:850/900 train_loss:0.9328 lr:0.0003000000 time/step:185.67s
|
| 901 |
+
[2025-11-30 11:23:37,238] - step:851/900 train_loss:0.9586 lr:0.0003000000 time/step:187.25s
|
| 902 |
+
[2025-11-30 11:26:43,509] - step:852/900 train_loss:0.9788 lr:0.0003000000 time/step:186.26s
|
| 903 |
+
[2025-11-30 11:29:50,577] - step:853/900 train_loss:0.9598 lr:0.0003000000 time/step:187.04s
|
| 904 |
+
[2025-11-30 11:32:58,592] - step:854/900 train_loss:0.9314 lr:0.0003000000 time/step:187.98s
|
| 905 |
+
[2025-11-30 11:36:05,368] - step:855/900 train_loss:0.9431 lr:0.0003000000 time/step:186.76s
|
| 906 |
+
[2025-11-30 11:39:14,068] - step:856/900 train_loss:0.9402 lr:0.0003000000 time/step:188.69s
|
| 907 |
+
[2025-11-30 11:42:21,149] - step:857/900 train_loss:0.9406 lr:0.0003000000 time/step:187.03s
|
| 908 |
+
[2025-11-30 11:45:27,269] - step:858/900 train_loss:0.9517 lr:0.0003000000 time/step:186.10s
|
| 909 |
+
[2025-11-30 11:48:33,589] - step:859/900 train_loss:0.9288 lr:0.0003000000 time/step:186.29s
|
| 910 |
+
[2025-11-30 11:51:41,526] - step:860/900 train_loss:0.9489 lr:0.0003000000 time/step:187.92s
|
| 911 |
+
[2025-11-30 11:54:48,310] - step:861/900 train_loss:0.9242 lr:0.0003000000 time/step:186.76s
|
| 912 |
+
[2025-11-30 11:57:55,433] - step:862/900 train_loss:0.9465 lr:0.0003000000 time/step:187.12s
|
| 913 |
+
[2025-11-30 12:01:02,214] - step:863/900 train_loss:0.9319 lr:0.0003000000 time/step:186.77s
|
| 914 |
+
[2025-11-30 12:04:10,157] - step:864/900 train_loss:0.9561 lr:0.0003000000 time/step:187.93s
|
| 915 |
+
[2025-11-30 12:07:16,580] - step:865/900 train_loss:0.9531 lr:0.0003000000 time/step:186.41s
|
| 916 |
+
[2025-11-30 12:10:24,225] - step:866/900 train_loss:0.9716 lr:0.0003000000 time/step:187.64s
|
| 917 |
+
[2025-11-30 12:13:32,116] - step:867/900 train_loss:0.9523 lr:0.0003000000 time/step:187.86s
|
| 918 |
+
[2025-11-30 12:16:38,751] - step:868/900 train_loss:0.9485 lr:0.0003000000 time/step:186.63s
|
| 919 |
+
[2025-11-30 12:19:46,482] - step:869/900 train_loss:0.9338 lr:0.0003000000 time/step:187.71s
|
| 920 |
+
[2025-11-30 12:22:55,768] - step:870/900 train_loss:0.9071 lr:0.0003000000 time/step:189.25s
|
| 921 |
+
[2025-11-30 12:26:03,017] - step:871/900 train_loss:0.9349 lr:0.0003000000 time/step:187.24s
|
| 922 |
+
[2025-11-30 12:29:11,277] - step:872/900 train_loss:0.9171 lr:0.0003000000 time/step:188.25s
|
| 923 |
+
[2025-11-30 12:32:17,975] - step:873/900 train_loss:0.9291 lr:0.0003000000 time/step:186.69s
|
| 924 |
+
[2025-11-30 12:35:23,652] - step:874/900 train_loss:0.9496 lr:0.0003000000 time/step:185.66s
|
| 925 |
+
[2025-11-30 12:38:30,634] - step:875/900 train_loss:0.9004 lr:0.0003000000 time/step:186.96s
|
| 926 |
+
[2025-11-30 12:41:36,335] - step:876/900 train_loss:0.9638 lr:0.0003000000 time/step:185.69s
|
| 927 |
+
[2025-11-30 12:44:43,698] - step:877/900 train_loss:0.9303 lr:0.0003000000 time/step:187.35s
|
| 928 |
+
[2025-11-30 12:47:49,883] - step:878/900 train_loss:0.9308 lr:0.0003000000 time/step:186.17s
|
| 929 |
+
[2025-11-30 12:50:57,119] - step:879/900 train_loss:0.9567 lr:0.0003000000 time/step:187.23s
|
| 930 |
+
[2025-11-30 12:54:05,570] - step:880/900 train_loss:0.9294 lr:0.0003000000 time/step:188.35s
|
| 931 |
+
[2025-11-30 12:57:11,908] - step:881/900 train_loss:0.9243 lr:0.0003000000 time/step:186.32s
|
| 932 |
+
[2025-11-30 13:00:18,512] - step:882/900 train_loss:0.9372 lr:0.0003000000 time/step:186.59s
|
| 933 |
+
[2025-11-30 13:03:25,956] - step:883/900 train_loss:0.9677 lr:0.0003000000 time/step:187.41s
|
| 934 |
+
[2025-11-30 13:06:32,030] - step:884/900 train_loss:0.9502 lr:0.0003000000 time/step:186.05s
|
| 935 |
+
[2025-11-30 13:09:38,746] - step:885/900 train_loss:0.9309 lr:0.0003000000 time/step:186.69s
|
| 936 |
+
[2025-11-30 13:12:45,777] - step:886/900 train_loss:0.9468 lr:0.0003000000 time/step:186.96s
|
| 937 |
+
[2025-11-30 13:15:51,552] - step:887/900 train_loss:0.9319 lr:0.0003000000 time/step:185.76s
|
| 938 |
+
[2025-11-30 13:18:58,191] - step:888/900 train_loss:0.9400 lr:0.0003000000 time/step:186.63s
|
| 939 |
+
[2025-11-30 13:22:04,951] - step:889/900 train_loss:0.9518 lr:0.0003000000 time/step:186.75s
|
| 940 |
+
[2025-11-30 13:25:13,370] - step:890/900 train_loss:0.9375 lr:0.0003000000 time/step:188.37s
|
| 941 |
+
[2025-11-30 13:28:19,675] - step:891/900 train_loss:0.9699 lr:0.0003000000 time/step:186.29s
|
| 942 |
+
[2025-11-30 13:31:27,143] - step:892/900 train_loss:0.9479 lr:0.0003000000 time/step:187.46s
|
| 943 |
+
[2025-11-30 13:34:34,338] - step:893/900 train_loss:0.9351 lr:0.0003000000 time/step:187.14s
|
| 944 |
+
[2025-11-30 13:37:40,472] - step:894/900 train_loss:0.9767 lr:0.0003000000 time/step:186.13s
|
| 945 |
+
[2025-11-30 13:40:48,083] - step:895/900 train_loss:0.9475 lr:0.0003000000 time/step:187.60s
|
| 946 |
+
[2025-11-30 13:43:55,818] - step:896/900 train_loss:0.9617 lr:0.0003000000 time/step:187.71s
|
| 947 |
+
[2025-11-30 13:47:01,872] - step:897/900 train_loss:0.9549 lr:0.0003000000 time/step:186.04s
|
| 948 |
+
[2025-11-30 13:50:09,170] - step:898/900 train_loss:0.9324 lr:0.0003000000 time/step:187.29s
|
| 949 |
+
[2025-11-30 13:53:18,377] - step:899/900 train_loss:0.9573 lr:0.0003000000 time/step:189.17s
|
| 950 |
+
[2025-11-30 13:56:26,957] - step:900/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/model@900.pt...
|
| 951 |
+
[2025-11-30 13:56:26,959] - step:900/900 train_loss:0.9387 lr:0.0003000000 time/step:186.63s
|
wandb/run-20251128_151948-j8dmy8fe/files/requirements.txt
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
parso==0.8.4
|
| 2 |
+
pydantic_core==2.27.2
|
| 3 |
+
charset-normalizer==3.4.1
|
| 4 |
+
xxhash==3.5.0
|
| 5 |
+
PyYAML==6.0.2
|
| 6 |
+
transformers==4.49.0
|
| 7 |
+
idna==3.10
|
| 8 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 9 |
+
datasets==4.0.0
|
| 10 |
+
numpy==2.2.3
|
| 11 |
+
hydra-core==1.3.2
|
| 12 |
+
Pygments==2.19.1
|
| 13 |
+
rich==14.0.0
|
| 14 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 15 |
+
urllib3==2.3.0
|
| 16 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 17 |
+
contourpy==1.3.1
|
| 18 |
+
cycler==0.12.1
|
| 19 |
+
decorator==5.2.1
|
| 20 |
+
psutil==7.0.0
|
| 21 |
+
aiohttp==3.11.13
|
| 22 |
+
einops==0.8.1
|
| 23 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 24 |
+
exceptiongroup==1.2.2
|
| 25 |
+
stack-data==0.6.3
|
| 26 |
+
setproctitle==1.3.5
|
| 27 |
+
fsspec==2024.12.0
|
| 28 |
+
tueplots==0.2.0
|
| 29 |
+
pexpect==4.9.0
|
| 30 |
+
gitdb==4.0.12
|
| 31 |
+
fonttools==4.56.0
|
| 32 |
+
ipython==8.35.0
|
| 33 |
+
huggingface-hub==0.29.2
|
| 34 |
+
filelock==3.17.0
|
| 35 |
+
torchvision==0.21.0+cu124
|
| 36 |
+
platformdirs==4.3.6
|
| 37 |
+
peft==0.15.1
|
| 38 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 39 |
+
wandb==0.19.8
|
| 40 |
+
click==8.1.8
|
| 41 |
+
mpmath==1.3.0
|
| 42 |
+
Jinja2==3.1.6
|
| 43 |
+
scipy==1.14.1
|
| 44 |
+
markdown-it-py==3.0.0
|
| 45 |
+
matplotlib-inline==0.1.7
|
| 46 |
+
wheel==0.45.1
|
| 47 |
+
setuptools==75.8.2
|
| 48 |
+
tqdm==4.67.1
|
| 49 |
+
antlr4-python3-runtime==4.9.3
|
| 50 |
+
deepspeed==0.16.7
|
| 51 |
+
omegaconf==2.3.0
|
| 52 |
+
torchaudio==2.6.0+cu124
|
| 53 |
+
aiosignal==1.3.2
|
| 54 |
+
accelerate==1.6.0
|
| 55 |
+
py-cpuinfo==9.0.0
|
| 56 |
+
pyparsing==3.2.1
|
| 57 |
+
ninja==1.11.1.4
|
| 58 |
+
pandas==2.2.3
|
| 59 |
+
six==1.17.0
|
| 60 |
+
wcwidth==0.2.13
|
| 61 |
+
safetensors==0.5.3
|
| 62 |
+
attrs==25.1.0
|
| 63 |
+
python-dateutil==2.9.0.post0
|
| 64 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 65 |
+
multiprocess==0.70.16
|
| 66 |
+
seaborn==0.13.2
|
| 67 |
+
networkx==3.4.2
|
| 68 |
+
regex==2024.11.6
|
| 69 |
+
nvidia-nvtx-cu12==12.4.127
|
| 70 |
+
tokenizers==0.21.0
|
| 71 |
+
nvidia-curand-cu12==10.3.5.147
|
| 72 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 73 |
+
MarkupSafe==3.0.2
|
| 74 |
+
triton==3.1.0
|
| 75 |
+
pip==25.0.1
|
| 76 |
+
jedi==0.19.2
|
| 77 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 78 |
+
iniconfig==2.0.0
|
| 79 |
+
pluggy==1.5.0
|
| 80 |
+
langdetect==1.0.9
|
| 81 |
+
pure_eval==0.2.3
|
| 82 |
+
docker-pycreds==0.4.0
|
| 83 |
+
libcirkit==0.2.1
|
| 84 |
+
mdurl==0.1.2
|
| 85 |
+
annotated-types==0.7.0
|
| 86 |
+
sentry-sdk==2.22.0
|
| 87 |
+
executing==2.2.0
|
| 88 |
+
pydantic==2.10.6
|
| 89 |
+
opt_einsum==3.4.0
|
| 90 |
+
pytz==2025.1
|
| 91 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 92 |
+
protobuf==5.29.3
|
| 93 |
+
requests==2.32.3
|
| 94 |
+
tomli==2.2.1
|
| 95 |
+
matplotlib==3.10.1
|
| 96 |
+
hjson==3.1.0
|
| 97 |
+
frozenlist==1.5.0
|
| 98 |
+
pillow==11.1.0
|
| 99 |
+
GitPython==3.1.44
|
| 100 |
+
typing_extensions==4.12.2
|
| 101 |
+
pyarrow==19.0.1
|
| 102 |
+
propcache==0.3.0
|
| 103 |
+
prompt_toolkit==3.0.51
|
| 104 |
+
torch==2.6.0+cu124
|
| 105 |
+
async-timeout==5.0.1
|
| 106 |
+
bitsandbytes==0.45.5
|
| 107 |
+
trl==0.16.1
|
| 108 |
+
ptyprocess==0.7.0
|
| 109 |
+
dill==0.3.8
|
| 110 |
+
pytest==8.3.5
|
| 111 |
+
nvidia-nccl-cu12==2.21.5
|
| 112 |
+
sympy==1.13.1
|
| 113 |
+
flash_attn==2.7.4.post1
|
| 114 |
+
certifi==2025.1.31
|
| 115 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 116 |
+
tzdata==2025.1
|
| 117 |
+
aiohappyeyeballs==2.5.0
|
| 118 |
+
msgpack==1.1.0
|
| 119 |
+
traitlets==5.14.3
|
| 120 |
+
multidict==6.1.0
|
| 121 |
+
packaging==24.2
|
| 122 |
+
kiwisolver==1.4.8
|
| 123 |
+
smmap==5.0.2
|
| 124 |
+
asttokens==3.0.0
|
| 125 |
+
yarl==1.18.3
|
| 126 |
+
graphviz==0.20.3
|
wandb/run-20251128_151948-j8dmy8fe/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.8.0-57-generic-x86_64-with-glibc2.39",
|
| 3 |
+
"python": "CPython 3.10.16",
|
| 4 |
+
"startedAt": "2025-11-28T15:19:48.470090Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"data=tulu3-llama3-packed",
|
| 7 |
+
"training=tulu3-evabyte-1epoch",
|
| 8 |
+
"lm=llama3-2-3b-byte",
|
| 9 |
+
"model=mtp",
|
| 10 |
+
"adaptor=none",
|
| 11 |
+
"mt_head=linear-evabyte",
|
| 12 |
+
"circuit=btree",
|
| 13 |
+
"circuit.n_token=8",
|
| 14 |
+
"circuit.n_component=32",
|
| 15 |
+
"circuit.n_repetition=1",
|
| 16 |
+
"training.device_batch_size=1",
|
| 17 |
+
"model.model.beta=0",
|
| 18 |
+
"model.model.gamma=0.9",
|
| 19 |
+
"data.val_bin=null",
|
| 20 |
+
"training.learning_rate=0.0003",
|
| 21 |
+
"training.expname=llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1"
|
| 22 |
+
],
|
| 23 |
+
"program": "-m mtp.train",
|
| 24 |
+
"git": {
|
| 25 |
+
"remote": "git@github.com:PiotrNawrot/nanoGPT.git",
|
| 26 |
+
"commit": "348442692ab18a9196652fdb2c860734ac87a6f4"
|
| 27 |
+
},
|
| 28 |
+
"email": "agrivas@inf.ed.ac.uk",
|
| 29 |
+
"root": "/disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34",
|
| 30 |
+
"host": "scotia04.inf.ed.ac.uk",
|
| 31 |
+
"executable": "/home/agrivas/nanoGPT/.venv/bin/python3",
|
| 32 |
+
"cpu_count": 24,
|
| 33 |
+
"cpu_count_logical": 48,
|
| 34 |
+
"gpu": "NVIDIA L40S",
|
| 35 |
+
"gpu_count": 2,
|
| 36 |
+
"disk": {
|
| 37 |
+
"/": {
|
| 38 |
+
"total": "184643391488",
|
| 39 |
+
"used": "38313848832"
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"memory": {
|
| 43 |
+
"total": "540522954752"
|
| 44 |
+
},
|
| 45 |
+
"cpu": {
|
| 46 |
+
"count": 24,
|
| 47 |
+
"countLogical": 48
|
| 48 |
+
},
|
| 49 |
+
"gpu_nvidia": [
|
| 50 |
+
{
|
| 51 |
+
"name": "NVIDIA L40S",
|
| 52 |
+
"memoryTotal": "48305799168",
|
| 53 |
+
"cudaCores": 18176,
|
| 54 |
+
"architecture": "Ada"
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"name": "NVIDIA L40S",
|
| 58 |
+
"memoryTotal": "48305799168",
|
| 59 |
+
"cudaCores": 18176,
|
| 60 |
+
"architecture": "Ada"
|
| 61 |
+
}
|
| 62 |
+
],
|
| 63 |
+
"slurm": {
|
| 64 |
+
"cluster_name": "landoniacluster",
|
| 65 |
+
"conf": "/etc/slurm/slurm.conf",
|
| 66 |
+
"cpus_on_node": "12",
|
| 67 |
+
"cpus_per_gpu": "6",
|
| 68 |
+
"gpus_on_node": "2",
|
| 69 |
+
"gtids": "0",
|
| 70 |
+
"job_account": "research-staff",
|
| 71 |
+
"job_cpus_per_node": "12",
|
| 72 |
+
"job_end_time": "1764774672",
|
| 73 |
+
"job_gid": "10000",
|
| 74 |
+
"job_gpus": "1,3",
|
| 75 |
+
"job_id": "2137456",
|
| 76 |
+
"job_name": "slurm.sh",
|
| 77 |
+
"job_nodelist": "scotia04",
|
| 78 |
+
"job_num_nodes": "1",
|
| 79 |
+
"job_partition": "PGR-Standard",
|
| 80 |
+
"job_qos": "normal",
|
| 81 |
+
"job_start_time": "1764342672",
|
| 82 |
+
"job_uid": "1782564",
|
| 83 |
+
"job_user": "agrivas",
|
| 84 |
+
"jobid": "2137456",
|
| 85 |
+
"localid": "0",
|
| 86 |
+
"mem_per_node": "48000",
|
| 87 |
+
"nnodes": "1",
|
| 88 |
+
"nodeid": "0",
|
| 89 |
+
"nodelist": "scotia04",
|
| 90 |
+
"nprocs": "1",
|
| 91 |
+
"ntasks": "1",
|
| 92 |
+
"prio_process": "0",
|
| 93 |
+
"procid": "0",
|
| 94 |
+
"submit_dir": "/home/agrivas",
|
| 95 |
+
"submit_host": "hastings.inf.ed.ac.uk",
|
| 96 |
+
"task_pid": "3728705",
|
| 97 |
+
"tasks_per_node": "1",
|
| 98 |
+
"topology_addr": "scotia04",
|
| 99 |
+
"topology_addr_pattern": "node"
|
| 100 |
+
},
|
| 101 |
+
"cudaVersion": "12.8"
|
| 102 |
+
}
|
wandb/run-20251128_151948-j8dmy8fe/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_step":899,"_runtime":167798.529724435,"train/ce_loss_at_5":1.5002950429916382,"train/ce_loss_at_1":0.23747298121452332,"_timestamp":1.7645109869604163e+09,"_wandb":{"runtime":167800},"train/ce_loss_at_4":0.9499043822288513,"global_step":900,"train/ce_loss_at_8":1.6754989624023438,"train/ce_loss_at_2":0.41758430004119873,"train/ce_loss_at_3":0.724645733833313,"train/ce_loss_at_7":1.6040127277374268,"train/loss":0.938701868057251,"train/ce_loss_at_6":1.3807265758514404}
|
wandb/run-20251128_151948-j8dmy8fe/logs/debug-core.log
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-11-28T15:19:45.880791352Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpio8m7k5j/port-3738330.txt","pid":3738330,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
|
| 2 |
+
{"time":"2025-11-28T15:19:45.884567829Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":3738330}
|
| 3 |
+
{"time":"2025-11-28T15:19:45.884391029Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":35531,"Zone":""}}
|
| 4 |
+
{"time":"2025-11-28T15:19:46.014595986Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:41260"}
|
| 5 |
+
{"time":"2025-11-28T15:19:48.474827439Z","level":"INFO","msg":"handleInformInit: received","streamId":"j8dmy8fe","id":"127.0.0.1:41260"}
|
| 6 |
+
{"time":"2025-11-28T15:19:48.699110621Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"j8dmy8fe","id":"127.0.0.1:41260"}
|
| 7 |
+
{"time":"2025-11-30T13:56:28.649444866Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:41260"}
|
| 8 |
+
{"time":"2025-11-30T13:56:28.751426764Z","level":"INFO","msg":"server is shutting down"}
|
| 9 |
+
{"time":"2025-11-30T13:56:28.765190328Z","level":"INFO","msg":"connection: closing","id":"127.0.0.1:41260"}
|
| 10 |
+
{"time":"2025-11-30T13:56:28.770779415Z","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:41260"}
|
| 11 |
+
{"time":"2025-11-30T13:56:30.60432487Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:41260"}
|
| 12 |
+
{"time":"2025-11-30T13:56:30.610007207Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:41260"}
|
| 13 |
+
{"time":"2025-11-30T13:56:30.61641628Z","level":"INFO","msg":"server is closed"}
|
wandb/run-20251128_151948-j8dmy8fe/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-11-28T15:19:48.477578816Z","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/run-20251128_151948-j8dmy8fe/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-11-28T15:19:48.699009069Z","level":"INFO","msg":"created new stream","id":"j8dmy8fe"}
|
| 3 |
+
{"time":"2025-11-28T15:19:48.699097403Z","level":"INFO","msg":"stream: started","id":"j8dmy8fe"}
|
| 4 |
+
{"time":"2025-11-28T15:19:48.699172779Z","level":"INFO","msg":"writer: Do: started","stream_id":"j8dmy8fe"}
|
| 5 |
+
{"time":"2025-11-28T15:19:48.699248275Z","level":"INFO","msg":"handler: started","stream_id":"j8dmy8fe"}
|
| 6 |
+
{"time":"2025-11-28T15:19:48.699271222Z","level":"INFO","msg":"sender: started","stream_id":"j8dmy8fe"}
|
| 7 |
+
{"time":"2025-11-28T15:19:49.160995748Z","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-11-28T20:38:35.352286792Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
|
| 9 |
+
{"time":"2025-11-30T13:56:28.774510068Z","level":"INFO","msg":"stream: closing","id":"j8dmy8fe"}
|
| 10 |
+
{"time":"2025-11-30T13:56:28.784684257Z","level":"INFO","msg":"Stopping system monitor"}
|
| 11 |
+
{"time":"2025-11-30T13:56:29.045992888Z","level":"INFO","msg":"Stopped system monitor"}
|
| 12 |
+
{"time":"2025-11-30T13:56:30.296345529Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 13 |
+
{"time":"2025-11-30T13:56:30.566218828Z","level":"INFO","msg":"handler: closed","stream_id":"j8dmy8fe"}
|
| 14 |
+
{"time":"2025-11-30T13:56:30.566263739Z","level":"INFO","msg":"writer: Close: closed","stream_id":"j8dmy8fe"}
|
| 15 |
+
{"time":"2025-11-30T13:56:30.573657463Z","level":"INFO","msg":"sender: closed","stream_id":"j8dmy8fe"}
|
| 16 |
+
{"time":"2025-11-30T13:56:30.573732716Z","level":"INFO","msg":"stream: closed","id":"j8dmy8fe"}
|
wandb/run-20251128_151948-j8dmy8fe/logs/debug.log
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
|
| 2 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Configure stats pid to 3738330
|
| 3 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Loading settings from /home/agrivas/.config/wandb/settings
|
| 4 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Loading settings from /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/settings
|
| 5 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_setup.py:_flush():67] Loading settings from environment variables
|
| 6 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/run-20251128_151948-j8dmy8fe/logs/debug.log
|
| 7 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /disk/scratch/agrivas/nanoGPT/logs/2025-11-28/15-16-34/wandb/run-20251128_151948-j8dmy8fe/logs/debug-internal.log
|
| 8 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:init():761] calling init triggers
|
| 9 |
+
2025-11-28 15:19:48,444 INFO MainThread:3738330 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'compile': True, 'device': 'cuda', 'from_checkpoint': None, 'load_mtp_head_from_model': None, 'name': 'nanogpt', 'training': {'random_seed': 13, 'batch_size': 256, 'device_batch_size': 1, 'sequence_length': 8192, 'num_iterations': 900, 'learning_rate': 0.0003, 'use_scheduler': False, 'save_model': True, 'save_optimizer': True, 'save_model_every': 100, 'val_loss_every': 100, 'val_tokens': 4194304, 'expname': 'llama-lr-3e-4-no-lora-btree-n-8-r-32-s-1'}, 'model': {'name': 'mtp', 'beta': 0.0, 'gamma': 1, 'kl_algorithm': 'full', 'kl_type': 'forward', 'model': {'_target_': 'mtp.models.mtp.MultiTokenLM', 'lm': '${lm.model}', 'circuit': '${circuit.model}', 'mt_head_kwargs': '${mt_head.hyperparameters}', 'init_from_lm_head': True, 'kl_type': '${model.kl_type}', 'kl_algorithm': '${model.kl_algorithm}', 'beta': 0, 'gamma': 0.9}}, 'circuit': {'name': 'btree', 'n_token': 8, 'n_component': 32, 'n_repetition': 1, 'model': {'_target_': 'mtp.models.circuits.CircuitModel', 'vocab_size': 268, 'n_token': 8, 'n_component': 32, 'n_repetition': 1, 'kind': 'btree'}}, 'mt_head': {'name': 'linear-evabyte', 'hyperparameters': {'type': 'evabyte', 'n_embd': 3072, 'transformer_n_head': 24, 'transformer_n_layer': 0, 'expander_type': 'linear', 'expander_n_layer': 1, 'freeze_vocab_unembedding': False, 'share_sum_weights': False, 'contextual_hmm_weights': True, 'init_hmm_identity': True}}, 'adaptor': {'name': 'none', 'hyperparameters': None}, 'lm': {'name': 'llama3-2-3b-byte', 'n_embd': 3072, 'n_head': 24, 'model': {'_target_': 'mtp.models.lm.LM', 'lm': None, 'encoder_only': True, 'from_checkpoint': None, 'from_huggingface': 'benjamin/Llama3-2-3B-IT-Byte', 'adaptor_kwargs': None, 'ref_enc': 'model', 'ref_head': 'lm_head', 'freeze': True}}, 'data': {'name': 'tulu3-llama3', 'train_bin': 'agrv/tulu-v3-sft-llama3-packed-seq-len-8192', 'val_bin': None, 'vocab_size': 268}, 'generate': {'speculative': False}, '_wandb': {}}
|
| 11 |
+
2025-11-28 15:19:48,445 INFO MainThread:3738330 [wandb_init.py:init():784] starting backend
|
| 12 |
+
2025-11-28 15:19:48,445 INFO MainThread:3738330 [wandb_init.py:init():788] sending inform_init request
|
| 13 |
+
2025-11-28 15:19:48,469 INFO MainThread:3738330 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-11-28 15:19:48,469 INFO MainThread:3738330 [wandb_init.py:init():798] backend started and connected
|
| 15 |
+
2025-11-28 15:19:48,474 INFO MainThread:3738330 [wandb_init.py:init():891] updated telemetry
|
| 16 |
+
2025-11-28 15:19:48,496 INFO MainThread:3738330 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-11-28 15:19:49,156 INFO MainThread:3738330 [wandb_init.py:init():990] starting run threads in backend
|
| 18 |
+
2025-11-28 15:19:49,910 INFO MainThread:3738330 [wandb_run.py:_console_start():2375] atexit reg
|
| 19 |
+
2025-11-28 15:19:49,910 INFO MainThread:3738330 [wandb_run.py:_redirect():2227] redirect: wrap_raw
|
| 20 |
+
2025-11-28 15:19:49,915 INFO MainThread:3738330 [wandb_run.py:_redirect():2292] Wrapping output streams.
|
| 21 |
+
2025-11-28 15:19:49,915 INFO MainThread:3738330 [wandb_run.py:_redirect():2315] Redirects installed.
|
| 22 |
+
2025-11-28 15:19:49,940 INFO MainThread:3738330 [wandb_init.py:init():1032] run started, returning control to user process
|
| 23 |
+
2025-11-30 13:56:28,347 INFO MsgRouterThr:3738330 [mailbox.py:close():129] Closing mailbox, abandoning 1 handles.
|
wandb/run-20251128_151948-j8dmy8fe/run-j8dmy8fe.wandb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dc008877fb114dd4d807eac90e2568c72b6bae32af4a1f66cd94af23416db3f2
|
| 3 |
+
size 14599238
|