jekunz committed on Feb 18

Commit

3fe07f0

verified ·

1 Parent(s): 86f972a

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

checkpoint-1102/config.json +32 -0
checkpoint-1102/generation_config.json +6 -0
checkpoint-1102/model.safetensors +3 -0
checkpoint-1102/optimizer.pt +3 -0
checkpoint-1102/rng_state.pth +3 -0
checkpoint-1102/scaler.pt +3 -0
checkpoint-1102/scheduler.pt +3 -0
checkpoint-1102/trainer_state.json +119 -0
checkpoint-1102/training_args.bin +3 -0
checkpoint-11020/config.json +32 -0
checkpoint-11020/generation_config.json +6 -0
checkpoint-11020/model.safetensors +3 -0
checkpoint-11020/optimizer.pt +3 -0
checkpoint-11020/rng_state.pth +3 -0
checkpoint-11020/scaler.pt +3 -0
checkpoint-11020/scheduler.pt +3 -0
checkpoint-11020/trainer_state.json +884 -0
checkpoint-11020/training_args.bin +3 -0
checkpoint-2204/config.json +32 -0
checkpoint-2204/generation_config.json +6 -0
checkpoint-2204/model.safetensors +3 -0
checkpoint-2204/optimizer.pt +3 -0
checkpoint-2204/rng_state.pth +3 -0
checkpoint-2204/scaler.pt +3 -0
checkpoint-2204/scheduler.pt +3 -0
checkpoint-2204/trainer_state.json +204 -0
checkpoint-2204/training_args.bin +3 -0
checkpoint-3306/config.json +32 -0
checkpoint-3306/generation_config.json +6 -0
checkpoint-3306/model.safetensors +3 -0
checkpoint-3306/optimizer.pt +3 -0
checkpoint-3306/rng_state.pth +3 -0
checkpoint-3306/scaler.pt +3 -0
checkpoint-3306/scheduler.pt +3 -0
checkpoint-3306/trainer_state.json +289 -0
checkpoint-3306/training_args.bin +3 -0
checkpoint-4408/config.json +32 -0
checkpoint-4408/generation_config.json +6 -0
checkpoint-4408/model.safetensors +3 -0
checkpoint-4408/optimizer.pt +3 -0
checkpoint-4408/rng_state.pth +3 -0
checkpoint-4408/scaler.pt +3 -0
checkpoint-4408/scheduler.pt +3 -0
checkpoint-4408/trainer_state.json +374 -0
checkpoint-4408/training_args.bin +3 -0
checkpoint-5510/config.json +32 -0
checkpoint-5510/generation_config.json +6 -0
checkpoint-5510/model.safetensors +3 -0
checkpoint-5510/optimizer.pt +3 -0
checkpoint-5510/rng_state.pth +3 -0

checkpoint-1102/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 512,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": 3072,
+  "n_layer": 12,
+  "n_positions": 512,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.55.1",
+  "use_cache": true,
+  "vocab_size": 50000
+}

checkpoint-1102/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.55.1"
+}

checkpoint-1102/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4664e940e6525e7580eed0b411b0219793a2d7f1414b87ab4faf26ac07ccdb76
+size 495411840

checkpoint-1102/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9fa467dc354c344596db4b11c98be257e84aca9d180b09a5a17ec127d0d59d60
+size 990920075

checkpoint-1102/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:980a1b68c7dbf538e6ffcfed871f2ea00501e6597777d59ebeacf240c8e404df
+size 14645

checkpoint-1102/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d890ab2507c2df10293362b044d4f23e8ea33efa0473aacef3746451fff4942a
+size 1383

checkpoint-1102/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8cde6d27b68e1b876a90c26a3caabdaeffd87c0b46c4f3d8df07cd4891b14659
+size 1465

checkpoint-1102/trainer_state.json ADDED Viewed

	@@ -0,0 +1,119 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1102,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.09074410163339383,
+      "grad_norm": 1.4687509536743164,
+      "learning_rate": 8.98366606170599e-06,
+      "loss": 6.9172,
+      "step": 100
+    },
+    {
+      "epoch": 0.18148820326678766,
+      "grad_norm": 1.268220067024231,
+      "learning_rate": 1.8058076225045372e-05,
+      "loss": 5.2198,
+      "step": 200
+    },
+    {
+      "epoch": 0.27223230490018147,
+      "grad_norm": 0.9486992955207825,
+      "learning_rate": 2.7132486388384752e-05,
+      "loss": 4.6092,
+      "step": 300
+    },
+    {
+      "epoch": 0.3629764065335753,
+      "grad_norm": 0.8284947276115417,
+      "learning_rate": 3.620689655172414e-05,
+      "loss": 4.2682,
+      "step": 400
+    },
+    {
+      "epoch": 0.4537205081669691,
+      "grad_norm": 0.6980849504470825,
+      "learning_rate": 4.528130671506352e-05,
+      "loss": 4.2027,
+      "step": 500
+    },
+    {
+      "epoch": 0.5444646098003629,
+      "grad_norm": 0.9210988879203796,
+      "learning_rate": 5.435571687840291e-05,
+      "loss": 3.9613,
+      "step": 600
+    },
+    {
+      "epoch": 0.6352087114337568,
+      "grad_norm": 0.8076866269111633,
+      "learning_rate": 6.343012704174229e-05,
+      "loss": 3.9595,
+      "step": 700
+    },
+    {
+      "epoch": 0.7259528130671506,
+      "grad_norm": 0.7056506276130676,
+      "learning_rate": 7.250453720508167e-05,
+      "loss": 3.9345,
+      "step": 800
+    },
+    {
+      "epoch": 0.8166969147005445,
+      "grad_norm": 0.7730728387832642,
+      "learning_rate": 8.157894736842105e-05,
+      "loss": 3.8263,
+      "step": 900
+    },
+    {
+      "epoch": 0.9074410163339383,
+      "grad_norm": 0.6958891749382019,
+      "learning_rate": 9.065335753176044e-05,
+      "loss": 3.7584,
+      "step": 1000
+    },
+    {
+      "epoch": 0.9981851179673321,
+      "grad_norm": 0.7870356440544128,
+      "learning_rate": 9.972776769509982e-05,
+      "loss": 3.6662,
+      "step": 1100
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 3.6291747093200684,
+      "eval_runtime": 9.9369,
+      "eval_samples_per_second": 394.189,
+      "eval_steps_per_second": 49.311,
+      "step": 1102
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 11020,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9210544128000000.0,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1102/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38bc65f390e0cd251f8bf044d2e93852c65dce9b93fbcd2f5d4558ba06c3e0b4
+size 5777

checkpoint-11020/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 512,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": 3072,
+  "n_layer": 12,
+  "n_positions": 512,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.55.1",
+  "use_cache": true,
+  "vocab_size": 50000
+}

checkpoint-11020/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.55.1"
+}

checkpoint-11020/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e8aa07a97c8b8148a133997c04520900b431feb8b1dbbd0c20c276ba90f3677b
+size 495411840

checkpoint-11020/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9303d39d90ef877f0b01fefc47f5a1755dd7d8463b6d5549f7ec0f98c4ecd8f3
+size 990920075

checkpoint-11020/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:029bc7ecc190ebd08eb7d9876b7268f48999ec5e606161c21b89560f5722aeab
+size 14645

checkpoint-11020/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2915d6e197efe6d8d0c73cfb2af961ab3f3d7743c08ba257267996ceef72893b
+size 1383

checkpoint-11020/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fb235176fde760bf3096416d5242276faf48d1d454591d8c36bd44dc6e96ae8
+size 1465

checkpoint-11020/trainer_state.json ADDED Viewed

	@@ -0,0 +1,884 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 10.0,
+  "eval_steps": 500,
+  "global_step": 11020,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.09074410163339383,
+      "grad_norm": 1.4687509536743164,
+      "learning_rate": 8.98366606170599e-06,
+      "loss": 6.9172,
+      "step": 100
+    },
+    {
+      "epoch": 0.18148820326678766,
+      "grad_norm": 1.268220067024231,
+      "learning_rate": 1.8058076225045372e-05,
+      "loss": 5.2198,
+      "step": 200
+    },
+    {
+      "epoch": 0.27223230490018147,
+      "grad_norm": 0.9486992955207825,
+      "learning_rate": 2.7132486388384752e-05,
+      "loss": 4.6092,
+      "step": 300
+    },
+    {
+      "epoch": 0.3629764065335753,
+      "grad_norm": 0.8284947276115417,
+      "learning_rate": 3.620689655172414e-05,
+      "loss": 4.2682,
+      "step": 400
+    },
+    {
+      "epoch": 0.4537205081669691,
+      "grad_norm": 0.6980849504470825,
+      "learning_rate": 4.528130671506352e-05,
+      "loss": 4.2027,
+      "step": 500
+    },
+    {
+      "epoch": 0.5444646098003629,
+      "grad_norm": 0.9210988879203796,
+      "learning_rate": 5.435571687840291e-05,
+      "loss": 3.9613,
+      "step": 600
+    },
+    {
+      "epoch": 0.6352087114337568,
+      "grad_norm": 0.8076866269111633,
+      "learning_rate": 6.343012704174229e-05,
+      "loss": 3.9595,
+      "step": 700
+    },
+    {
+      "epoch": 0.7259528130671506,
+      "grad_norm": 0.7056506276130676,
+      "learning_rate": 7.250453720508167e-05,
+      "loss": 3.9345,
+      "step": 800
+    },
+    {
+      "epoch": 0.8166969147005445,
+      "grad_norm": 0.7730728387832642,
+      "learning_rate": 8.157894736842105e-05,
+      "loss": 3.8263,
+      "step": 900
+    },
+    {
+      "epoch": 0.9074410163339383,
+      "grad_norm": 0.6958891749382019,
+      "learning_rate": 9.065335753176044e-05,
+      "loss": 3.7584,
+      "step": 1000
+    },
+    {
+      "epoch": 0.9981851179673321,
+      "grad_norm": 0.7870356440544128,
+      "learning_rate": 9.972776769509982e-05,
+      "loss": 3.6662,
+      "step": 1100
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 3.6291747093200684,
+      "eval_runtime": 9.9369,
+      "eval_samples_per_second": 394.189,
+      "eval_steps_per_second": 49.311,
+      "step": 1102
+    },
+    {
+      "epoch": 1.0889292196007259,
+      "grad_norm": 0.6388369798660278,
+      "learning_rate": 9.90219802379512e-05,
+      "loss": 3.5837,
+      "step": 1200
+    },
+    {
+      "epoch": 1.1796733212341197,
+      "grad_norm": 0.771931529045105,
+      "learning_rate": 9.80137124420246e-05,
+      "loss": 3.5696,
+      "step": 1300
+    },
+    {
+      "epoch": 1.2704174228675136,
+      "grad_norm": 0.8356881141662598,
+      "learning_rate": 9.700544464609801e-05,
+      "loss": 3.4928,
+      "step": 1400
+    },
+    {
+      "epoch": 1.3611615245009074,
+      "grad_norm": 0.7187588214874268,
+      "learning_rate": 9.599717685017141e-05,
+      "loss": 3.5173,
+      "step": 1500
+    },
+    {
+      "epoch": 1.4519056261343013,
+      "grad_norm": 0.76460862159729,
+      "learning_rate": 9.498890905424481e-05,
+      "loss": 3.4,
+      "step": 1600
+    },
+    {
+      "epoch": 1.542649727767695,
+      "grad_norm": 0.7365128993988037,
+      "learning_rate": 9.398064125831822e-05,
+      "loss": 3.3335,
+      "step": 1700
+    },
+    {
+      "epoch": 1.633393829401089,
+      "grad_norm": 0.6376582980155945,
+      "learning_rate": 9.297237346239162e-05,
+      "loss": 3.2894,
+      "step": 1800
+    },
+    {
+      "epoch": 1.7241379310344827,
+      "grad_norm": 0.7320665121078491,
+      "learning_rate": 9.196410566646501e-05,
+      "loss": 3.2772,
+      "step": 1900
+    },
+    {
+      "epoch": 1.8148820326678767,
+      "grad_norm": 1.4237512350082397,
+      "learning_rate": 9.095583787053841e-05,
+      "loss": 3.2934,
+      "step": 2000
+    },
+    {
+      "epoch": 1.9056261343012704,
+      "grad_norm": 0.7177993655204773,
+      "learning_rate": 8.994757007461182e-05,
+      "loss": 3.2374,
+      "step": 2100
+    },
+    {
+      "epoch": 1.9963702359346642,
+      "grad_norm": 0.7358414530754089,
+      "learning_rate": 8.893930227868522e-05,
+      "loss": 3.2032,
+      "step": 2200
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 3.1682567596435547,
+      "eval_runtime": 9.9556,
+      "eval_samples_per_second": 393.448,
+      "eval_steps_per_second": 49.219,
+      "step": 2204
+    },
+    {
+      "epoch": 2.087114337568058,
+      "grad_norm": 0.6569345593452454,
+      "learning_rate": 8.793103448275862e-05,
+      "loss": 3.1474,
+      "step": 2300
+    },
+    {
+      "epoch": 2.1778584392014517,
+      "grad_norm": 0.719578206539154,
+      "learning_rate": 8.692276668683203e-05,
+      "loss": 3.0573,
+      "step": 2400
+    },
+    {
+      "epoch": 2.268602540834846,
+      "grad_norm": 0.7317540645599365,
+      "learning_rate": 8.591449889090543e-05,
+      "loss": 3.1363,
+      "step": 2500
+    },
+    {
+      "epoch": 2.3593466424682394,
+      "grad_norm": 0.7985743880271912,
+      "learning_rate": 8.490623109497882e-05,
+      "loss": 3.0611,
+      "step": 2600
+    },
+    {
+      "epoch": 2.4500907441016335,
+      "grad_norm": 0.8037993907928467,
+      "learning_rate": 8.389796329905223e-05,
+      "loss": 3.0735,
+      "step": 2700
+    },
+    {
+      "epoch": 2.540834845735027,
+      "grad_norm": 0.7689797878265381,
+      "learning_rate": 8.288969550312563e-05,
+      "loss": 3.0639,
+      "step": 2800
+    },
+    {
+      "epoch": 2.6315789473684212,
+      "grad_norm": 0.7713281512260437,
+      "learning_rate": 8.188142770719903e-05,
+      "loss": 3.0165,
+      "step": 2900
+    },
+    {
+      "epoch": 2.722323049001815,
+      "grad_norm": 0.7961378693580627,
+      "learning_rate": 8.087315991127244e-05,
+      "loss": 2.9901,
+      "step": 3000
+    },
+    {
+      "epoch": 2.8130671506352085,
+      "grad_norm": 0.7088135480880737,
+      "learning_rate": 7.986489211534584e-05,
+      "loss": 3.0241,
+      "step": 3100
+    },
+    {
+      "epoch": 2.9038112522686026,
+      "grad_norm": 0.7496780157089233,
+      "learning_rate": 7.885662431941923e-05,
+      "loss": 2.9939,
+      "step": 3200
+    },
+    {
+      "epoch": 2.9945553539019962,
+      "grad_norm": 0.7221343517303467,
+      "learning_rate": 7.784835652349264e-05,
+      "loss": 2.9926,
+      "step": 3300
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 2.9603192806243896,
+      "eval_runtime": 10.1067,
+      "eval_samples_per_second": 387.563,
+      "eval_steps_per_second": 48.483,
+      "step": 3306
+    },
+    {
+      "epoch": 3.0852994555353903,
+      "grad_norm": 0.8570982217788696,
+      "learning_rate": 7.684008872756604e-05,
+      "loss": 2.9251,
+      "step": 3400
+    },
+    {
+      "epoch": 3.176043557168784,
+      "grad_norm": 0.7704641222953796,
+      "learning_rate": 7.583182093163945e-05,
+      "loss": 2.8877,
+      "step": 3500
+    },
+    {
+      "epoch": 3.266787658802178,
+      "grad_norm": 0.6863057017326355,
+      "learning_rate": 7.482355313571285e-05,
+      "loss": 2.8743,
+      "step": 3600
+    },
+    {
+      "epoch": 3.3575317604355717,
+      "grad_norm": 0.7227942943572998,
+      "learning_rate": 7.381528533978626e-05,
+      "loss": 2.823,
+      "step": 3700
+    },
+    {
+      "epoch": 3.4482758620689653,
+      "grad_norm": 0.7746195197105408,
+      "learning_rate": 7.280701754385966e-05,
+      "loss": 2.8962,
+      "step": 3800
+    },
+    {
+      "epoch": 3.5390199637023594,
+      "grad_norm": 0.7885217070579529,
+      "learning_rate": 7.179874974793306e-05,
+      "loss": 2.8419,
+      "step": 3900
+    },
+    {
+      "epoch": 3.629764065335753,
+      "grad_norm": 0.7623139023780823,
+      "learning_rate": 7.079048195200647e-05,
+      "loss": 2.8333,
+      "step": 4000
+    },
+    {
+      "epoch": 3.720508166969147,
+      "grad_norm": 0.8208196759223938,
+      "learning_rate": 6.978221415607986e-05,
+      "loss": 2.8302,
+      "step": 4100
+    },
+    {
+      "epoch": 3.8112522686025407,
+      "grad_norm": 0.7712786197662354,
+      "learning_rate": 6.877394636015326e-05,
+      "loss": 2.8674,
+      "step": 4200
+    },
+    {
+      "epoch": 3.901996370235935,
+      "grad_norm": 0.8100000023841858,
+      "learning_rate": 6.776567856422666e-05,
+      "loss": 2.8097,
+      "step": 4300
+    },
+    {
+      "epoch": 3.9927404718693285,
+      "grad_norm": 0.8472097516059875,
+      "learning_rate": 6.675741076830007e-05,
+      "loss": 2.8418,
+      "step": 4400
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 2.8345706462860107,
+      "eval_runtime": 9.9462,
+      "eval_samples_per_second": 393.819,
+      "eval_steps_per_second": 49.265,
+      "step": 4408
+    },
+    {
+      "epoch": 4.083484573502722,
+      "grad_norm": 0.8192525506019592,
+      "learning_rate": 6.574914297237347e-05,
+      "loss": 2.7376,
+      "step": 4500
+    },
+    {
+      "epoch": 4.174228675136116,
+      "grad_norm": 0.7768793106079102,
+      "learning_rate": 6.474087517644686e-05,
+      "loss": 2.7351,
+      "step": 4600
+    },
+    {
+      "epoch": 4.26497277676951,
+      "grad_norm": 0.8244798183441162,
+      "learning_rate": 6.373260738052027e-05,
+      "loss": 2.6991,
+      "step": 4700
+    },
+    {
+      "epoch": 4.3557168784029034,
+      "grad_norm": 0.8790801167488098,
+      "learning_rate": 6.272433958459367e-05,
+      "loss": 2.7034,
+      "step": 4800
+    },
+    {
+      "epoch": 4.4464609800362975,
+      "grad_norm": 0.7422960996627808,
+      "learning_rate": 6.171607178866707e-05,
+      "loss": 2.7208,
+      "step": 4900
+    },
+    {
+      "epoch": 4.537205081669692,
+      "grad_norm": 0.8407799601554871,
+      "learning_rate": 6.070780399274047e-05,
+      "loss": 2.7036,
+      "step": 5000
+    },
+    {
+      "epoch": 4.627949183303086,
+      "grad_norm": 0.8926665186882019,
+      "learning_rate": 5.969953619681388e-05,
+      "loss": 2.7198,
+      "step": 5100
+    },
+    {
+      "epoch": 4.718693284936479,
+      "grad_norm": 0.7808049917221069,
+      "learning_rate": 5.869126840088728e-05,
+      "loss": 2.706,
+      "step": 5200
+    },
+    {
+      "epoch": 4.809437386569873,
+      "grad_norm": 0.8481338024139404,
+      "learning_rate": 5.768300060496068e-05,
+      "loss": 2.6995,
+      "step": 5300
+    },
+    {
+      "epoch": 4.900181488203267,
+      "grad_norm": 0.8285768628120422,
+      "learning_rate": 5.667473280903408e-05,
+      "loss": 2.698,
+      "step": 5400
+    },
+    {
+      "epoch": 4.99092558983666,
+      "grad_norm": 0.8166932463645935,
+      "learning_rate": 5.566646501310748e-05,
+      "loss": 2.6797,
+      "step": 5500
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 2.74469256401062,
+      "eval_runtime": 9.9766,
+      "eval_samples_per_second": 392.619,
+      "eval_steps_per_second": 49.115,
+      "step": 5510
+    },
+    {
+      "epoch": 5.081669691470054,
+      "grad_norm": 0.8111730217933655,
+      "learning_rate": 5.4658197217180884e-05,
+      "loss": 2.5822,
+      "step": 5600
+    },
+    {
+      "epoch": 5.172413793103448,
+      "grad_norm": 0.7661839723587036,
+      "learning_rate": 5.364992942125429e-05,
+      "loss": 2.6167,
+      "step": 5700
+    },
+    {
+      "epoch": 5.2631578947368425,
+      "grad_norm": 0.8927045464515686,
+      "learning_rate": 5.2641661625327685e-05,
+      "loss": 2.6113,
+      "step": 5800
+    },
+    {
+      "epoch": 5.353901996370236,
+      "grad_norm": 0.9407956600189209,
+      "learning_rate": 5.163339382940109e-05,
+      "loss": 2.5686,
+      "step": 5900
+    },
+    {
+      "epoch": 5.44464609800363,
+      "grad_norm": 0.8638333082199097,
+      "learning_rate": 5.062512603347449e-05,
+      "loss": 2.6222,
+      "step": 6000
+    },
+    {
+      "epoch": 5.535390199637024,
+      "grad_norm": 0.8607461452484131,
+      "learning_rate": 4.96168582375479e-05,
+      "loss": 2.5987,
+      "step": 6100
+    },
+    {
+      "epoch": 5.626134301270417,
+      "grad_norm": 0.7911032438278198,
+      "learning_rate": 4.86085904416213e-05,
+      "loss": 2.5868,
+      "step": 6200
+    },
+    {
+      "epoch": 5.716878402903811,
+      "grad_norm": 0.8415167927742004,
+      "learning_rate": 4.76003226456947e-05,
+      "loss": 2.5615,
+      "step": 6300
+    },
+    {
+      "epoch": 5.807622504537205,
+      "grad_norm": 0.8212953805923462,
+      "learning_rate": 4.65920548497681e-05,
+      "loss": 2.5962,
+      "step": 6400
+    },
+    {
+      "epoch": 5.898366606170599,
+      "grad_norm": 0.8392799496650696,
+      "learning_rate": 4.55837870538415e-05,
+      "loss": 2.6055,
+      "step": 6500
+    },
+    {
+      "epoch": 5.9891107078039925,
+      "grad_norm": 0.8681470155715942,
+      "learning_rate": 4.4575519257914904e-05,
+      "loss": 2.5534,
+      "step": 6600
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 2.6848697662353516,
+      "eval_runtime": 9.9629,
+      "eval_samples_per_second": 393.16,
+      "eval_steps_per_second": 49.183,
+      "step": 6612
+    },
+    {
+      "epoch": 6.0798548094373865,
+      "grad_norm": 0.8970000147819519,
+      "learning_rate": 4.356725146198831e-05,
+      "loss": 2.5023,
+      "step": 6700
+    },
+    {
+      "epoch": 6.170598911070781,
+      "grad_norm": 0.9249663352966309,
+      "learning_rate": 4.2558983666061706e-05,
+      "loss": 2.514,
+      "step": 6800
+    },
+    {
+      "epoch": 6.261343012704174,
+      "grad_norm": 0.812436044216156,
+      "learning_rate": 4.155071587013511e-05,
+      "loss": 2.482,
+      "step": 6900
+    },
+    {
+      "epoch": 6.352087114337568,
+      "grad_norm": 0.9799634218215942,
+      "learning_rate": 4.0542448074208514e-05,
+      "loss": 2.4941,
+      "step": 7000
+    },
+    {
+      "epoch": 6.442831215970962,
+      "grad_norm": 0.9098866581916809,
+      "learning_rate": 3.953418027828191e-05,
+      "loss": 2.4911,
+      "step": 7100
+    },
+    {
+      "epoch": 6.533575317604356,
+      "grad_norm": 0.9876662492752075,
+      "learning_rate": 3.8525912482355315e-05,
+      "loss": 2.4846,
+      "step": 7200
+    },
+    {
+      "epoch": 6.624319419237749,
+      "grad_norm": 0.8980563282966614,
+      "learning_rate": 3.751764468642871e-05,
+      "loss": 2.5171,
+      "step": 7300
+    },
+    {
+      "epoch": 6.715063520871143,
+      "grad_norm": 0.893933117389679,
+      "learning_rate": 3.6509376890502117e-05,
+      "loss": 2.5136,
+      "step": 7400
+    },
+    {
+      "epoch": 6.805807622504537,
+      "grad_norm": 0.9221234321594238,
+      "learning_rate": 3.550110909457552e-05,
+      "loss": 2.4839,
+      "step": 7500
+    },
+    {
+      "epoch": 6.896551724137931,
+      "grad_norm": 1.0190584659576416,
+      "learning_rate": 3.449284129864892e-05,
+      "loss": 2.4976,
+      "step": 7600
+    },
+    {
+      "epoch": 6.987295825771325,
+      "grad_norm": 0.8790056109428406,
+      "learning_rate": 3.348457350272233e-05,
+      "loss": 2.4994,
+      "step": 7700
+    },
+    {
+      "epoch": 7.0,
+      "eval_loss": 2.64707612991333,
+      "eval_runtime": 9.9673,
+      "eval_samples_per_second": 392.987,
+      "eval_steps_per_second": 49.161,
+      "step": 7714
+    },
+    {
+      "epoch": 7.078039927404719,
+      "grad_norm": 1.0930935144424438,
+      "learning_rate": 3.2476305706795726e-05,
+      "loss": 2.4629,
+      "step": 7800
+    },
+    {
+      "epoch": 7.168784029038113,
+      "grad_norm": 1.0013751983642578,
+      "learning_rate": 3.146803791086913e-05,
+      "loss": 2.4113,
+      "step": 7900
+    },
+    {
+      "epoch": 7.259528130671506,
+      "grad_norm": 0.9267390370368958,
+      "learning_rate": 3.045977011494253e-05,
+      "loss": 2.4304,
+      "step": 8000
+    },
+    {
+      "epoch": 7.3502722323049,
+      "grad_norm": 1.0082099437713623,
+      "learning_rate": 2.9451502319015935e-05,
+      "loss": 2.3784,
+      "step": 8100
+    },
+    {
+      "epoch": 7.441016333938294,
+      "grad_norm": 1.0260313749313354,
+      "learning_rate": 2.8443234523089336e-05,
+      "loss": 2.3913,
+      "step": 8200
+    },
+    {
+      "epoch": 7.531760435571687,
+      "grad_norm": 0.900663435459137,
+      "learning_rate": 2.7434966727162736e-05,
+      "loss": 2.4293,
+      "step": 8300
+    },
+    {
+      "epoch": 7.6225045372050815,
+      "grad_norm": 0.9622400999069214,
+      "learning_rate": 2.6426698931236137e-05,
+      "loss": 2.4511,
+      "step": 8400
+    },
+    {
+      "epoch": 7.713248638838476,
+      "grad_norm": 0.9455797076225281,
+      "learning_rate": 2.541843113530954e-05,
+      "loss": 2.4199,
+      "step": 8500
+    },
+    {
+      "epoch": 7.80399274047187,
+      "grad_norm": 1.0435410737991333,
+      "learning_rate": 2.441016333938294e-05,
+      "loss": 2.4473,
+      "step": 8600
+    },
+    {
+      "epoch": 7.894736842105263,
+      "grad_norm": 0.9524980187416077,
+      "learning_rate": 2.3401895543456342e-05,
+      "loss": 2.4318,
+      "step": 8700
+    },
+    {
+      "epoch": 7.985480943738657,
+      "grad_norm": 1.0212959051132202,
+      "learning_rate": 2.2393627747529743e-05,
+      "loss": 2.4208,
+      "step": 8800
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 2.624673843383789,
+      "eval_runtime": 9.9655,
+      "eval_samples_per_second": 393.055,
+      "eval_steps_per_second": 49.169,
+      "step": 8816
+    },
+    {
+      "epoch": 8.07622504537205,
+      "grad_norm": 1.0319713354110718,
+      "learning_rate": 2.1385359951603147e-05,
+      "loss": 2.4007,
+      "step": 8900
+    },
+    {
+      "epoch": 8.166969147005444,
+      "grad_norm": 0.9200676679611206,
+      "learning_rate": 2.037709215567655e-05,
+      "loss": 2.3846,
+      "step": 9000
+    },
+    {
+      "epoch": 8.257713248638838,
+      "grad_norm": 0.9439195394515991,
+      "learning_rate": 1.9368824359749952e-05,
+      "loss": 2.3628,
+      "step": 9100
+    },
+    {
+      "epoch": 8.348457350272232,
+      "grad_norm": 0.9906073808670044,
+      "learning_rate": 1.8360556563823353e-05,
+      "loss": 2.3659,
+      "step": 9200
+    },
+    {
+      "epoch": 8.439201451905626,
+      "grad_norm": 0.9741255640983582,
+      "learning_rate": 1.7352288767896753e-05,
+      "loss": 2.3134,
+      "step": 9300
+    },
+    {
+      "epoch": 8.52994555353902,
+      "grad_norm": 0.9208199977874756,
+      "learning_rate": 1.6344020971970157e-05,
+      "loss": 2.3716,
+      "step": 9400
+    },
+    {
+      "epoch": 8.620689655172415,
+      "grad_norm": 1.0154922008514404,
+      "learning_rate": 1.5335753176043558e-05,
+      "loss": 2.36,
+      "step": 9500
+    },
+    {
+      "epoch": 8.711433756805807,
+      "grad_norm": 1.1290541887283325,
+      "learning_rate": 1.4327485380116959e-05,
+      "loss": 2.384,
+      "step": 9600
+    },
+    {
+      "epoch": 8.802177858439201,
+      "grad_norm": 0.9401739835739136,
+      "learning_rate": 1.3319217584190361e-05,
+      "loss": 2.3736,
+      "step": 9700
+    },
+    {
+      "epoch": 8.892921960072595,
+      "grad_norm": 1.0175809860229492,
+      "learning_rate": 1.2310949788263763e-05,
+      "loss": 2.3405,
+      "step": 9800
+    },
+    {
+      "epoch": 8.98366606170599,
+      "grad_norm": 1.0658190250396729,
+      "learning_rate": 1.1302681992337164e-05,
+      "loss": 2.3499,
+      "step": 9900
+    },
+    {
+      "epoch": 9.0,
+      "eval_loss": 2.6118369102478027,
+      "eval_runtime": 9.9649,
+      "eval_samples_per_second": 393.08,
+      "eval_steps_per_second": 49.173,
+      "step": 9918
+    },
+    {
+      "epoch": 9.074410163339383,
+      "grad_norm": 1.1189725399017334,
+      "learning_rate": 1.0294414196410568e-05,
+      "loss": 2.3341,
+      "step": 10000
+    },
+    {
+      "epoch": 9.165154264972777,
+      "grad_norm": 0.9556492567062378,
+      "learning_rate": 9.286146400483969e-06,
+      "loss": 2.3361,
+      "step": 10100
+    },
+    {
+      "epoch": 9.255898366606171,
+      "grad_norm": 1.1089420318603516,
+      "learning_rate": 8.277878604557371e-06,
+      "loss": 2.3237,
+      "step": 10200
+    },
+    {
+      "epoch": 9.346642468239564,
+      "grad_norm": 1.0827833414077759,
+      "learning_rate": 7.269610808630773e-06,
+      "loss": 2.3379,
+      "step": 10300
+    },
+    {
+      "epoch": 9.437386569872958,
+      "grad_norm": 1.1406363248825073,
+      "learning_rate": 6.2613430127041735e-06,
+      "loss": 2.3169,
+      "step": 10400
+    },
+    {
+      "epoch": 9.528130671506352,
+      "grad_norm": 1.1382629871368408,
+      "learning_rate": 5.253075216777576e-06,
+      "loss": 2.2953,
+      "step": 10500
+    },
+    {
+      "epoch": 9.618874773139746,
+      "grad_norm": 1.0905636548995972,
+      "learning_rate": 4.244807420850978e-06,
+      "loss": 2.3415,
+      "step": 10600
+    },
+    {
+      "epoch": 9.70961887477314,
+      "grad_norm": 1.0931600332260132,
+      "learning_rate": 3.2365396249243798e-06,
+      "loss": 2.3542,
+      "step": 10700
+    },
+    {
+      "epoch": 9.800362976406534,
+      "grad_norm": 1.0854520797729492,
+      "learning_rate": 2.228271828997782e-06,
+      "loss": 2.3083,
+      "step": 10800
+    },
+    {
+      "epoch": 9.891107078039928,
+      "grad_norm": 1.082558035850525,
+      "learning_rate": 1.2200040330711837e-06,
+      "loss": 2.2774,
+      "step": 10900
+    },
+    {
+      "epoch": 9.98185117967332,
+      "grad_norm": 0.9970278739929199,
+      "learning_rate": 2.117362371445856e-07,
+      "loss": 2.3158,
+      "step": 11000
+    },
+    {
+      "epoch": 10.0,
+      "eval_loss": 2.6086275577545166,
+      "eval_runtime": 9.9766,
+      "eval_samples_per_second": 392.617,
+      "eval_steps_per_second": 49.115,
+      "step": 11020
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 11020,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.210544128e+16,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-11020/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38bc65f390e0cd251f8bf044d2e93852c65dce9b93fbcd2f5d4558ba06c3e0b4
+size 5777

checkpoint-2204/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 512,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": 3072,
+  "n_layer": 12,
+  "n_positions": 512,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.55.1",
+  "use_cache": true,
+  "vocab_size": 50000
+}

checkpoint-2204/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.55.1"
+}

checkpoint-2204/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:371cee5c4fda9aabfc97636c74270305a0270f106c1413026093e11a09467bd6
+size 495411840

checkpoint-2204/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:af6573b11ed70c3954ca16c768614c905907654a161de392a33f3ce861d5e561
+size 990920075

checkpoint-2204/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e974088c31a741bdc9ef02bc6a3b26e16da31ea187c7c63f0f2dac77054eb596
+size 14645

checkpoint-2204/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:efa92bbd79cb151c29c6f4763d58957bd29b39038ad1cafda589b6487915d47e
+size 1383

checkpoint-2204/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fad23755ed5a6a8765f66010c5b5d6c60e734321ac41a7e825717fb75460d3d8
+size 1465

checkpoint-2204/trainer_state.json ADDED Viewed

	@@ -0,0 +1,204 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 2204,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.09074410163339383,
+      "grad_norm": 1.4687509536743164,
+      "learning_rate": 8.98366606170599e-06,
+      "loss": 6.9172,
+      "step": 100
+    },
+    {
+      "epoch": 0.18148820326678766,
+      "grad_norm": 1.268220067024231,
+      "learning_rate": 1.8058076225045372e-05,
+      "loss": 5.2198,
+      "step": 200
+    },
+    {
+      "epoch": 0.27223230490018147,
+      "grad_norm": 0.9486992955207825,
+      "learning_rate": 2.7132486388384752e-05,
+      "loss": 4.6092,
+      "step": 300
+    },
+    {
+      "epoch": 0.3629764065335753,
+      "grad_norm": 0.8284947276115417,
+      "learning_rate": 3.620689655172414e-05,
+      "loss": 4.2682,
+      "step": 400
+    },
+    {
+      "epoch": 0.4537205081669691,
+      "grad_norm": 0.6980849504470825,
+      "learning_rate": 4.528130671506352e-05,
+      "loss": 4.2027,
+      "step": 500
+    },
+    {
+      "epoch": 0.5444646098003629,
+      "grad_norm": 0.9210988879203796,
+      "learning_rate": 5.435571687840291e-05,
+      "loss": 3.9613,
+      "step": 600
+    },
+    {
+      "epoch": 0.6352087114337568,
+      "grad_norm": 0.8076866269111633,
+      "learning_rate": 6.343012704174229e-05,
+      "loss": 3.9595,
+      "step": 700
+    },
+    {
+      "epoch": 0.7259528130671506,
+      "grad_norm": 0.7056506276130676,
+      "learning_rate": 7.250453720508167e-05,
+      "loss": 3.9345,
+      "step": 800
+    },
+    {
+      "epoch": 0.8166969147005445,
+      "grad_norm": 0.7730728387832642,
+      "learning_rate": 8.157894736842105e-05,
+      "loss": 3.8263,
+      "step": 900
+    },
+    {
+      "epoch": 0.9074410163339383,
+      "grad_norm": 0.6958891749382019,
+      "learning_rate": 9.065335753176044e-05,
+      "loss": 3.7584,
+      "step": 1000
+    },
+    {
+      "epoch": 0.9981851179673321,
+      "grad_norm": 0.7870356440544128,
+      "learning_rate": 9.972776769509982e-05,
+      "loss": 3.6662,
+      "step": 1100
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 3.6291747093200684,
+      "eval_runtime": 9.9369,
+      "eval_samples_per_second": 394.189,
+      "eval_steps_per_second": 49.311,
+      "step": 1102
+    },
+    {
+      "epoch": 1.0889292196007259,
+      "grad_norm": 0.6388369798660278,
+      "learning_rate": 9.90219802379512e-05,
+      "loss": 3.5837,
+      "step": 1200
+    },
+    {
+      "epoch": 1.1796733212341197,
+      "grad_norm": 0.771931529045105,
+      "learning_rate": 9.80137124420246e-05,
+      "loss": 3.5696,
+      "step": 1300
+    },
+    {
+      "epoch": 1.2704174228675136,
+      "grad_norm": 0.8356881141662598,
+      "learning_rate": 9.700544464609801e-05,
+      "loss": 3.4928,
+      "step": 1400
+    },
+    {
+      "epoch": 1.3611615245009074,
+      "grad_norm": 0.7187588214874268,
+      "learning_rate": 9.599717685017141e-05,
+      "loss": 3.5173,
+      "step": 1500
+    },
+    {
+      "epoch": 1.4519056261343013,
+      "grad_norm": 0.76460862159729,
+      "learning_rate": 9.498890905424481e-05,
+      "loss": 3.4,
+      "step": 1600
+    },
+    {
+      "epoch": 1.542649727767695,
+      "grad_norm": 0.7365128993988037,
+      "learning_rate": 9.398064125831822e-05,
+      "loss": 3.3335,
+      "step": 1700
+    },
+    {
+      "epoch": 1.633393829401089,
+      "grad_norm": 0.6376582980155945,
+      "learning_rate": 9.297237346239162e-05,
+      "loss": 3.2894,
+      "step": 1800
+    },
+    {
+      "epoch": 1.7241379310344827,
+      "grad_norm": 0.7320665121078491,
+      "learning_rate": 9.196410566646501e-05,
+      "loss": 3.2772,
+      "step": 1900
+    },
+    {
+      "epoch": 1.8148820326678767,
+      "grad_norm": 1.4237512350082397,
+      "learning_rate": 9.095583787053841e-05,
+      "loss": 3.2934,
+      "step": 2000
+    },
+    {
+      "epoch": 1.9056261343012704,
+      "grad_norm": 0.7177993655204773,
+      "learning_rate": 8.994757007461182e-05,
+      "loss": 3.2374,
+      "step": 2100
+    },
+    {
+      "epoch": 1.9963702359346642,
+      "grad_norm": 0.7358414530754089,
+      "learning_rate": 8.893930227868522e-05,
+      "loss": 3.2032,
+      "step": 2200
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 3.1682567596435547,
+      "eval_runtime": 9.9556,
+      "eval_samples_per_second": 393.448,
+      "eval_steps_per_second": 49.219,
+      "step": 2204
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 11020,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.8421088256e+16,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-2204/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38bc65f390e0cd251f8bf044d2e93852c65dce9b93fbcd2f5d4558ba06c3e0b4
+size 5777

checkpoint-3306/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 512,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": 3072,
+  "n_layer": 12,
+  "n_positions": 512,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.55.1",
+  "use_cache": true,
+  "vocab_size": 50000
+}

checkpoint-3306/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.55.1"
+}

checkpoint-3306/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9a08a2090b45354e5af5eb1cebf64dfe28315682bb8063642e0f2d8714b9061b
+size 495411840

checkpoint-3306/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe483895c2f126675278e9e99504ce61a369a987775c0b58bd97a2c62b86a509
+size 990920075

checkpoint-3306/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aa7b51df299b32b1d15e80da6dccb211575cefa4973745a4a0aadfa9393436ab
+size 14645

checkpoint-3306/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fb3f993f19242cc21b249926529d9621fcd4005c95c36819f3b93098771d8c48
+size 1383

checkpoint-3306/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d8098d0a50b0a08da1532b22288d46dbd7b7ecd90ddf43ed734ed15ffaafe599
+size 1465

checkpoint-3306/trainer_state.json ADDED Viewed

	@@ -0,0 +1,289 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 3306,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.09074410163339383,
+      "grad_norm": 1.4687509536743164,
+      "learning_rate": 8.98366606170599e-06,
+      "loss": 6.9172,
+      "step": 100
+    },
+    {
+      "epoch": 0.18148820326678766,
+      "grad_norm": 1.268220067024231,
+      "learning_rate": 1.8058076225045372e-05,
+      "loss": 5.2198,
+      "step": 200
+    },
+    {
+      "epoch": 0.27223230490018147,
+      "grad_norm": 0.9486992955207825,
+      "learning_rate": 2.7132486388384752e-05,
+      "loss": 4.6092,
+      "step": 300
+    },
+    {
+      "epoch": 0.3629764065335753,
+      "grad_norm": 0.8284947276115417,
+      "learning_rate": 3.620689655172414e-05,
+      "loss": 4.2682,
+      "step": 400
+    },
+    {
+      "epoch": 0.4537205081669691,
+      "grad_norm": 0.6980849504470825,
+      "learning_rate": 4.528130671506352e-05,
+      "loss": 4.2027,
+      "step": 500
+    },
+    {
+      "epoch": 0.5444646098003629,
+      "grad_norm": 0.9210988879203796,
+      "learning_rate": 5.435571687840291e-05,
+      "loss": 3.9613,
+      "step": 600
+    },
+    {
+      "epoch": 0.6352087114337568,
+      "grad_norm": 0.8076866269111633,
+      "learning_rate": 6.343012704174229e-05,
+      "loss": 3.9595,
+      "step": 700
+    },
+    {
+      "epoch": 0.7259528130671506,
+      "grad_norm": 0.7056506276130676,
+      "learning_rate": 7.250453720508167e-05,
+      "loss": 3.9345,
+      "step": 800
+    },
+    {
+      "epoch": 0.8166969147005445,
+      "grad_norm": 0.7730728387832642,
+      "learning_rate": 8.157894736842105e-05,
+      "loss": 3.8263,
+      "step": 900
+    },
+    {
+      "epoch": 0.9074410163339383,
+      "grad_norm": 0.6958891749382019,
+      "learning_rate": 9.065335753176044e-05,
+      "loss": 3.7584,
+      "step": 1000
+    },
+    {
+      "epoch": 0.9981851179673321,
+      "grad_norm": 0.7870356440544128,
+      "learning_rate": 9.972776769509982e-05,
+      "loss": 3.6662,
+      "step": 1100
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 3.6291747093200684,
+      "eval_runtime": 9.9369,
+      "eval_samples_per_second": 394.189,
+      "eval_steps_per_second": 49.311,
+      "step": 1102
+    },
+    {
+      "epoch": 1.0889292196007259,
+      "grad_norm": 0.6388369798660278,
+      "learning_rate": 9.90219802379512e-05,
+      "loss": 3.5837,
+      "step": 1200
+    },
+    {
+      "epoch": 1.1796733212341197,
+      "grad_norm": 0.771931529045105,
+      "learning_rate": 9.80137124420246e-05,
+      "loss": 3.5696,
+      "step": 1300
+    },
+    {
+      "epoch": 1.2704174228675136,
+      "grad_norm": 0.8356881141662598,
+      "learning_rate": 9.700544464609801e-05,
+      "loss": 3.4928,
+      "step": 1400
+    },
+    {
+      "epoch": 1.3611615245009074,
+      "grad_norm": 0.7187588214874268,
+      "learning_rate": 9.599717685017141e-05,
+      "loss": 3.5173,
+      "step": 1500
+    },
+    {
+      "epoch": 1.4519056261343013,
+      "grad_norm": 0.76460862159729,
+      "learning_rate": 9.498890905424481e-05,
+      "loss": 3.4,
+      "step": 1600
+    },
+    {
+      "epoch": 1.542649727767695,
+      "grad_norm": 0.7365128993988037,
+      "learning_rate": 9.398064125831822e-05,
+      "loss": 3.3335,
+      "step": 1700
+    },
+    {
+      "epoch": 1.633393829401089,
+      "grad_norm": 0.6376582980155945,
+      "learning_rate": 9.297237346239162e-05,
+      "loss": 3.2894,
+      "step": 1800
+    },
+    {
+      "epoch": 1.7241379310344827,
+      "grad_norm": 0.7320665121078491,
+      "learning_rate": 9.196410566646501e-05,
+      "loss": 3.2772,
+      "step": 1900
+    },
+    {
+      "epoch": 1.8148820326678767,
+      "grad_norm": 1.4237512350082397,
+      "learning_rate": 9.095583787053841e-05,
+      "loss": 3.2934,
+      "step": 2000
+    },
+    {
+      "epoch": 1.9056261343012704,
+      "grad_norm": 0.7177993655204773,
+      "learning_rate": 8.994757007461182e-05,
+      "loss": 3.2374,
+      "step": 2100
+    },
+    {
+      "epoch": 1.9963702359346642,
+      "grad_norm": 0.7358414530754089,
+      "learning_rate": 8.893930227868522e-05,
+      "loss": 3.2032,
+      "step": 2200
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 3.1682567596435547,
+      "eval_runtime": 9.9556,
+      "eval_samples_per_second": 393.448,
+      "eval_steps_per_second": 49.219,
+      "step": 2204
+    },
+    {
+      "epoch": 2.087114337568058,
+      "grad_norm": 0.6569345593452454,
+      "learning_rate": 8.793103448275862e-05,
+      "loss": 3.1474,
+      "step": 2300
+    },
+    {
+      "epoch": 2.1778584392014517,
+      "grad_norm": 0.719578206539154,
+      "learning_rate": 8.692276668683203e-05,
+      "loss": 3.0573,
+      "step": 2400
+    },
+    {
+      "epoch": 2.268602540834846,
+      "grad_norm": 0.7317540645599365,
+      "learning_rate": 8.591449889090543e-05,
+      "loss": 3.1363,
+      "step": 2500
+    },
+    {
+      "epoch": 2.3593466424682394,
+      "grad_norm": 0.7985743880271912,
+      "learning_rate": 8.490623109497882e-05,
+      "loss": 3.0611,
+      "step": 2600
+    },
+    {
+      "epoch": 2.4500907441016335,
+      "grad_norm": 0.8037993907928467,
+      "learning_rate": 8.389796329905223e-05,
+      "loss": 3.0735,
+      "step": 2700
+    },
+    {
+      "epoch": 2.540834845735027,
+      "grad_norm": 0.7689797878265381,
+      "learning_rate": 8.288969550312563e-05,
+      "loss": 3.0639,
+      "step": 2800
+    },
+    {
+      "epoch": 2.6315789473684212,
+      "grad_norm": 0.7713281512260437,
+      "learning_rate": 8.188142770719903e-05,
+      "loss": 3.0165,
+      "step": 2900
+    },
+    {
+      "epoch": 2.722323049001815,
+      "grad_norm": 0.7961378693580627,
+      "learning_rate": 8.087315991127244e-05,
+      "loss": 2.9901,
+      "step": 3000
+    },
+    {
+      "epoch": 2.8130671506352085,
+      "grad_norm": 0.7088135480880737,
+      "learning_rate": 7.986489211534584e-05,
+      "loss": 3.0241,
+      "step": 3100
+    },
+    {
+      "epoch": 2.9038112522686026,
+      "grad_norm": 0.7496780157089233,
+      "learning_rate": 7.885662431941923e-05,
+      "loss": 2.9939,
+      "step": 3200
+    },
+    {
+      "epoch": 2.9945553539019962,
+      "grad_norm": 0.7221343517303467,
+      "learning_rate": 7.784835652349264e-05,
+      "loss": 2.9926,
+      "step": 3300
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 2.9603192806243896,
+      "eval_runtime": 10.1067,
+      "eval_samples_per_second": 387.563,
+      "eval_steps_per_second": 48.483,
+      "step": 3306
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 11020,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.7631632384e+16,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-3306/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38bc65f390e0cd251f8bf044d2e93852c65dce9b93fbcd2f5d4558ba06c3e0b4
+size 5777

checkpoint-4408/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 512,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": 3072,
+  "n_layer": 12,
+  "n_positions": 512,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.55.1",
+  "use_cache": true,
+  "vocab_size": 50000
+}

checkpoint-4408/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.55.1"
+}

checkpoint-4408/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a233baa8d1a111b7be64b7d49d53720fc77a86f35a7a5e2b4b4b50aadcb00832
+size 495411840

checkpoint-4408/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82bfe263f2408c02d905a1ae42fc1ace5850810e781a2068eb873b4f88cdeba6
+size 990920075

checkpoint-4408/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ffb15f127c28eb75998b71c1ec183731c8b207892736e6e627ee2c73156db517
+size 14645

checkpoint-4408/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a1d6b1539486d741d691f9bae9a67afd921fe54bf7cd8cabd2c75c1229a85bb
+size 1383

checkpoint-4408/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:31d76e4d6318626af04f0f85be2e6a54f3a0008533d32ff287c639030e907fb1
+size 1465

checkpoint-4408/trainer_state.json ADDED Viewed

	@@ -0,0 +1,374 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 4.0,
+  "eval_steps": 500,
+  "global_step": 4408,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.09074410163339383,
+      "grad_norm": 1.4687509536743164,
+      "learning_rate": 8.98366606170599e-06,
+      "loss": 6.9172,
+      "step": 100
+    },
+    {
+      "epoch": 0.18148820326678766,
+      "grad_norm": 1.268220067024231,
+      "learning_rate": 1.8058076225045372e-05,
+      "loss": 5.2198,
+      "step": 200
+    },
+    {
+      "epoch": 0.27223230490018147,
+      "grad_norm": 0.9486992955207825,
+      "learning_rate": 2.7132486388384752e-05,
+      "loss": 4.6092,
+      "step": 300
+    },
+    {
+      "epoch": 0.3629764065335753,
+      "grad_norm": 0.8284947276115417,
+      "learning_rate": 3.620689655172414e-05,
+      "loss": 4.2682,
+      "step": 400
+    },
+    {
+      "epoch": 0.4537205081669691,
+      "grad_norm": 0.6980849504470825,
+      "learning_rate": 4.528130671506352e-05,
+      "loss": 4.2027,
+      "step": 500
+    },
+    {
+      "epoch": 0.5444646098003629,
+      "grad_norm": 0.9210988879203796,
+      "learning_rate": 5.435571687840291e-05,
+      "loss": 3.9613,
+      "step": 600
+    },
+    {
+      "epoch": 0.6352087114337568,
+      "grad_norm": 0.8076866269111633,
+      "learning_rate": 6.343012704174229e-05,
+      "loss": 3.9595,
+      "step": 700
+    },
+    {
+      "epoch": 0.7259528130671506,
+      "grad_norm": 0.7056506276130676,
+      "learning_rate": 7.250453720508167e-05,
+      "loss": 3.9345,
+      "step": 800
+    },
+    {
+      "epoch": 0.8166969147005445,
+      "grad_norm": 0.7730728387832642,
+      "learning_rate": 8.157894736842105e-05,
+      "loss": 3.8263,
+      "step": 900
+    },
+    {
+      "epoch": 0.9074410163339383,
+      "grad_norm": 0.6958891749382019,
+      "learning_rate": 9.065335753176044e-05,
+      "loss": 3.7584,
+      "step": 1000
+    },
+    {
+      "epoch": 0.9981851179673321,
+      "grad_norm": 0.7870356440544128,
+      "learning_rate": 9.972776769509982e-05,
+      "loss": 3.6662,
+      "step": 1100
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 3.6291747093200684,
+      "eval_runtime": 9.9369,
+      "eval_samples_per_second": 394.189,
+      "eval_steps_per_second": 49.311,
+      "step": 1102
+    },
+    {
+      "epoch": 1.0889292196007259,
+      "grad_norm": 0.6388369798660278,
+      "learning_rate": 9.90219802379512e-05,
+      "loss": 3.5837,
+      "step": 1200
+    },
+    {
+      "epoch": 1.1796733212341197,
+      "grad_norm": 0.771931529045105,
+      "learning_rate": 9.80137124420246e-05,
+      "loss": 3.5696,
+      "step": 1300
+    },
+    {
+      "epoch": 1.2704174228675136,
+      "grad_norm": 0.8356881141662598,
+      "learning_rate": 9.700544464609801e-05,
+      "loss": 3.4928,
+      "step": 1400
+    },
+    {
+      "epoch": 1.3611615245009074,
+      "grad_norm": 0.7187588214874268,
+      "learning_rate": 9.599717685017141e-05,
+      "loss": 3.5173,
+      "step": 1500
+    },
+    {
+      "epoch": 1.4519056261343013,
+      "grad_norm": 0.76460862159729,
+      "learning_rate": 9.498890905424481e-05,
+      "loss": 3.4,
+      "step": 1600
+    },
+    {
+      "epoch": 1.542649727767695,
+      "grad_norm": 0.7365128993988037,
+      "learning_rate": 9.398064125831822e-05,
+      "loss": 3.3335,
+      "step": 1700
+    },
+    {
+      "epoch": 1.633393829401089,
+      "grad_norm": 0.6376582980155945,
+      "learning_rate": 9.297237346239162e-05,
+      "loss": 3.2894,
+      "step": 1800
+    },
+    {
+      "epoch": 1.7241379310344827,
+      "grad_norm": 0.7320665121078491,
+      "learning_rate": 9.196410566646501e-05,
+      "loss": 3.2772,
+      "step": 1900
+    },
+    {
+      "epoch": 1.8148820326678767,
+      "grad_norm": 1.4237512350082397,
+      "learning_rate": 9.095583787053841e-05,
+      "loss": 3.2934,
+      "step": 2000
+    },
+    {
+      "epoch": 1.9056261343012704,
+      "grad_norm": 0.7177993655204773,
+      "learning_rate": 8.994757007461182e-05,
+      "loss": 3.2374,
+      "step": 2100
+    },
+    {
+      "epoch": 1.9963702359346642,
+      "grad_norm": 0.7358414530754089,
+      "learning_rate": 8.893930227868522e-05,
+      "loss": 3.2032,
+      "step": 2200
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 3.1682567596435547,
+      "eval_runtime": 9.9556,
+      "eval_samples_per_second": 393.448,
+      "eval_steps_per_second": 49.219,
+      "step": 2204
+    },
+    {
+      "epoch": 2.087114337568058,
+      "grad_norm": 0.6569345593452454,
+      "learning_rate": 8.793103448275862e-05,
+      "loss": 3.1474,
+      "step": 2300
+    },
+    {
+      "epoch": 2.1778584392014517,
+      "grad_norm": 0.719578206539154,
+      "learning_rate": 8.692276668683203e-05,
+      "loss": 3.0573,
+      "step": 2400
+    },
+    {
+      "epoch": 2.268602540834846,
+      "grad_norm": 0.7317540645599365,
+      "learning_rate": 8.591449889090543e-05,
+      "loss": 3.1363,
+      "step": 2500
+    },
+    {
+      "epoch": 2.3593466424682394,
+      "grad_norm": 0.7985743880271912,
+      "learning_rate": 8.490623109497882e-05,
+      "loss": 3.0611,
+      "step": 2600
+    },
+    {
+      "epoch": 2.4500907441016335,
+      "grad_norm": 0.8037993907928467,
+      "learning_rate": 8.389796329905223e-05,
+      "loss": 3.0735,
+      "step": 2700
+    },
+    {
+      "epoch": 2.540834845735027,
+      "grad_norm": 0.7689797878265381,
+      "learning_rate": 8.288969550312563e-05,
+      "loss": 3.0639,
+      "step": 2800
+    },
+    {
+      "epoch": 2.6315789473684212,
+      "grad_norm": 0.7713281512260437,
+      "learning_rate": 8.188142770719903e-05,
+      "loss": 3.0165,
+      "step": 2900
+    },
+    {
+      "epoch": 2.722323049001815,
+      "grad_norm": 0.7961378693580627,
+      "learning_rate": 8.087315991127244e-05,
+      "loss": 2.9901,
+      "step": 3000
+    },
+    {
+      "epoch": 2.8130671506352085,
+      "grad_norm": 0.7088135480880737,
+      "learning_rate": 7.986489211534584e-05,
+      "loss": 3.0241,
+      "step": 3100
+    },
+    {
+      "epoch": 2.9038112522686026,
+      "grad_norm": 0.7496780157089233,
+      "learning_rate": 7.885662431941923e-05,
+      "loss": 2.9939,
+      "step": 3200
+    },
+    {
+      "epoch": 2.9945553539019962,
+      "grad_norm": 0.7221343517303467,
+      "learning_rate": 7.784835652349264e-05,
+      "loss": 2.9926,
+      "step": 3300
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 2.9603192806243896,
+      "eval_runtime": 10.1067,
+      "eval_samples_per_second": 387.563,
+      "eval_steps_per_second": 48.483,
+      "step": 3306
+    },
+    {
+      "epoch": 3.0852994555353903,
+      "grad_norm": 0.8570982217788696,
+      "learning_rate": 7.684008872756604e-05,
+      "loss": 2.9251,
+      "step": 3400
+    },
+    {
+      "epoch": 3.176043557168784,
+      "grad_norm": 0.7704641222953796,
+      "learning_rate": 7.583182093163945e-05,
+      "loss": 2.8877,
+      "step": 3500
+    },
+    {
+      "epoch": 3.266787658802178,
+      "grad_norm": 0.6863057017326355,
+      "learning_rate": 7.482355313571285e-05,
+      "loss": 2.8743,
+      "step": 3600
+    },
+    {
+      "epoch": 3.3575317604355717,
+      "grad_norm": 0.7227942943572998,
+      "learning_rate": 7.381528533978626e-05,
+      "loss": 2.823,
+      "step": 3700
+    },
+    {
+      "epoch": 3.4482758620689653,
+      "grad_norm": 0.7746195197105408,
+      "learning_rate": 7.280701754385966e-05,
+      "loss": 2.8962,
+      "step": 3800
+    },
+    {
+      "epoch": 3.5390199637023594,
+      "grad_norm": 0.7885217070579529,
+      "learning_rate": 7.179874974793306e-05,
+      "loss": 2.8419,
+      "step": 3900
+    },
+    {
+      "epoch": 3.629764065335753,
+      "grad_norm": 0.7623139023780823,
+      "learning_rate": 7.079048195200647e-05,
+      "loss": 2.8333,
+      "step": 4000
+    },
+    {
+      "epoch": 3.720508166969147,
+      "grad_norm": 0.8208196759223938,
+      "learning_rate": 6.978221415607986e-05,
+      "loss": 2.8302,
+      "step": 4100
+    },
+    {
+      "epoch": 3.8112522686025407,
+      "grad_norm": 0.7712786197662354,
+      "learning_rate": 6.877394636015326e-05,
+      "loss": 2.8674,
+      "step": 4200
+    },
+    {
+      "epoch": 3.901996370235935,
+      "grad_norm": 0.8100000023841858,
+      "learning_rate": 6.776567856422666e-05,
+      "loss": 2.8097,
+      "step": 4300
+    },
+    {
+      "epoch": 3.9927404718693285,
+      "grad_norm": 0.8472097516059875,
+      "learning_rate": 6.675741076830007e-05,
+      "loss": 2.8418,
+      "step": 4400
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 2.8345706462860107,
+      "eval_runtime": 9.9462,
+      "eval_samples_per_second": 393.819,
+      "eval_steps_per_second": 49.265,
+      "step": 4408
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 11020,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.6842176512e+16,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-4408/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38bc65f390e0cd251f8bf044d2e93852c65dce9b93fbcd2f5d4558ba06c3e0b4
+size 5777

checkpoint-5510/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 512,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": 3072,
+  "n_layer": 12,
+  "n_positions": 512,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.55.1",
+  "use_cache": true,
+  "vocab_size": 50000
+}

checkpoint-5510/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.55.1"
+}

checkpoint-5510/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8e1913fca505749d24fa57f254a8ca8c8eb4c1a1960bc8aed3638d48d0de2f3f
+size 495411840

checkpoint-5510/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7791d9266744c6b12ac9a1ee4226f85b1431337a7798ba3752d28b413c11aadd
+size 990920075

checkpoint-5510/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6e04e2fc3463de6b54ec5a12876a98d410df180d34e08b28d2dcf75fe273573f
+size 14645