stefanocarrera committed on Feb 9

Commit

8157237

verified ·

1 Parent(s): bbd5ec7

Upload folder using huggingface_hub

Browse files

Files changed (26) hide show

.gitattributes +2 -0
README.md +45 -182
adapter_config.json +3 -3
adapter_model.safetensors +1 -1
checkpoint-150/README.md +209 -0
checkpoint-150/adapter_config.json +46 -0
checkpoint-150/adapter_model.safetensors +3 -0
checkpoint-150/chat_template.jinja +109 -0
checkpoint-150/optimizer.pt +3 -0
checkpoint-150/rng_state.pth +3 -0
checkpoint-150/scheduler.pt +3 -0
checkpoint-150/tokenizer.json +3 -0
checkpoint-150/tokenizer_config.json +14 -0
checkpoint-150/trainer_state.json +1567 -0
checkpoint-150/training_args.bin +3 -0
checkpoint-164/README.md +209 -0
checkpoint-164/adapter_config.json +46 -0
checkpoint-164/adapter_model.safetensors +3 -0
checkpoint-164/chat_template.jinja +109 -0
checkpoint-164/optimizer.pt +3 -0
checkpoint-164/rng_state.pth +3 -0
checkpoint-164/scheduler.pt +3 -0
checkpoint-164/tokenizer.json +3 -0
checkpoint-164/tokenizer_config.json +14 -0
checkpoint-164/trainer_state.json +1707 -0
checkpoint-164/training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -34,3 +34,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-150/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-164/tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,199 +1,62 @@
 ---
-library_name: transformers
-tags: []
 ---
-# Model Card for Model ID
-<!-- Provide a quick summary of what the model is/does. -->
-## Model Details
-### Model Description
-<!-- Provide a longer summary of what this model is. -->
-This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
-- **Developed by:** [More Information Needed]
-- **Funded by [optional]:** [More Information Needed]
-- **Shared by [optional]:** [More Information Needed]
-- **Model type:** [More Information Needed]
-- **Language(s) (NLP):** [More Information Needed]
-- **License:** [More Information Needed]
-- **Finetuned from model [optional]:** [More Information Needed]
-### Model Sources [optional]
-<!-- Provide the basic links for the model. -->
-- **Repository:** [More Information Needed]
-- **Paper [optional]:** [More Information Needed]
-- **Demo [optional]:** [More Information Needed]
-## Uses
-<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
-### Direct Use
-<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
-[More Information Needed]
-### Downstream Use [optional]
-<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
-[More Information Needed]
-### Out-of-Scope Use
-<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
-[More Information Needed]
-## Bias, Risks, and Limitations
-<!-- This section is meant to convey both technical and sociotechnical limitations. -->
-[More Information Needed]
-### Recommendations
-<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
-## How to Get Started with the Model
-Use the code below to get started with the model.
-[More Information Needed]
-## Training Details
-### Training Data
-<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
-[More Information Needed]
-### Training Procedure
-<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
-#### Preprocessing [optional]
-[More Information Needed]
-#### Training Hyperparameters
-- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
-#### Speeds, Sizes, Times [optional]
-<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
-[More Information Needed]
-## Evaluation
-<!-- This section describes the evaluation protocols and provides the results. -->
-### Testing Data, Factors & Metrics
-#### Testing Data
-<!-- This should link to a Dataset Card if possible. -->
-[More Information Needed]
-#### Factors
-<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
-[More Information Needed]
-#### Metrics
-<!-- These are the evaluation metrics being used, ideally with a description of why. -->
-[More Information Needed]
-### Results
-[More Information Needed]
-#### Summary
-## Model Examination [optional]
-<!-- Relevant interpretability work for the model goes here -->
-[More Information Needed]
-## Environmental Impact
-<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
-Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
-- **Hardware Type:** [More Information Needed]
-- **Hours used:** [More Information Needed]
-- **Cloud Provider:** [More Information Needed]
-- **Compute Region:** [More Information Needed]
-- **Carbon Emitted:** [More Information Needed]
-## Technical Specifications [optional]
-### Model Architecture and Objective
-[More Information Needed]
-### Compute Infrastructure
-[More Information Needed]
-#### Hardware
-[More Information Needed]
-#### Software
-[More Information Needed]
-## Citation [optional]
-<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
-**BibTeX:**
-[More Information Needed]
-**APA:**
-[More Information Needed]
-## Glossary [optional]
-<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
-[More Information Needed]
-## More Information [optional]
-[More Information Needed]
-## Model Card Authors [optional]
-[More Information Needed]
-## Model Card Contact
-[More Information Needed]

 ---
+base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
+library_name: peft
+model_name: adapters
+tags:
+- base_model:adapter:meta-llama/Meta-Llama-3.1-8B-Instruct
+- lora
+- sft
+- transformers
+- trl
+licence: license
+pipeline_tag: text-generation
 ---
+# Model Card for adapters
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="None", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+This model was trained with SFT.
+### Framework versions
+- PEFT: 0.18.1
+- TRL: 0.27.2
+- Transformers: 5.0.0
+- Pytorch: 2.7.1+cu118
+- Datasets: 4.5.0
+- Tokenizers: 0.22.2
+## Citations
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

adapter_config.json CHANGED Viewed

@@ -29,13 +29,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "k_proj",
     "gate_proj",
     "up_proj",
     "down_proj",
-    "q_proj",
     "v_proj",
-    "o_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "gate_proj",
+    "k_proj",
     "up_proj",
+    "o_proj",
     "down_proj",
     "v_proj",
+    "q_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:80580e05af51599efe0ac938919c1735da68fa8763b757642faba033692fa00b
 size 83946192

 version https://git-lfs.github.com/spec/v1
+oid sha256:b4771c284f33bf9ad924ce4256b47cc6c7144065f0b9859ed6644024d9f16d25
 size 83946192

checkpoint-150/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:meta-llama/Meta-Llama-3.1-8B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoint-150/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "up_proj",
+    "o_proj",
+    "down_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-150/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:76d3aefd213f0efaad1eff180e520ebcdb938b53b053ba29018e27247d548c29
+size 83946192

checkpoint-150/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,109 @@

+{{- bos_token }}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+    {%- set date_string = "26 Jul 2024" %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+{#- System message + builtin tools #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if builtin_tools is defined or tools is not none %}
+    {{- "Environment: ipython\n" }}
+{%- endif %}
+{%- if builtin_tools is defined %}
+    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content']|trim %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+            {%- for arg_name, arg_val in tool_call.arguments | items %}
+                {{- arg_name + '="' + arg_val + '"' }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+                {%- endfor %}
+            {{- ")" }}
+        {%- else  %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- '{"name": "' + tool_call.name + '", ' }}
+            {{- '"parameters": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- "}" }}
+        {%- endif %}
+        {%- if builtin_tools is defined %}
+            {#- This means we're in ipython mode #}
+            {{- "<|eom_id|>" }}
+        {%- else %}
+            {{- "<|eot_id|>" }}
+        {%- endif %}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+        {%- if message.content is mapping or message.content is iterable %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}

checkpoint-150/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66eed95e5d86bca4c48256356b70747e00cdddad8f6eaad5fe9b284bd39a9e4b
+size 85728997

checkpoint-150/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:718a0f3db00824213036a2c0441849791319b7d9cf189065873bb26a7020738e
+size 14645

checkpoint-150/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:969e38d9df417d5c2e205dc9478ec3ed9bbd0fb36ac227be3ef304a3ef428a63
+size 1465

checkpoint-150/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920

checkpoint-150/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eot_id|>",
+  "is_local": false,
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|eot_id|>",
+  "tokenizer_class": "TokenizersBackend"
+}

checkpoint-150/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1567 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.8330781010719757,
+  "eval_steps": 50,
+  "global_step": 150,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 0.5357014574110508,
+      "epoch": 0.01225114854517611,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002,
+      "loss": 0.2591501772403717,
+      "mean_token_accuracy": 0.9530366845428944,
+      "num_tokens": 7575.0,
+      "step": 1
+    },
+    {
+      "entropy": 0.4969524908810854,
+      "epoch": 0.02450229709035222,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.00019878048780487805,
+      "loss": 0.19432324171066284,
+      "mean_token_accuracy": 0.9566738158464432,
+      "num_tokens": 16374.0,
+      "step": 2
+    },
+    {
+      "entropy": 0.5178094431757927,
+      "epoch": 0.036753445635528334,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.0001975609756097561,
+      "loss": 0.15099577605724335,
+      "mean_token_accuracy": 0.9616630300879478,
+      "num_tokens": 22935.0,
+      "step": 3
+    },
+    {
+      "entropy": 0.5130382943898439,
+      "epoch": 0.04900459418070444,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00019634146341463416,
+      "loss": 0.12276138365268707,
+      "mean_token_accuracy": 0.967245552688837,
+      "num_tokens": 28404.0,
+      "step": 4
+    },
+    {
+      "entropy": 0.44963230937719345,
+      "epoch": 0.06125574272588055,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001951219512195122,
+      "loss": 0.09725309163331985,
+      "mean_token_accuracy": 0.9740313775837421,
+      "num_tokens": 35549.0,
+      "step": 5
+    },
+    {
+      "entropy": 0.4456397518515587,
+      "epoch": 0.07350689127105667,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 0.00019390243902439025,
+      "loss": 0.1305171251296997,
+      "mean_token_accuracy": 0.9666509628295898,
+      "num_tokens": 42383.0,
+      "step": 6
+    },
+    {
+      "entropy": 0.3939209319651127,
+      "epoch": 0.08575803981623277,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001926829268292683,
+      "loss": 0.09472239017486572,
+      "mean_token_accuracy": 0.974840272217989,
+      "num_tokens": 49995.0,
+      "step": 7
+    },
+    {
+      "entropy": 0.41255801543593407,
+      "epoch": 0.09800918836140889,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.00019146341463414633,
+      "loss": 0.09030990302562714,
+      "mean_token_accuracy": 0.9773416742682457,
+      "num_tokens": 56829.0,
+      "step": 8
+    },
+    {
+      "entropy": 0.41458161920309067,
+      "epoch": 0.11026033690658499,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 0.0001902439024390244,
+      "loss": 0.10670135170221329,
+      "mean_token_accuracy": 0.9706939905881882,
+      "num_tokens": 63889.0,
+      "step": 9
+    },
+    {
+      "entropy": 0.38586485385894775,
+      "epoch": 0.1225114854517611,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.00018902439024390244,
+      "loss": 0.11325996369123459,
+      "mean_token_accuracy": 0.9668439663946629,
+      "num_tokens": 70927.0,
+      "step": 10
+    },
+    {
+      "entropy": 0.3774546254426241,
+      "epoch": 0.13476263399693722,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001878048780487805,
+      "loss": 0.07076064497232437,
+      "mean_token_accuracy": 0.9780124798417091,
+      "num_tokens": 78426.0,
+      "step": 11
+    },
+    {
+      "entropy": 0.3956366563215852,
+      "epoch": 0.14701378254211334,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.00018658536585365856,
+      "loss": 0.08720126003026962,
+      "mean_token_accuracy": 0.9750143773853779,
+      "num_tokens": 87141.0,
+      "step": 12
+    },
+    {
+      "entropy": 0.37218748684972525,
+      "epoch": 0.15926493108728942,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001853658536585366,
+      "loss": 0.08891582489013672,
+      "mean_token_accuracy": 0.9692744836211205,
+      "num_tokens": 95681.0,
+      "step": 13
+    },
+    {
+      "entropy": 0.3841435620561242,
+      "epoch": 0.17151607963246554,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.00018414634146341464,
+      "loss": 0.08403376489877701,
+      "mean_token_accuracy": 0.9721223004162312,
+      "num_tokens": 102341.0,
+      "step": 14
+    },
+    {
+      "entropy": 0.34824744425714016,
+      "epoch": 0.18376722817764166,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001829268292682927,
+      "loss": 0.10026705265045166,
+      "mean_token_accuracy": 0.9667329825460911,
+      "num_tokens": 109665.0,
+      "step": 15
+    },
+    {
+      "entropy": 0.3537330040708184,
+      "epoch": 0.19601837672281777,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.00018170731707317075,
+      "loss": 0.06809324026107788,
+      "mean_token_accuracy": 0.9803769923746586,
+      "num_tokens": 116212.0,
+      "step": 16
+    },
+    {
+      "entropy": 0.3518688417971134,
+      "epoch": 0.2082695252679939,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001804878048780488,
+      "loss": 0.07201241701841354,
+      "mean_token_accuracy": 0.9769125580787659,
+      "num_tokens": 122805.0,
+      "step": 17
+    },
+    {
+      "entropy": 0.33192048501223326,
+      "epoch": 0.22052067381316998,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.00017926829268292684,
+      "loss": 0.08568219095468521,
+      "mean_token_accuracy": 0.9731468558311462,
+      "num_tokens": 129809.0,
+      "step": 18
+    },
+    {
+      "entropy": 0.38278588838875294,
+      "epoch": 0.2327718223583461,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.00017804878048780488,
+      "loss": 0.09338308125734329,
+      "mean_token_accuracy": 0.971602164208889,
+      "num_tokens": 136394.0,
+      "step": 19
+    },
+    {
+      "entropy": 0.3615857381373644,
+      "epoch": 0.2450229709035222,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.00017682926829268295,
+      "loss": 0.09092211723327637,
+      "mean_token_accuracy": 0.9741932302713394,
+      "num_tokens": 143441.0,
+      "step": 20
+    },
+    {
+      "entropy": 0.36908658128231764,
+      "epoch": 0.2572741194486983,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.000175609756097561,
+      "loss": 0.08083514869213104,
+      "mean_token_accuracy": 0.9730581529438496,
+      "num_tokens": 150360.0,
+      "step": 21
+    },
+    {
+      "entropy": 0.33506167121231556,
+      "epoch": 0.26952526799387444,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.00017439024390243903,
+      "loss": 0.07429873198270798,
+      "mean_token_accuracy": 0.9798649623990059,
+      "num_tokens": 158201.0,
+      "step": 22
+    },
+    {
+      "entropy": 0.34483147878199816,
+      "epoch": 0.28177641653905056,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.00017317073170731708,
+      "loss": 0.097615085542202,
+      "mean_token_accuracy": 0.9703259542584419,
+      "num_tokens": 165103.0,
+      "step": 23
+    },
+    {
+      "entropy": 0.3624556018039584,
+      "epoch": 0.29402756508422667,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.00017195121951219512,
+      "loss": 0.09813392162322998,
+      "mean_token_accuracy": 0.971100814640522,
+      "num_tokens": 172845.0,
+      "step": 24
+    },
+    {
+      "entropy": 0.3426302410662174,
+      "epoch": 0.30627871362940273,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001707317073170732,
+      "loss": 0.10315398871898651,
+      "mean_token_accuracy": 0.9727907553315163,
+      "num_tokens": 179310.0,
+      "step": 25
+    },
+    {
+      "entropy": 0.32603410072624683,
+      "epoch": 0.31852986217457885,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.00016951219512195123,
+      "loss": 0.08456183224916458,
+      "mean_token_accuracy": 0.9688506498932838,
+      "num_tokens": 186592.0,
+      "step": 26
+    },
+    {
+      "entropy": 0.3102861074730754,
+      "epoch": 0.33078101071975496,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.00016829268292682927,
+      "loss": 0.07856184989213943,
+      "mean_token_accuracy": 0.9766327068209648,
+      "num_tokens": 193170.0,
+      "step": 27
+    },
+    {
+      "entropy": 0.32588900811970234,
+      "epoch": 0.3430321592649311,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.00016707317073170731,
+      "loss": 0.07468702644109726,
+      "mean_token_accuracy": 0.9745533689856529,
+      "num_tokens": 200237.0,
+      "step": 28
+    },
+    {
+      "entropy": 0.36062031611800194,
+      "epoch": 0.3552833078101072,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.00016585365853658536,
+      "loss": 0.07607633620500565,
+      "mean_token_accuracy": 0.9738306701183319,
+      "num_tokens": 206965.0,
+      "step": 29
+    },
+    {
+      "entropy": 0.3694803323596716,
+      "epoch": 0.3675344563552833,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.00016463414634146343,
+      "loss": 0.07808130979537964,
+      "mean_token_accuracy": 0.9752037785947323,
+      "num_tokens": 214253.0,
+      "step": 30
+    },
+    {
+      "entropy": 0.34707722812891006,
+      "epoch": 0.37978560490045943,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.00016341463414634147,
+      "loss": 0.09708584100008011,
+      "mean_token_accuracy": 0.968984991312027,
+      "num_tokens": 219669.0,
+      "step": 31
+    },
+    {
+      "entropy": 0.34607914835214615,
+      "epoch": 0.39203675344563554,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.00016219512195121954,
+      "loss": 0.0862056165933609,
+      "mean_token_accuracy": 0.975422702729702,
+      "num_tokens": 227923.0,
+      "step": 32
+    },
+    {
+      "entropy": 0.3694024868309498,
+      "epoch": 0.40428790199081166,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.00016097560975609758,
+      "loss": 0.07349187880754471,
+      "mean_token_accuracy": 0.9769446365535259,
+      "num_tokens": 234412.0,
+      "step": 33
+    },
+    {
+      "entropy": 0.3105186866596341,
+      "epoch": 0.4165390505359878,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.00015975609756097562,
+      "loss": 0.092418372631073,
+      "mean_token_accuracy": 0.9768914133310318,
+      "num_tokens": 241587.0,
+      "step": 34
+    },
+    {
+      "entropy": 0.29616252705454826,
+      "epoch": 0.42879019908116384,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.00015853658536585366,
+      "loss": 0.06796497106552124,
+      "mean_token_accuracy": 0.9777620621025562,
+      "num_tokens": 249133.0,
+      "step": 35
+    },
+    {
+      "entropy": 0.32076778169721365,
+      "epoch": 0.44104134762633995,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.00015731707317073173,
+      "loss": 0.06996235251426697,
+      "mean_token_accuracy": 0.9789225980639458,
+      "num_tokens": 255805.0,
+      "step": 36
+    },
+    {
+      "entropy": 0.277026129886508,
+      "epoch": 0.45329249617151607,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.00015609756097560978,
+      "loss": 0.06752780079841614,
+      "mean_token_accuracy": 0.9788497537374496,
+      "num_tokens": 262962.0,
+      "step": 37
+    },
+    {
+      "entropy": 0.3057912113144994,
+      "epoch": 0.4655436447166922,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.00015487804878048782,
+      "loss": 0.07604770362377167,
+      "mean_token_accuracy": 0.9693816341459751,
+      "num_tokens": 268812.0,
+      "step": 38
+    },
+    {
+      "entropy": 0.2962911752983928,
+      "epoch": 0.4777947932618683,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.00015365853658536586,
+      "loss": 0.08501230180263519,
+      "mean_token_accuracy": 0.969777375459671,
+      "num_tokens": 275542.0,
+      "step": 39
+    },
+    {
+      "entropy": 0.29683656617999077,
+      "epoch": 0.4900459418070444,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001524390243902439,
+      "loss": 0.07492107152938843,
+      "mean_token_accuracy": 0.9751906730234623,
+      "num_tokens": 282460.0,
+      "step": 40
+    },
+    {
+      "entropy": 0.29800107702612877,
+      "epoch": 0.5022970903522205,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.00015121951219512197,
+      "loss": 0.07613345235586166,
+      "mean_token_accuracy": 0.9772711955010891,
+      "num_tokens": 287662.0,
+      "step": 41
+    },
+    {
+      "entropy": 0.287130375392735,
+      "epoch": 0.5145482388973966,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.07252758741378784,
+      "mean_token_accuracy": 0.9758713953197002,
+      "num_tokens": 294018.0,
+      "step": 42
+    },
+    {
+      "entropy": 0.31403718050569296,
+      "epoch": 0.5267993874425727,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.00014878048780487806,
+      "loss": 0.07692913711071014,
+      "mean_token_accuracy": 0.9735661074519157,
+      "num_tokens": 301127.0,
+      "step": 43
+    },
+    {
+      "entropy": 0.32532981038093567,
+      "epoch": 0.5390505359877489,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001475609756097561,
+      "loss": 0.06646531820297241,
+      "mean_token_accuracy": 0.9817029014229774,
+      "num_tokens": 307985.0,
+      "step": 44
+    },
+    {
+      "entropy": 0.29122561775147915,
+      "epoch": 0.5513016845329249,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.00014634146341463414,
+      "loss": 0.08177122473716736,
+      "mean_token_accuracy": 0.9714771695435047,
+      "num_tokens": 314911.0,
+      "step": 45
+    },
+    {
+      "entropy": 0.31051622424274683,
+      "epoch": 0.5635528330781011,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001451219512195122,
+      "loss": 0.08793994784355164,
+      "mean_token_accuracy": 0.9757628589868546,
+      "num_tokens": 321376.0,
+      "step": 46
+    },
+    {
+      "entropy": 0.31927434727549553,
+      "epoch": 0.5758039816232772,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.00014390243902439025,
+      "loss": 0.07368139922618866,
+      "mean_token_accuracy": 0.9764612726867199,
+      "num_tokens": 327751.0,
+      "step": 47
+    },
+    {
+      "entropy": 0.31811847630888224,
+      "epoch": 0.5880551301684533,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001426829268292683,
+      "loss": 0.07313578575849533,
+      "mean_token_accuracy": 0.9781395755708218,
+      "num_tokens": 333164.0,
+      "step": 48
+    },
+    {
+      "entropy": 0.33736035507172346,
+      "epoch": 0.6003062787136294,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.00014146341463414634,
+      "loss": 0.06882140785455704,
+      "mean_token_accuracy": 0.9774826094508171,
+      "num_tokens": 339755.0,
+      "step": 49
+    },
+    {
+      "entropy": 0.31375813484191895,
+      "epoch": 0.6125574272588055,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.00014024390243902438,
+      "loss": 0.07694698125123978,
+      "mean_token_accuracy": 0.9765506535768509,
+      "num_tokens": 346465.0,
+      "step": 50
+    },
+    {
+      "epoch": 0.6125574272588055,
+      "eval_entropy": 0.3228047593780186,
+      "eval_loss": 0.06397496163845062,
+      "eval_mean_token_accuracy": 0.9778102247611337,
+      "eval_num_tokens": 346465.0,
+      "eval_runtime": 61.6139,
+      "eval_samples_per_second": 1.12,
+      "eval_steps_per_second": 1.12,
+      "step": 50
+    },
+    {
+      "entropy": 0.33666181843727827,
+      "epoch": 0.6248085758039816,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.00013902439024390245,
+      "loss": 0.07857983559370041,
+      "mean_token_accuracy": 0.9755821079015732,
+      "num_tokens": 352546.0,
+      "step": 51
+    },
+    {
+      "entropy": 0.3325382797047496,
+      "epoch": 0.6370597243491577,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001378048780487805,
+      "loss": 0.055708881467580795,
+      "mean_token_accuracy": 0.9814197011291981,
+      "num_tokens": 358950.0,
+      "step": 52
+    },
+    {
+      "entropy": 0.32812063954770565,
+      "epoch": 0.6493108728943339,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.00013658536585365856,
+      "loss": 0.07367786020040512,
+      "mean_token_accuracy": 0.9777578823268414,
+      "num_tokens": 366288.0,
+      "step": 53
+    },
+    {
+      "entropy": 0.2817836347967386,
+      "epoch": 0.6615620214395099,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001353658536585366,
+      "loss": 0.0578315332531929,
+      "mean_token_accuracy": 0.9810350127518177,
+      "num_tokens": 372898.0,
+      "step": 54
+    },
+    {
+      "entropy": 0.32251426950097084,
+      "epoch": 0.6738131699846861,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.00013414634146341464,
+      "loss": 0.06909041851758957,
+      "mean_token_accuracy": 0.9752591475844383,
+      "num_tokens": 380158.0,
+      "step": 55
+    },
+    {
+      "entropy": 0.32180769462138414,
+      "epoch": 0.6860643185298622,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001329268292682927,
+      "loss": 0.07875123620033264,
+      "mean_token_accuracy": 0.9738615117967129,
+      "num_tokens": 386269.0,
+      "step": 56
+    },
+    {
+      "entropy": 0.307281319051981,
+      "epoch": 0.6983154670750383,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.00013170731707317076,
+      "loss": 0.07773981243371964,
+      "mean_token_accuracy": 0.9736250452697277,
+      "num_tokens": 394382.0,
+      "step": 57
+    },
+    {
+      "entropy": 0.29658956825733185,
+      "epoch": 0.7105666156202144,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001304878048780488,
+      "loss": 0.05800808221101761,
+      "mean_token_accuracy": 0.9793435782194138,
+      "num_tokens": 401510.0,
+      "step": 58
+    },
+    {
+      "entropy": 0.34156549349427223,
+      "epoch": 0.7228177641653905,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.00012926829268292684,
+      "loss": 0.07996754348278046,
+      "mean_token_accuracy": 0.9773447252810001,
+      "num_tokens": 407257.0,
+      "step": 59
+    },
+    {
+      "entropy": 0.3153263973072171,
+      "epoch": 0.7350689127105666,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.00012804878048780488,
+      "loss": 0.07675991952419281,
+      "mean_token_accuracy": 0.9762718975543976,
+      "num_tokens": 413752.0,
+      "step": 60
+    },
+    {
+      "entropy": 0.28962062764912844,
+      "epoch": 0.7473200612557427,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.00012682926829268293,
+      "loss": 0.08181046694517136,
+      "mean_token_accuracy": 0.9704542681574821,
+      "num_tokens": 421184.0,
+      "step": 61
+    },
+    {
+      "entropy": 0.3103124424815178,
+      "epoch": 0.7595712098009189,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.000125609756097561,
+      "loss": 0.07483275234699249,
+      "mean_token_accuracy": 0.9754621051251888,
+      "num_tokens": 428988.0,
+      "step": 62
+    },
+    {
+      "entropy": 0.27847875375300646,
+      "epoch": 0.7718223583460949,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.00012439024390243904,
+      "loss": 0.06930931657552719,
+      "mean_token_accuracy": 0.9753326959908009,
+      "num_tokens": 436040.0,
+      "step": 63
+    },
+    {
+      "entropy": 0.2668089345097542,
+      "epoch": 0.7840735068912711,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.00012317073170731708,
+      "loss": 0.06888818740844727,
+      "mean_token_accuracy": 0.9757619500160217,
+      "num_tokens": 442659.0,
+      "step": 64
+    },
+    {
+      "entropy": 0.3198395315557718,
+      "epoch": 0.7963246554364471,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.00012195121951219512,
+      "loss": 0.05557462200522423,
+      "mean_token_accuracy": 0.984383974224329,
+      "num_tokens": 449165.0,
+      "step": 65
+    },
+    {
+      "entropy": 0.30934575013816357,
+      "epoch": 0.8085758039816233,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.00012073170731707318,
+      "loss": 0.06038631126284599,
+      "mean_token_accuracy": 0.98052117228508,
+      "num_tokens": 456668.0,
+      "step": 66
+    },
+    {
+      "entropy": 0.30664732959121466,
+      "epoch": 0.8208269525267994,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.00011951219512195122,
+      "loss": 0.06068733334541321,
+      "mean_token_accuracy": 0.9787427373230457,
+      "num_tokens": 463055.0,
+      "step": 67
+    },
+    {
+      "entropy": 0.32763790991157293,
+      "epoch": 0.8330781010719756,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.00011829268292682926,
+      "loss": 0.06245455890893936,
+      "mean_token_accuracy": 0.977917030453682,
+      "num_tokens": 469027.0,
+      "step": 68
+    },
+    {
+      "entropy": 0.30262293945997953,
+      "epoch": 0.8453292496171516,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.00011707317073170732,
+      "loss": 0.08869794756174088,
+      "mean_token_accuracy": 0.9743811637163162,
+      "num_tokens": 474692.0,
+      "step": 69
+    },
+    {
+      "entropy": 0.3196303751319647,
+      "epoch": 0.8575803981623277,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.00011585365853658536,
+      "loss": 0.09405583888292313,
+      "mean_token_accuracy": 0.9726175181567669,
+      "num_tokens": 480966.0,
+      "step": 70
+    },
+    {
+      "entropy": 0.35709446109831333,
+      "epoch": 0.8698315467075038,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.00011463414634146342,
+      "loss": 0.06765145808458328,
+      "mean_token_accuracy": 0.9785624854266644,
+      "num_tokens": 487243.0,
+      "step": 71
+    },
+    {
+      "entropy": 0.3187448363751173,
+      "epoch": 0.8820826952526799,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.00011341463414634146,
+      "loss": 0.11273079365491867,
+      "mean_token_accuracy": 0.968480296432972,
+      "num_tokens": 493644.0,
+      "step": 72
+    },
+    {
+      "entropy": 0.338912196457386,
+      "epoch": 0.8943338437978561,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.00011219512195121953,
+      "loss": 0.07907245308160782,
+      "mean_token_accuracy": 0.9740516655147076,
+      "num_tokens": 501266.0,
+      "step": 73
+    },
+    {
+      "entropy": 0.32440096139907837,
+      "epoch": 0.9065849923430321,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.00011097560975609757,
+      "loss": 0.05401031672954559,
+      "mean_token_accuracy": 0.9804942272603512,
+      "num_tokens": 508602.0,
+      "step": 74
+    },
+    {
+      "entropy": 0.3483101995661855,
+      "epoch": 0.9188361408882083,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.00010975609756097563,
+      "loss": 0.11057186871767044,
+      "mean_token_accuracy": 0.9718486294150352,
+      "num_tokens": 515398.0,
+      "step": 75
+    },
+    {
+      "entropy": 0.34578234050422907,
+      "epoch": 0.9310872894333844,
+      "grad_norm": 0.125,
+      "learning_rate": 0.00010853658536585367,
+      "loss": 0.06668201088905334,
+      "mean_token_accuracy": 0.9788769222795963,
+      "num_tokens": 521843.0,
+      "step": 76
+    },
+    {
+      "entropy": 0.30221153143793344,
+      "epoch": 0.9433384379785605,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.00010731707317073172,
+      "loss": 0.06314530223608017,
+      "mean_token_accuracy": 0.9795791730284691,
+      "num_tokens": 529048.0,
+      "step": 77
+    },
+    {
+      "entropy": 0.3246979024261236,
+      "epoch": 0.9555895865237366,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.00010609756097560977,
+      "loss": 0.06752847880125046,
+      "mean_token_accuracy": 0.9832794591784477,
+      "num_tokens": 536898.0,
+      "step": 78
+    },
+    {
+      "entropy": 0.31460305489599705,
+      "epoch": 0.9678407350689127,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.00010487804878048781,
+      "loss": 0.07550302147865295,
+      "mean_token_accuracy": 0.9745542369782925,
+      "num_tokens": 543183.0,
+      "step": 79
+    },
+    {
+      "entropy": 0.29604928102344275,
+      "epoch": 0.9800918836140888,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.00010365853658536586,
+      "loss": 0.061620082706213,
+      "mean_token_accuracy": 0.9814895763993263,
+      "num_tokens": 550826.0,
+      "step": 80
+    },
+    {
+      "entropy": 0.309127070941031,
+      "epoch": 0.9923430321592649,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001024390243902439,
+      "loss": 0.08857648074626923,
+      "mean_token_accuracy": 0.9735055603086948,
+      "num_tokens": 557586.0,
+      "step": 81
+    },
+    {
+      "entropy": 0.28887457996606825,
+      "epoch": 1.0,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.00010121951219512196,
+      "loss": 0.06288491934537888,
+      "mean_token_accuracy": 0.9819431960582733,
+      "num_tokens": 561550.0,
+      "step": 82
+    },
+    {
+      "entropy": 0.3049680180847645,
+      "epoch": 1.0122511485451762,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.08012041449546814,
+      "mean_token_accuracy": 0.9773442409932613,
+      "num_tokens": 568728.0,
+      "step": 83
+    },
+    {
+      "entropy": 0.26021771878004074,
+      "epoch": 1.0245022970903521,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 9.878048780487805e-05,
+      "loss": 0.04987334832549095,
+      "mean_token_accuracy": 0.9872907213866711,
+      "num_tokens": 575388.0,
+      "step": 84
+    },
+    {
+      "entropy": 0.3132081003859639,
+      "epoch": 1.0367534456355283,
+      "grad_norm": 0.0693359375,
+      "learning_rate": 9.75609756097561e-05,
+      "loss": 0.03793306648731232,
+      "mean_token_accuracy": 0.9893307648599148,
+      "num_tokens": 583555.0,
+      "step": 85
+    },
+    {
+      "entropy": 0.2535849278792739,
+      "epoch": 1.0490045941807045,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 9.634146341463415e-05,
+      "loss": 0.04893108084797859,
+      "mean_token_accuracy": 0.9859805703163147,
+      "num_tokens": 589967.0,
+      "step": 86
+    },
+    {
+      "entropy": 0.23509769327938557,
+      "epoch": 1.0612557427258806,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 9.51219512195122e-05,
+      "loss": 0.0479637086391449,
+      "mean_token_accuracy": 0.9847861491143703,
+      "num_tokens": 598042.0,
+      "step": 87
+    },
+    {
+      "entropy": 0.30080515146255493,
+      "epoch": 1.0735068912710566,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 9.390243902439024e-05,
+      "loss": 0.08284764736890793,
+      "mean_token_accuracy": 0.9821349158883095,
+      "num_tokens": 604464.0,
+      "step": 88
+    },
+    {
+      "entropy": 0.2792583117261529,
+      "epoch": 1.0857580398162328,
+      "grad_norm": 0.099609375,
+      "learning_rate": 9.26829268292683e-05,
+      "loss": 0.04615325853228569,
+      "mean_token_accuracy": 0.9861109368503094,
+      "num_tokens": 610667.0,
+      "step": 89
+    },
+    {
+      "entropy": 0.2520109824836254,
+      "epoch": 1.098009188361409,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 9.146341463414635e-05,
+      "loss": 0.051208432763814926,
+      "mean_token_accuracy": 0.9884813874959946,
+      "num_tokens": 617684.0,
+      "step": 90
+    },
+    {
+      "entropy": 0.2811925411224365,
+      "epoch": 1.110260336906585,
+      "grad_norm": 0.10546875,
+      "learning_rate": 9.02439024390244e-05,
+      "loss": 0.045838613063097,
+      "mean_token_accuracy": 0.9813198745250702,
+      "num_tokens": 624400.0,
+      "step": 91
+    },
+    {
+      "entropy": 0.24536922946572304,
+      "epoch": 1.122511485451761,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 8.902439024390244e-05,
+      "loss": 0.04839828237891197,
+      "mean_token_accuracy": 0.9828252196311951,
+      "num_tokens": 631186.0,
+      "step": 92
+    },
+    {
+      "entropy": 0.22986232955008745,
+      "epoch": 1.1347626339969372,
+      "grad_norm": 0.111328125,
+      "learning_rate": 8.78048780487805e-05,
+      "loss": 0.04496093466877937,
+      "mean_token_accuracy": 0.9868007227778435,
+      "num_tokens": 638230.0,
+      "step": 93
+    },
+    {
+      "entropy": 0.24173379130661488,
+      "epoch": 1.1470137825421134,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 8.658536585365854e-05,
+      "loss": 0.038931019604206085,
+      "mean_token_accuracy": 0.9905626736581326,
+      "num_tokens": 644599.0,
+      "step": 94
+    },
+    {
+      "entropy": 0.22757203690707684,
+      "epoch": 1.1592649310872893,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 8.53658536585366e-05,
+      "loss": 0.03816759213805199,
+      "mean_token_accuracy": 0.9886907860636711,
+      "num_tokens": 651211.0,
+      "step": 95
+    },
+    {
+      "entropy": 0.22881112340837717,
+      "epoch": 1.1715160796324655,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 8.414634146341464e-05,
+      "loss": 0.0442253053188324,
+      "mean_token_accuracy": 0.9876982606947422,
+      "num_tokens": 657240.0,
+      "step": 96
+    },
+    {
+      "entropy": 0.23342516459524632,
+      "epoch": 1.1837672281776417,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 8.292682926829268e-05,
+      "loss": 0.0382809042930603,
+      "mean_token_accuracy": 0.9902381077408791,
+      "num_tokens": 663894.0,
+      "step": 97
+    },
+    {
+      "entropy": 0.2406658846884966,
+      "epoch": 1.1960183767228179,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 8.170731707317073e-05,
+      "loss": 0.04963719844818115,
+      "mean_token_accuracy": 0.9877509213984013,
+      "num_tokens": 671338.0,
+      "step": 98
+    },
+    {
+      "entropy": 0.272550736553967,
+      "epoch": 1.2082695252679938,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 8.048780487804879e-05,
+      "loss": 0.05441854149103165,
+      "mean_token_accuracy": 0.983638808131218,
+      "num_tokens": 679108.0,
+      "step": 99
+    },
+    {
+      "entropy": 0.2297075460664928,
+      "epoch": 1.22052067381317,
+      "grad_norm": 0.142578125,
+      "learning_rate": 7.926829268292683e-05,
+      "loss": 0.04290780425071716,
+      "mean_token_accuracy": 0.9863294512033463,
+      "num_tokens": 685572.0,
+      "step": 100
+    },
+    {
+      "epoch": 1.22052067381317,
+      "eval_entropy": 0.2541272067937298,
+      "eval_loss": 0.06591593474149704,
+      "eval_mean_token_accuracy": 0.9771205471909564,
+      "eval_num_tokens": 685572.0,
+      "eval_runtime": 61.7088,
+      "eval_samples_per_second": 1.118,
+      "eval_steps_per_second": 1.118,
+      "step": 100
+    },
+    {
+      "entropy": 0.25287739746272564,
+      "epoch": 1.2327718223583461,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 7.804878048780489e-05,
+      "loss": 0.03635338693857193,
+      "mean_token_accuracy": 0.9891670718789101,
+      "num_tokens": 692888.0,
+      "step": 101
+    },
+    {
+      "entropy": 0.23323721811175346,
+      "epoch": 1.245022970903522,
+      "grad_norm": 0.091796875,
+      "learning_rate": 7.682926829268293e-05,
+      "loss": 0.03589210659265518,
+      "mean_token_accuracy": 0.9888128153979778,
+      "num_tokens": 700015.0,
+      "step": 102
+    },
+    {
+      "entropy": 0.24865772109478712,
+      "epoch": 1.2572741194486983,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 7.560975609756099e-05,
+      "loss": 0.0306346844881773,
+      "mean_token_accuracy": 0.9904107972979546,
+      "num_tokens": 707330.0,
+      "step": 103
+    },
+    {
+      "entropy": 0.2551306625828147,
+      "epoch": 1.2695252679938744,
+      "grad_norm": 0.134765625,
+      "learning_rate": 7.439024390243903e-05,
+      "loss": 0.062116604298353195,
+      "mean_token_accuracy": 0.9804923608899117,
+      "num_tokens": 714185.0,
+      "step": 104
+    },
+    {
+      "entropy": 0.26964253932237625,
+      "epoch": 1.2817764165390506,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 7.317073170731707e-05,
+      "loss": 0.05495516210794449,
+      "mean_token_accuracy": 0.983124740421772,
+      "num_tokens": 720030.0,
+      "step": 105
+    },
+    {
+      "entropy": 0.2399951433762908,
+      "epoch": 1.2940275650842268,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 7.195121951219513e-05,
+      "loss": 0.0537109449505806,
+      "mean_token_accuracy": 0.9835550487041473,
+      "num_tokens": 727399.0,
+      "step": 106
+    },
+    {
+      "entropy": 0.2527354145422578,
+      "epoch": 1.3062787136294027,
+      "grad_norm": 0.095703125,
+      "learning_rate": 7.073170731707317e-05,
+      "loss": 0.0370502732694149,
+      "mean_token_accuracy": 0.987937405705452,
+      "num_tokens": 734117.0,
+      "step": 107
+    },
+    {
+      "entropy": 0.2708861446008086,
+      "epoch": 1.318529862174579,
+      "grad_norm": 0.1328125,
+      "learning_rate": 6.951219512195122e-05,
+      "loss": 0.02873871847987175,
+      "mean_token_accuracy": 0.9898889176547527,
+      "num_tokens": 739843.0,
+      "step": 108
+    },
+    {
+      "entropy": 0.24508011247962713,
+      "epoch": 1.3307810107197549,
+      "grad_norm": 0.142578125,
+      "learning_rate": 6.829268292682928e-05,
+      "loss": 0.04992092400789261,
+      "mean_token_accuracy": 0.985265351831913,
+      "num_tokens": 747120.0,
+      "step": 109
+    },
+    {
+      "entropy": 0.24371083825826645,
+      "epoch": 1.343032159264931,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 6.707317073170732e-05,
+      "loss": 0.04801744595170021,
+      "mean_token_accuracy": 0.9857638366520405,
+      "num_tokens": 754708.0,
+      "step": 110
+    },
+    {
+      "entropy": 0.24870567955076694,
+      "epoch": 1.3552833078101072,
+      "grad_norm": 0.12109375,
+      "learning_rate": 6.585365853658538e-05,
+      "loss": 0.04191367328166962,
+      "mean_token_accuracy": 0.9862709790468216,
+      "num_tokens": 760898.0,
+      "step": 111
+    },
+    {
+      "entropy": 0.2612510984763503,
+      "epoch": 1.3675344563552834,
+      "grad_norm": 0.103515625,
+      "learning_rate": 6.463414634146342e-05,
+      "loss": 0.05326298251748085,
+      "mean_token_accuracy": 0.9800548776984215,
+      "num_tokens": 767451.0,
+      "step": 112
+    },
+    {
+      "entropy": 0.2545614130795002,
+      "epoch": 1.3797856049004595,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 6.341463414634146e-05,
+      "loss": 0.040731754153966904,
+      "mean_token_accuracy": 0.9870966114103794,
+      "num_tokens": 775160.0,
+      "step": 113
+    },
+    {
+      "entropy": 0.24170197267085314,
+      "epoch": 1.3920367534456355,
+      "grad_norm": 0.1328125,
+      "learning_rate": 6.219512195121952e-05,
+      "loss": 0.040866196155548096,
+      "mean_token_accuracy": 0.9861873909831047,
+      "num_tokens": 782453.0,
+      "step": 114
+    },
+    {
+      "entropy": 0.26318133249878883,
+      "epoch": 1.4042879019908117,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 6.097560975609756e-05,
+      "loss": 0.03041079081594944,
+      "mean_token_accuracy": 0.989543404430151,
+      "num_tokens": 789405.0,
+      "step": 115
+    },
+    {
+      "entropy": 0.2495063878595829,
+      "epoch": 1.4165390505359878,
+      "grad_norm": 0.12109375,
+      "learning_rate": 5.975609756097561e-05,
+      "loss": 0.03743698075413704,
+      "mean_token_accuracy": 0.9890936017036438,
+      "num_tokens": 795520.0,
+      "step": 116
+    },
+    {
+      "entropy": 0.23341670911759138,
+      "epoch": 1.4287901990811638,
+      "grad_norm": 0.11328125,
+      "learning_rate": 5.853658536585366e-05,
+      "loss": 0.052408114075660706,
+      "mean_token_accuracy": 0.9838856235146523,
+      "num_tokens": 802060.0,
+      "step": 117
+    },
+    {
+      "entropy": 0.2609324613586068,
+      "epoch": 1.44104134762634,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 5.731707317073171e-05,
+      "loss": 0.04312039911746979,
+      "mean_token_accuracy": 0.985971175134182,
+      "num_tokens": 808902.0,
+      "step": 118
+    },
+    {
+      "entropy": 0.2670856877230108,
+      "epoch": 1.4532924961715161,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 5.6097560975609764e-05,
+      "loss": 0.045985687524080276,
+      "mean_token_accuracy": 0.9871717020869255,
+      "num_tokens": 816206.0,
+      "step": 119
+    },
+    {
+      "entropy": 0.26661152858287096,
+      "epoch": 1.4655436447166923,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 5.487804878048781e-05,
+      "loss": 0.04714817553758621,
+      "mean_token_accuracy": 0.9872265867888927,
+      "num_tokens": 822262.0,
+      "step": 120
+    },
+    {
+      "entropy": 0.23576788790524006,
+      "epoch": 1.4777947932618682,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 5.365853658536586e-05,
+      "loss": 0.042331770062446594,
+      "mean_token_accuracy": 0.9874267354607582,
+      "num_tokens": 828344.0,
+      "step": 121
+    },
+    {
+      "entropy": 0.23678721394389868,
+      "epoch": 1.4900459418070444,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 5.2439024390243904e-05,
+      "loss": 0.04589102789759636,
+      "mean_token_accuracy": 0.983386930078268,
+      "num_tokens": 835172.0,
+      "step": 122
+    },
+    {
+      "entropy": 0.2346462979912758,
+      "epoch": 1.5022970903522204,
+      "grad_norm": 0.150390625,
+      "learning_rate": 5.121951219512195e-05,
+      "loss": 0.04002859443426132,
+      "mean_token_accuracy": 0.9853599444031715,
+      "num_tokens": 841225.0,
+      "step": 123
+    },
+    {
+      "entropy": 0.23265949822962284,
+      "epoch": 1.5145482388973965,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 5e-05,
+      "loss": 0.05799969285726547,
+      "mean_token_accuracy": 0.9821320809423923,
+      "num_tokens": 849059.0,
+      "step": 124
+    },
+    {
+      "entropy": 0.24194937106221914,
+      "epoch": 1.5267993874425727,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 4.878048780487805e-05,
+      "loss": 0.05711432918906212,
+      "mean_token_accuracy": 0.9829593598842621,
+      "num_tokens": 857164.0,
+      "step": 125
+    },
+    {
+      "entropy": 0.23494063317775726,
+      "epoch": 1.5390505359877489,
+      "grad_norm": 0.13671875,
+      "learning_rate": 4.75609756097561e-05,
+      "loss": 0.06452606618404388,
+      "mean_token_accuracy": 0.9841964319348335,
+      "num_tokens": 864463.0,
+      "step": 126
+    },
+    {
+      "entropy": 0.2896596472710371,
+      "epoch": 1.551301684532925,
+      "grad_norm": 0.125,
+      "learning_rate": 4.634146341463415e-05,
+      "loss": 0.05577927827835083,
+      "mean_token_accuracy": 0.9853286854922771,
+      "num_tokens": 871759.0,
+      "step": 127
+    },
+    {
+      "entropy": 0.2581626093015075,
+      "epoch": 1.5635528330781012,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 4.51219512195122e-05,
+      "loss": 0.03707893565297127,
+      "mean_token_accuracy": 0.9873116910457611,
+      "num_tokens": 878778.0,
+      "step": 128
+    },
+    {
+      "entropy": 0.22061639744788408,
+      "epoch": 1.5758039816232772,
+      "grad_norm": 0.130859375,
+      "learning_rate": 4.390243902439025e-05,
+      "loss": 0.03345699608325958,
+      "mean_token_accuracy": 0.9894634075462818,
+      "num_tokens": 884824.0,
+      "step": 129
+    },
+    {
+      "entropy": 0.22001465130597353,
+      "epoch": 1.5880551301684533,
+      "grad_norm": 0.115234375,
+      "learning_rate": 4.26829268292683e-05,
+      "loss": 0.033830057829618454,
+      "mean_token_accuracy": 0.9893862381577492,
+      "num_tokens": 891884.0,
+      "step": 130
+    },
+    {
+      "entropy": 0.24686648417264223,
+      "epoch": 1.6003062787136293,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 4.146341463414634e-05,
+      "loss": 0.03980773314833641,
+      "mean_token_accuracy": 0.9880322627723217,
+      "num_tokens": 898482.0,
+      "step": 131
+    },
+    {
+      "entropy": 0.24617432616651058,
+      "epoch": 1.6125574272588055,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 4.0243902439024395e-05,
+      "loss": 0.05706659331917763,
+      "mean_token_accuracy": 0.9778756387531757,
+      "num_tokens": 905832.0,
+      "step": 132
+    },
+    {
+      "entropy": 0.23000300489366055,
+      "epoch": 1.6248085758039816,
+      "grad_norm": 0.11328125,
+      "learning_rate": 3.9024390243902444e-05,
+      "loss": 0.04501105844974518,
+      "mean_token_accuracy": 0.9868872575461864,
+      "num_tokens": 912020.0,
+      "step": 133
+    },
+    {
+      "entropy": 0.23596840538084507,
+      "epoch": 1.6370597243491578,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 3.780487804878049e-05,
+      "loss": 0.057706866413354874,
+      "mean_token_accuracy": 0.9847077578306198,
+      "num_tokens": 918405.0,
+      "step": 134
+    },
+    {
+      "entropy": 0.25483656115829945,
+      "epoch": 1.649310872894334,
+      "grad_norm": 0.13671875,
+      "learning_rate": 3.6585365853658535e-05,
+      "loss": 0.06121227145195007,
+      "mean_token_accuracy": 0.9820427037775517,
+      "num_tokens": 925425.0,
+      "step": 135
+    },
+    {
+      "entropy": 0.2879916010424495,
+      "epoch": 1.66156202143951,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 3.5365853658536584e-05,
+      "loss": 0.03920814394950867,
+      "mean_token_accuracy": 0.9877970777451992,
+      "num_tokens": 930895.0,
+      "step": 136
+    },
+    {
+      "entropy": 0.2634068289771676,
+      "epoch": 1.673813169984686,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 3.414634146341464e-05,
+      "loss": 0.0386468879878521,
+      "mean_token_accuracy": 0.9877878725528717,
+      "num_tokens": 937389.0,
+      "step": 137
+    },
+    {
+      "entropy": 0.2587460493668914,
+      "epoch": 1.686064318529862,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 3.292682926829269e-05,
+      "loss": 0.034930720925331116,
+      "mean_token_accuracy": 0.9874115809798241,
+      "num_tokens": 945016.0,
+      "step": 138
+    },
+    {
+      "entropy": 0.26116301491856575,
+      "epoch": 1.6983154670750382,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 3.170731707317073e-05,
+      "loss": 0.033916398882865906,
+      "mean_token_accuracy": 0.9879581108689308,
+      "num_tokens": 951475.0,
+      "step": 139
+    },
+    {
+      "entropy": 0.2920989394187927,
+      "epoch": 1.7105666156202144,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 3.048780487804878e-05,
+      "loss": 0.03983481973409653,
+      "mean_token_accuracy": 0.9893253818154335,
+      "num_tokens": 957845.0,
+      "step": 140
+    },
+    {
+      "entropy": 0.23128231521695852,
+      "epoch": 1.7228177641653906,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 2.926829268292683e-05,
+      "loss": 0.051980044692754745,
+      "mean_token_accuracy": 0.9810708239674568,
+      "num_tokens": 964800.0,
+      "step": 141
+    },
+    {
+      "entropy": 0.25590753462165594,
+      "epoch": 1.7350689127105667,
+      "grad_norm": 0.171875,
+      "learning_rate": 2.8048780487804882e-05,
+      "loss": 0.04797530174255371,
+      "mean_token_accuracy": 0.98531124740839,
+      "num_tokens": 970216.0,
+      "step": 142
+    },
+    {
+      "entropy": 0.24975391291081905,
+      "epoch": 1.7473200612557427,
+      "grad_norm": 0.125,
+      "learning_rate": 2.682926829268293e-05,
+      "loss": 0.04290381073951721,
+      "mean_token_accuracy": 0.9855599775910378,
+      "num_tokens": 976676.0,
+      "step": 143
+    },
+    {
+      "entropy": 0.2620975775644183,
+      "epoch": 1.7595712098009189,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 2.5609756097560977e-05,
+      "loss": 0.04051404073834419,
+      "mean_token_accuracy": 0.9880265817046165,
+      "num_tokens": 984515.0,
+      "step": 144
+    },
+    {
+      "entropy": 0.2606674963608384,
+      "epoch": 1.7718223583460948,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 2.4390243902439026e-05,
+      "loss": 0.040870461612939835,
+      "mean_token_accuracy": 0.9857275933027267,
+      "num_tokens": 991262.0,
+      "step": 145
+    },
+    {
+      "entropy": 0.28423305694013834,
+      "epoch": 1.784073506891271,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 2.3170731707317075e-05,
+      "loss": 0.033665239810943604,
+      "mean_token_accuracy": 0.9889882653951645,
+      "num_tokens": 997598.0,
+      "step": 146
+    },
+    {
+      "entropy": 0.2581148808822036,
+      "epoch": 1.7963246554364471,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 2.1951219512195124e-05,
+      "loss": 0.04215285927057266,
+      "mean_token_accuracy": 0.985553503036499,
+      "num_tokens": 1005517.0,
+      "step": 147
+    },
+    {
+      "entropy": 0.27665852196514606,
+      "epoch": 1.8085758039816233,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 2.073170731707317e-05,
+      "loss": 0.036060549318790436,
+      "mean_token_accuracy": 0.9891492277383804,
+      "num_tokens": 1012208.0,
+      "step": 148
+    },
+    {
+      "entropy": 0.2916540242731571,
+      "epoch": 1.8208269525267995,
+      "grad_norm": 0.111328125,
+      "learning_rate": 1.9512195121951222e-05,
+      "loss": 0.03392140194773674,
+      "mean_token_accuracy": 0.9888332188129425,
+      "num_tokens": 1018126.0,
+      "step": 149
+    },
+    {
+      "entropy": 0.2635590499266982,
+      "epoch": 1.8330781010719757,
+      "grad_norm": 0.107421875,
+      "learning_rate": 1.8292682926829268e-05,
+      "loss": 0.03831010311841965,
+      "mean_token_accuracy": 0.9885501191020012,
+      "num_tokens": 1025618.0,
+      "step": 150
+    },
+    {
+      "epoch": 1.8330781010719757,
+      "eval_entropy": 0.2679487613664157,
+      "eval_loss": 0.06371253728866577,
+      "eval_mean_token_accuracy": 0.9774324289266614,
+      "eval_num_tokens": 1025618.0,
+      "eval_runtime": 61.7245,
+      "eval_samples_per_second": 1.118,
+      "eval_steps_per_second": 1.118,
+      "step": 150
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 164,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.64412202142761e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-150/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f82b2b4e6dbb457dcbeae9dcaee24c59b51d560b24513b0a9e0c20c6b4ff0bda
+size 5585

checkpoint-164/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:meta-llama/Meta-Llama-3.1-8B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoint-164/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "up_proj",
+    "o_proj",
+    "down_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-164/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b4771c284f33bf9ad924ce4256b47cc6c7144065f0b9859ed6644024d9f16d25
+size 83946192

checkpoint-164/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,109 @@

+{{- bos_token }}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+    {%- set date_string = "26 Jul 2024" %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+{#- System message + builtin tools #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if builtin_tools is defined or tools is not none %}
+    {{- "Environment: ipython\n" }}
+{%- endif %}
+{%- if builtin_tools is defined %}
+    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content']|trim %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+            {%- for arg_name, arg_val in tool_call.arguments | items %}
+                {{- arg_name + '="' + arg_val + '"' }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+                {%- endfor %}
+            {{- ")" }}
+        {%- else  %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- '{"name": "' + tool_call.name + '", ' }}
+            {{- '"parameters": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- "}" }}
+        {%- endif %}
+        {%- if builtin_tools is defined %}
+            {#- This means we're in ipython mode #}
+            {{- "<|eom_id|>" }}
+        {%- else %}
+            {{- "<|eot_id|>" }}
+        {%- endif %}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+        {%- if message.content is mapping or message.content is iterable %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}

checkpoint-164/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d20b82301d3d2a57400ee7dfb5836abaafbd48c39232bd8e0680726a0069f43
+size 85728997

checkpoint-164/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:718a0f3db00824213036a2c0441849791319b7d9cf189065873bb26a7020738e
+size 14645

checkpoint-164/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8706973e007590bfa836db471dff71dd712c2b8887a568e7dc03dcbf8f4f93e7
+size 1465

checkpoint-164/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920

checkpoint-164/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eot_id|>",
+  "is_local": false,
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|eot_id|>",
+  "tokenizer_class": "TokenizersBackend"
+}

checkpoint-164/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1707 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 50,
+  "global_step": 164,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 0.5357014574110508,
+      "epoch": 0.01225114854517611,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0002,
+      "loss": 0.2591501772403717,
+      "mean_token_accuracy": 0.9530366845428944,
+      "num_tokens": 7575.0,
+      "step": 1
+    },
+    {
+      "entropy": 0.4969524908810854,
+      "epoch": 0.02450229709035222,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.00019878048780487805,
+      "loss": 0.19432324171066284,
+      "mean_token_accuracy": 0.9566738158464432,
+      "num_tokens": 16374.0,
+      "step": 2
+    },
+    {
+      "entropy": 0.5178094431757927,
+      "epoch": 0.036753445635528334,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.0001975609756097561,
+      "loss": 0.15099577605724335,
+      "mean_token_accuracy": 0.9616630300879478,
+      "num_tokens": 22935.0,
+      "step": 3
+    },
+    {
+      "entropy": 0.5130382943898439,
+      "epoch": 0.04900459418070444,
+      "grad_norm": 0.25,
+      "learning_rate": 0.00019634146341463416,
+      "loss": 0.12276138365268707,
+      "mean_token_accuracy": 0.967245552688837,
+      "num_tokens": 28404.0,
+      "step": 4
+    },
+    {
+      "entropy": 0.44963230937719345,
+      "epoch": 0.06125574272588055,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001951219512195122,
+      "loss": 0.09725309163331985,
+      "mean_token_accuracy": 0.9740313775837421,
+      "num_tokens": 35549.0,
+      "step": 5
+    },
+    {
+      "entropy": 0.4456397518515587,
+      "epoch": 0.07350689127105667,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 0.00019390243902439025,
+      "loss": 0.1305171251296997,
+      "mean_token_accuracy": 0.9666509628295898,
+      "num_tokens": 42383.0,
+      "step": 6
+    },
+    {
+      "entropy": 0.3939209319651127,
+      "epoch": 0.08575803981623277,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001926829268292683,
+      "loss": 0.09472239017486572,
+      "mean_token_accuracy": 0.974840272217989,
+      "num_tokens": 49995.0,
+      "step": 7
+    },
+    {
+      "entropy": 0.41255801543593407,
+      "epoch": 0.09800918836140889,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.00019146341463414633,
+      "loss": 0.09030990302562714,
+      "mean_token_accuracy": 0.9773416742682457,
+      "num_tokens": 56829.0,
+      "step": 8
+    },
+    {
+      "entropy": 0.41458161920309067,
+      "epoch": 0.11026033690658499,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 0.0001902439024390244,
+      "loss": 0.10670135170221329,
+      "mean_token_accuracy": 0.9706939905881882,
+      "num_tokens": 63889.0,
+      "step": 9
+    },
+    {
+      "entropy": 0.38586485385894775,
+      "epoch": 0.1225114854517611,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.00018902439024390244,
+      "loss": 0.11325996369123459,
+      "mean_token_accuracy": 0.9668439663946629,
+      "num_tokens": 70927.0,
+      "step": 10
+    },
+    {
+      "entropy": 0.3774546254426241,
+      "epoch": 0.13476263399693722,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001878048780487805,
+      "loss": 0.07076064497232437,
+      "mean_token_accuracy": 0.9780124798417091,
+      "num_tokens": 78426.0,
+      "step": 11
+    },
+    {
+      "entropy": 0.3956366563215852,
+      "epoch": 0.14701378254211334,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.00018658536585365856,
+      "loss": 0.08720126003026962,
+      "mean_token_accuracy": 0.9750143773853779,
+      "num_tokens": 87141.0,
+      "step": 12
+    },
+    {
+      "entropy": 0.37218748684972525,
+      "epoch": 0.15926493108728942,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001853658536585366,
+      "loss": 0.08891582489013672,
+      "mean_token_accuracy": 0.9692744836211205,
+      "num_tokens": 95681.0,
+      "step": 13
+    },
+    {
+      "entropy": 0.3841435620561242,
+      "epoch": 0.17151607963246554,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.00018414634146341464,
+      "loss": 0.08403376489877701,
+      "mean_token_accuracy": 0.9721223004162312,
+      "num_tokens": 102341.0,
+      "step": 14
+    },
+    {
+      "entropy": 0.34824744425714016,
+      "epoch": 0.18376722817764166,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001829268292682927,
+      "loss": 0.10026705265045166,
+      "mean_token_accuracy": 0.9667329825460911,
+      "num_tokens": 109665.0,
+      "step": 15
+    },
+    {
+      "entropy": 0.3537330040708184,
+      "epoch": 0.19601837672281777,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.00018170731707317075,
+      "loss": 0.06809324026107788,
+      "mean_token_accuracy": 0.9803769923746586,
+      "num_tokens": 116212.0,
+      "step": 16
+    },
+    {
+      "entropy": 0.3518688417971134,
+      "epoch": 0.2082695252679939,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001804878048780488,
+      "loss": 0.07201241701841354,
+      "mean_token_accuracy": 0.9769125580787659,
+      "num_tokens": 122805.0,
+      "step": 17
+    },
+    {
+      "entropy": 0.33192048501223326,
+      "epoch": 0.22052067381316998,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.00017926829268292684,
+      "loss": 0.08568219095468521,
+      "mean_token_accuracy": 0.9731468558311462,
+      "num_tokens": 129809.0,
+      "step": 18
+    },
+    {
+      "entropy": 0.38278588838875294,
+      "epoch": 0.2327718223583461,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.00017804878048780488,
+      "loss": 0.09338308125734329,
+      "mean_token_accuracy": 0.971602164208889,
+      "num_tokens": 136394.0,
+      "step": 19
+    },
+    {
+      "entropy": 0.3615857381373644,
+      "epoch": 0.2450229709035222,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.00017682926829268295,
+      "loss": 0.09092211723327637,
+      "mean_token_accuracy": 0.9741932302713394,
+      "num_tokens": 143441.0,
+      "step": 20
+    },
+    {
+      "entropy": 0.36908658128231764,
+      "epoch": 0.2572741194486983,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.000175609756097561,
+      "loss": 0.08083514869213104,
+      "mean_token_accuracy": 0.9730581529438496,
+      "num_tokens": 150360.0,
+      "step": 21
+    },
+    {
+      "entropy": 0.33506167121231556,
+      "epoch": 0.26952526799387444,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.00017439024390243903,
+      "loss": 0.07429873198270798,
+      "mean_token_accuracy": 0.9798649623990059,
+      "num_tokens": 158201.0,
+      "step": 22
+    },
+    {
+      "entropy": 0.34483147878199816,
+      "epoch": 0.28177641653905056,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.00017317073170731708,
+      "loss": 0.097615085542202,
+      "mean_token_accuracy": 0.9703259542584419,
+      "num_tokens": 165103.0,
+      "step": 23
+    },
+    {
+      "entropy": 0.3624556018039584,
+      "epoch": 0.29402756508422667,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.00017195121951219512,
+      "loss": 0.09813392162322998,
+      "mean_token_accuracy": 0.971100814640522,
+      "num_tokens": 172845.0,
+      "step": 24
+    },
+    {
+      "entropy": 0.3426302410662174,
+      "epoch": 0.30627871362940273,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001707317073170732,
+      "loss": 0.10315398871898651,
+      "mean_token_accuracy": 0.9727907553315163,
+      "num_tokens": 179310.0,
+      "step": 25
+    },
+    {
+      "entropy": 0.32603410072624683,
+      "epoch": 0.31852986217457885,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.00016951219512195123,
+      "loss": 0.08456183224916458,
+      "mean_token_accuracy": 0.9688506498932838,
+      "num_tokens": 186592.0,
+      "step": 26
+    },
+    {
+      "entropy": 0.3102861074730754,
+      "epoch": 0.33078101071975496,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.00016829268292682927,
+      "loss": 0.07856184989213943,
+      "mean_token_accuracy": 0.9766327068209648,
+      "num_tokens": 193170.0,
+      "step": 27
+    },
+    {
+      "entropy": 0.32588900811970234,
+      "epoch": 0.3430321592649311,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.00016707317073170731,
+      "loss": 0.07468702644109726,
+      "mean_token_accuracy": 0.9745533689856529,
+      "num_tokens": 200237.0,
+      "step": 28
+    },
+    {
+      "entropy": 0.36062031611800194,
+      "epoch": 0.3552833078101072,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.00016585365853658536,
+      "loss": 0.07607633620500565,
+      "mean_token_accuracy": 0.9738306701183319,
+      "num_tokens": 206965.0,
+      "step": 29
+    },
+    {
+      "entropy": 0.3694803323596716,
+      "epoch": 0.3675344563552833,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.00016463414634146343,
+      "loss": 0.07808130979537964,
+      "mean_token_accuracy": 0.9752037785947323,
+      "num_tokens": 214253.0,
+      "step": 30
+    },
+    {
+      "entropy": 0.34707722812891006,
+      "epoch": 0.37978560490045943,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.00016341463414634147,
+      "loss": 0.09708584100008011,
+      "mean_token_accuracy": 0.968984991312027,
+      "num_tokens": 219669.0,
+      "step": 31
+    },
+    {
+      "entropy": 0.34607914835214615,
+      "epoch": 0.39203675344563554,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.00016219512195121954,
+      "loss": 0.0862056165933609,
+      "mean_token_accuracy": 0.975422702729702,
+      "num_tokens": 227923.0,
+      "step": 32
+    },
+    {
+      "entropy": 0.3694024868309498,
+      "epoch": 0.40428790199081166,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.00016097560975609758,
+      "loss": 0.07349187880754471,
+      "mean_token_accuracy": 0.9769446365535259,
+      "num_tokens": 234412.0,
+      "step": 33
+    },
+    {
+      "entropy": 0.3105186866596341,
+      "epoch": 0.4165390505359878,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.00015975609756097562,
+      "loss": 0.092418372631073,
+      "mean_token_accuracy": 0.9768914133310318,
+      "num_tokens": 241587.0,
+      "step": 34
+    },
+    {
+      "entropy": 0.29616252705454826,
+      "epoch": 0.42879019908116384,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.00015853658536585366,
+      "loss": 0.06796497106552124,
+      "mean_token_accuracy": 0.9777620621025562,
+      "num_tokens": 249133.0,
+      "step": 35
+    },
+    {
+      "entropy": 0.32076778169721365,
+      "epoch": 0.44104134762633995,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.00015731707317073173,
+      "loss": 0.06996235251426697,
+      "mean_token_accuracy": 0.9789225980639458,
+      "num_tokens": 255805.0,
+      "step": 36
+    },
+    {
+      "entropy": 0.277026129886508,
+      "epoch": 0.45329249617151607,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.00015609756097560978,
+      "loss": 0.06752780079841614,
+      "mean_token_accuracy": 0.9788497537374496,
+      "num_tokens": 262962.0,
+      "step": 37
+    },
+    {
+      "entropy": 0.3057912113144994,
+      "epoch": 0.4655436447166922,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.00015487804878048782,
+      "loss": 0.07604770362377167,
+      "mean_token_accuracy": 0.9693816341459751,
+      "num_tokens": 268812.0,
+      "step": 38
+    },
+    {
+      "entropy": 0.2962911752983928,
+      "epoch": 0.4777947932618683,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.00015365853658536586,
+      "loss": 0.08501230180263519,
+      "mean_token_accuracy": 0.969777375459671,
+      "num_tokens": 275542.0,
+      "step": 39
+    },
+    {
+      "entropy": 0.29683656617999077,
+      "epoch": 0.4900459418070444,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001524390243902439,
+      "loss": 0.07492107152938843,
+      "mean_token_accuracy": 0.9751906730234623,
+      "num_tokens": 282460.0,
+      "step": 40
+    },
+    {
+      "entropy": 0.29800107702612877,
+      "epoch": 0.5022970903522205,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.00015121951219512197,
+      "loss": 0.07613345235586166,
+      "mean_token_accuracy": 0.9772711955010891,
+      "num_tokens": 287662.0,
+      "step": 41
+    },
+    {
+      "entropy": 0.287130375392735,
+      "epoch": 0.5145482388973966,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.07252758741378784,
+      "mean_token_accuracy": 0.9758713953197002,
+      "num_tokens": 294018.0,
+      "step": 42
+    },
+    {
+      "entropy": 0.31403718050569296,
+      "epoch": 0.5267993874425727,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.00014878048780487806,
+      "loss": 0.07692913711071014,
+      "mean_token_accuracy": 0.9735661074519157,
+      "num_tokens": 301127.0,
+      "step": 43
+    },
+    {
+      "entropy": 0.32532981038093567,
+      "epoch": 0.5390505359877489,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001475609756097561,
+      "loss": 0.06646531820297241,
+      "mean_token_accuracy": 0.9817029014229774,
+      "num_tokens": 307985.0,
+      "step": 44
+    },
+    {
+      "entropy": 0.29122561775147915,
+      "epoch": 0.5513016845329249,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.00014634146341463414,
+      "loss": 0.08177122473716736,
+      "mean_token_accuracy": 0.9714771695435047,
+      "num_tokens": 314911.0,
+      "step": 45
+    },
+    {
+      "entropy": 0.31051622424274683,
+      "epoch": 0.5635528330781011,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001451219512195122,
+      "loss": 0.08793994784355164,
+      "mean_token_accuracy": 0.9757628589868546,
+      "num_tokens": 321376.0,
+      "step": 46
+    },
+    {
+      "entropy": 0.31927434727549553,
+      "epoch": 0.5758039816232772,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.00014390243902439025,
+      "loss": 0.07368139922618866,
+      "mean_token_accuracy": 0.9764612726867199,
+      "num_tokens": 327751.0,
+      "step": 47
+    },
+    {
+      "entropy": 0.31811847630888224,
+      "epoch": 0.5880551301684533,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001426829268292683,
+      "loss": 0.07313578575849533,
+      "mean_token_accuracy": 0.9781395755708218,
+      "num_tokens": 333164.0,
+      "step": 48
+    },
+    {
+      "entropy": 0.33736035507172346,
+      "epoch": 0.6003062787136294,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.00014146341463414634,
+      "loss": 0.06882140785455704,
+      "mean_token_accuracy": 0.9774826094508171,
+      "num_tokens": 339755.0,
+      "step": 49
+    },
+    {
+      "entropy": 0.31375813484191895,
+      "epoch": 0.6125574272588055,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.00014024390243902438,
+      "loss": 0.07694698125123978,
+      "mean_token_accuracy": 0.9765506535768509,
+      "num_tokens": 346465.0,
+      "step": 50
+    },
+    {
+      "epoch": 0.6125574272588055,
+      "eval_entropy": 0.3228047593780186,
+      "eval_loss": 0.06397496163845062,
+      "eval_mean_token_accuracy": 0.9778102247611337,
+      "eval_num_tokens": 346465.0,
+      "eval_runtime": 61.6139,
+      "eval_samples_per_second": 1.12,
+      "eval_steps_per_second": 1.12,
+      "step": 50
+    },
+    {
+      "entropy": 0.33666181843727827,
+      "epoch": 0.6248085758039816,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.00013902439024390245,
+      "loss": 0.07857983559370041,
+      "mean_token_accuracy": 0.9755821079015732,
+      "num_tokens": 352546.0,
+      "step": 51
+    },
+    {
+      "entropy": 0.3325382797047496,
+      "epoch": 0.6370597243491577,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001378048780487805,
+      "loss": 0.055708881467580795,
+      "mean_token_accuracy": 0.9814197011291981,
+      "num_tokens": 358950.0,
+      "step": 52
+    },
+    {
+      "entropy": 0.32812063954770565,
+      "epoch": 0.6493108728943339,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.00013658536585365856,
+      "loss": 0.07367786020040512,
+      "mean_token_accuracy": 0.9777578823268414,
+      "num_tokens": 366288.0,
+      "step": 53
+    },
+    {
+      "entropy": 0.2817836347967386,
+      "epoch": 0.6615620214395099,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001353658536585366,
+      "loss": 0.0578315332531929,
+      "mean_token_accuracy": 0.9810350127518177,
+      "num_tokens": 372898.0,
+      "step": 54
+    },
+    {
+      "entropy": 0.32251426950097084,
+      "epoch": 0.6738131699846861,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.00013414634146341464,
+      "loss": 0.06909041851758957,
+      "mean_token_accuracy": 0.9752591475844383,
+      "num_tokens": 380158.0,
+      "step": 55
+    },
+    {
+      "entropy": 0.32180769462138414,
+      "epoch": 0.6860643185298622,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001329268292682927,
+      "loss": 0.07875123620033264,
+      "mean_token_accuracy": 0.9738615117967129,
+      "num_tokens": 386269.0,
+      "step": 56
+    },
+    {
+      "entropy": 0.307281319051981,
+      "epoch": 0.6983154670750383,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.00013170731707317076,
+      "loss": 0.07773981243371964,
+      "mean_token_accuracy": 0.9736250452697277,
+      "num_tokens": 394382.0,
+      "step": 57
+    },
+    {
+      "entropy": 0.29658956825733185,
+      "epoch": 0.7105666156202144,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001304878048780488,
+      "loss": 0.05800808221101761,
+      "mean_token_accuracy": 0.9793435782194138,
+      "num_tokens": 401510.0,
+      "step": 58
+    },
+    {
+      "entropy": 0.34156549349427223,
+      "epoch": 0.7228177641653905,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.00012926829268292684,
+      "loss": 0.07996754348278046,
+      "mean_token_accuracy": 0.9773447252810001,
+      "num_tokens": 407257.0,
+      "step": 59
+    },
+    {
+      "entropy": 0.3153263973072171,
+      "epoch": 0.7350689127105666,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.00012804878048780488,
+      "loss": 0.07675991952419281,
+      "mean_token_accuracy": 0.9762718975543976,
+      "num_tokens": 413752.0,
+      "step": 60
+    },
+    {
+      "entropy": 0.28962062764912844,
+      "epoch": 0.7473200612557427,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.00012682926829268293,
+      "loss": 0.08181046694517136,
+      "mean_token_accuracy": 0.9704542681574821,
+      "num_tokens": 421184.0,
+      "step": 61
+    },
+    {
+      "entropy": 0.3103124424815178,
+      "epoch": 0.7595712098009189,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.000125609756097561,
+      "loss": 0.07483275234699249,
+      "mean_token_accuracy": 0.9754621051251888,
+      "num_tokens": 428988.0,
+      "step": 62
+    },
+    {
+      "entropy": 0.27847875375300646,
+      "epoch": 0.7718223583460949,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.00012439024390243904,
+      "loss": 0.06930931657552719,
+      "mean_token_accuracy": 0.9753326959908009,
+      "num_tokens": 436040.0,
+      "step": 63
+    },
+    {
+      "entropy": 0.2668089345097542,
+      "epoch": 0.7840735068912711,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.00012317073170731708,
+      "loss": 0.06888818740844727,
+      "mean_token_accuracy": 0.9757619500160217,
+      "num_tokens": 442659.0,
+      "step": 64
+    },
+    {
+      "entropy": 0.3198395315557718,
+      "epoch": 0.7963246554364471,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.00012195121951219512,
+      "loss": 0.05557462200522423,
+      "mean_token_accuracy": 0.984383974224329,
+      "num_tokens": 449165.0,
+      "step": 65
+    },
+    {
+      "entropy": 0.30934575013816357,
+      "epoch": 0.8085758039816233,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.00012073170731707318,
+      "loss": 0.06038631126284599,
+      "mean_token_accuracy": 0.98052117228508,
+      "num_tokens": 456668.0,
+      "step": 66
+    },
+    {
+      "entropy": 0.30664732959121466,
+      "epoch": 0.8208269525267994,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.00011951219512195122,
+      "loss": 0.06068733334541321,
+      "mean_token_accuracy": 0.9787427373230457,
+      "num_tokens": 463055.0,
+      "step": 67
+    },
+    {
+      "entropy": 0.32763790991157293,
+      "epoch": 0.8330781010719756,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.00011829268292682926,
+      "loss": 0.06245455890893936,
+      "mean_token_accuracy": 0.977917030453682,
+      "num_tokens": 469027.0,
+      "step": 68
+    },
+    {
+      "entropy": 0.30262293945997953,
+      "epoch": 0.8453292496171516,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.00011707317073170732,
+      "loss": 0.08869794756174088,
+      "mean_token_accuracy": 0.9743811637163162,
+      "num_tokens": 474692.0,
+      "step": 69
+    },
+    {
+      "entropy": 0.3196303751319647,
+      "epoch": 0.8575803981623277,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.00011585365853658536,
+      "loss": 0.09405583888292313,
+      "mean_token_accuracy": 0.9726175181567669,
+      "num_tokens": 480966.0,
+      "step": 70
+    },
+    {
+      "entropy": 0.35709446109831333,
+      "epoch": 0.8698315467075038,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.00011463414634146342,
+      "loss": 0.06765145808458328,
+      "mean_token_accuracy": 0.9785624854266644,
+      "num_tokens": 487243.0,
+      "step": 71
+    },
+    {
+      "entropy": 0.3187448363751173,
+      "epoch": 0.8820826952526799,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.00011341463414634146,
+      "loss": 0.11273079365491867,
+      "mean_token_accuracy": 0.968480296432972,
+      "num_tokens": 493644.0,
+      "step": 72
+    },
+    {
+      "entropy": 0.338912196457386,
+      "epoch": 0.8943338437978561,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.00011219512195121953,
+      "loss": 0.07907245308160782,
+      "mean_token_accuracy": 0.9740516655147076,
+      "num_tokens": 501266.0,
+      "step": 73
+    },
+    {
+      "entropy": 0.32440096139907837,
+      "epoch": 0.9065849923430321,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.00011097560975609757,
+      "loss": 0.05401031672954559,
+      "mean_token_accuracy": 0.9804942272603512,
+      "num_tokens": 508602.0,
+      "step": 74
+    },
+    {
+      "entropy": 0.3483101995661855,
+      "epoch": 0.9188361408882083,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.00010975609756097563,
+      "loss": 0.11057186871767044,
+      "mean_token_accuracy": 0.9718486294150352,
+      "num_tokens": 515398.0,
+      "step": 75
+    },
+    {
+      "entropy": 0.34578234050422907,
+      "epoch": 0.9310872894333844,
+      "grad_norm": 0.125,
+      "learning_rate": 0.00010853658536585367,
+      "loss": 0.06668201088905334,
+      "mean_token_accuracy": 0.9788769222795963,
+      "num_tokens": 521843.0,
+      "step": 76
+    },
+    {
+      "entropy": 0.30221153143793344,
+      "epoch": 0.9433384379785605,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.00010731707317073172,
+      "loss": 0.06314530223608017,
+      "mean_token_accuracy": 0.9795791730284691,
+      "num_tokens": 529048.0,
+      "step": 77
+    },
+    {
+      "entropy": 0.3246979024261236,
+      "epoch": 0.9555895865237366,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.00010609756097560977,
+      "loss": 0.06752847880125046,
+      "mean_token_accuracy": 0.9832794591784477,
+      "num_tokens": 536898.0,
+      "step": 78
+    },
+    {
+      "entropy": 0.31460305489599705,
+      "epoch": 0.9678407350689127,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.00010487804878048781,
+      "loss": 0.07550302147865295,
+      "mean_token_accuracy": 0.9745542369782925,
+      "num_tokens": 543183.0,
+      "step": 79
+    },
+    {
+      "entropy": 0.29604928102344275,
+      "epoch": 0.9800918836140888,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.00010365853658536586,
+      "loss": 0.061620082706213,
+      "mean_token_accuracy": 0.9814895763993263,
+      "num_tokens": 550826.0,
+      "step": 80
+    },
+    {
+      "entropy": 0.309127070941031,
+      "epoch": 0.9923430321592649,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001024390243902439,
+      "loss": 0.08857648074626923,
+      "mean_token_accuracy": 0.9735055603086948,
+      "num_tokens": 557586.0,
+      "step": 81
+    },
+    {
+      "entropy": 0.28887457996606825,
+      "epoch": 1.0,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.00010121951219512196,
+      "loss": 0.06288491934537888,
+      "mean_token_accuracy": 0.9819431960582733,
+      "num_tokens": 561550.0,
+      "step": 82
+    },
+    {
+      "entropy": 0.3049680180847645,
+      "epoch": 1.0122511485451762,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.08012041449546814,
+      "mean_token_accuracy": 0.9773442409932613,
+      "num_tokens": 568728.0,
+      "step": 83
+    },
+    {
+      "entropy": 0.26021771878004074,
+      "epoch": 1.0245022970903521,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 9.878048780487805e-05,
+      "loss": 0.04987334832549095,
+      "mean_token_accuracy": 0.9872907213866711,
+      "num_tokens": 575388.0,
+      "step": 84
+    },
+    {
+      "entropy": 0.3132081003859639,
+      "epoch": 1.0367534456355283,
+      "grad_norm": 0.0693359375,
+      "learning_rate": 9.75609756097561e-05,
+      "loss": 0.03793306648731232,
+      "mean_token_accuracy": 0.9893307648599148,
+      "num_tokens": 583555.0,
+      "step": 85
+    },
+    {
+      "entropy": 0.2535849278792739,
+      "epoch": 1.0490045941807045,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 9.634146341463415e-05,
+      "loss": 0.04893108084797859,
+      "mean_token_accuracy": 0.9859805703163147,
+      "num_tokens": 589967.0,
+      "step": 86
+    },
+    {
+      "entropy": 0.23509769327938557,
+      "epoch": 1.0612557427258806,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 9.51219512195122e-05,
+      "loss": 0.0479637086391449,
+      "mean_token_accuracy": 0.9847861491143703,
+      "num_tokens": 598042.0,
+      "step": 87
+    },
+    {
+      "entropy": 0.30080515146255493,
+      "epoch": 1.0735068912710566,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 9.390243902439024e-05,
+      "loss": 0.08284764736890793,
+      "mean_token_accuracy": 0.9821349158883095,
+      "num_tokens": 604464.0,
+      "step": 88
+    },
+    {
+      "entropy": 0.2792583117261529,
+      "epoch": 1.0857580398162328,
+      "grad_norm": 0.099609375,
+      "learning_rate": 9.26829268292683e-05,
+      "loss": 0.04615325853228569,
+      "mean_token_accuracy": 0.9861109368503094,
+      "num_tokens": 610667.0,
+      "step": 89
+    },
+    {
+      "entropy": 0.2520109824836254,
+      "epoch": 1.098009188361409,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 9.146341463414635e-05,
+      "loss": 0.051208432763814926,
+      "mean_token_accuracy": 0.9884813874959946,
+      "num_tokens": 617684.0,
+      "step": 90
+    },
+    {
+      "entropy": 0.2811925411224365,
+      "epoch": 1.110260336906585,
+      "grad_norm": 0.10546875,
+      "learning_rate": 9.02439024390244e-05,
+      "loss": 0.045838613063097,
+      "mean_token_accuracy": 0.9813198745250702,
+      "num_tokens": 624400.0,
+      "step": 91
+    },
+    {
+      "entropy": 0.24536922946572304,
+      "epoch": 1.122511485451761,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 8.902439024390244e-05,
+      "loss": 0.04839828237891197,
+      "mean_token_accuracy": 0.9828252196311951,
+      "num_tokens": 631186.0,
+      "step": 92
+    },
+    {
+      "entropy": 0.22986232955008745,
+      "epoch": 1.1347626339969372,
+      "grad_norm": 0.111328125,
+      "learning_rate": 8.78048780487805e-05,
+      "loss": 0.04496093466877937,
+      "mean_token_accuracy": 0.9868007227778435,
+      "num_tokens": 638230.0,
+      "step": 93
+    },
+    {
+      "entropy": 0.24173379130661488,
+      "epoch": 1.1470137825421134,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 8.658536585365854e-05,
+      "loss": 0.038931019604206085,
+      "mean_token_accuracy": 0.9905626736581326,
+      "num_tokens": 644599.0,
+      "step": 94
+    },
+    {
+      "entropy": 0.22757203690707684,
+      "epoch": 1.1592649310872893,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 8.53658536585366e-05,
+      "loss": 0.03816759213805199,
+      "mean_token_accuracy": 0.9886907860636711,
+      "num_tokens": 651211.0,
+      "step": 95
+    },
+    {
+      "entropy": 0.22881112340837717,
+      "epoch": 1.1715160796324655,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 8.414634146341464e-05,
+      "loss": 0.0442253053188324,
+      "mean_token_accuracy": 0.9876982606947422,
+      "num_tokens": 657240.0,
+      "step": 96
+    },
+    {
+      "entropy": 0.23342516459524632,
+      "epoch": 1.1837672281776417,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 8.292682926829268e-05,
+      "loss": 0.0382809042930603,
+      "mean_token_accuracy": 0.9902381077408791,
+      "num_tokens": 663894.0,
+      "step": 97
+    },
+    {
+      "entropy": 0.2406658846884966,
+      "epoch": 1.1960183767228179,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 8.170731707317073e-05,
+      "loss": 0.04963719844818115,
+      "mean_token_accuracy": 0.9877509213984013,
+      "num_tokens": 671338.0,
+      "step": 98
+    },
+    {
+      "entropy": 0.272550736553967,
+      "epoch": 1.2082695252679938,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 8.048780487804879e-05,
+      "loss": 0.05441854149103165,
+      "mean_token_accuracy": 0.983638808131218,
+      "num_tokens": 679108.0,
+      "step": 99
+    },
+    {
+      "entropy": 0.2297075460664928,
+      "epoch": 1.22052067381317,
+      "grad_norm": 0.142578125,
+      "learning_rate": 7.926829268292683e-05,
+      "loss": 0.04290780425071716,
+      "mean_token_accuracy": 0.9863294512033463,
+      "num_tokens": 685572.0,
+      "step": 100
+    },
+    {
+      "epoch": 1.22052067381317,
+      "eval_entropy": 0.2541272067937298,
+      "eval_loss": 0.06591593474149704,
+      "eval_mean_token_accuracy": 0.9771205471909564,
+      "eval_num_tokens": 685572.0,
+      "eval_runtime": 61.7088,
+      "eval_samples_per_second": 1.118,
+      "eval_steps_per_second": 1.118,
+      "step": 100
+    },
+    {
+      "entropy": 0.25287739746272564,
+      "epoch": 1.2327718223583461,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 7.804878048780489e-05,
+      "loss": 0.03635338693857193,
+      "mean_token_accuracy": 0.9891670718789101,
+      "num_tokens": 692888.0,
+      "step": 101
+    },
+    {
+      "entropy": 0.23323721811175346,
+      "epoch": 1.245022970903522,
+      "grad_norm": 0.091796875,
+      "learning_rate": 7.682926829268293e-05,
+      "loss": 0.03589210659265518,
+      "mean_token_accuracy": 0.9888128153979778,
+      "num_tokens": 700015.0,
+      "step": 102
+    },
+    {
+      "entropy": 0.24865772109478712,
+      "epoch": 1.2572741194486983,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 7.560975609756099e-05,
+      "loss": 0.0306346844881773,
+      "mean_token_accuracy": 0.9904107972979546,
+      "num_tokens": 707330.0,
+      "step": 103
+    },
+    {
+      "entropy": 0.2551306625828147,
+      "epoch": 1.2695252679938744,
+      "grad_norm": 0.134765625,
+      "learning_rate": 7.439024390243903e-05,
+      "loss": 0.062116604298353195,
+      "mean_token_accuracy": 0.9804923608899117,
+      "num_tokens": 714185.0,
+      "step": 104
+    },
+    {
+      "entropy": 0.26964253932237625,
+      "epoch": 1.2817764165390506,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 7.317073170731707e-05,
+      "loss": 0.05495516210794449,
+      "mean_token_accuracy": 0.983124740421772,
+      "num_tokens": 720030.0,
+      "step": 105
+    },
+    {
+      "entropy": 0.2399951433762908,
+      "epoch": 1.2940275650842268,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 7.195121951219513e-05,
+      "loss": 0.0537109449505806,
+      "mean_token_accuracy": 0.9835550487041473,
+      "num_tokens": 727399.0,
+      "step": 106
+    },
+    {
+      "entropy": 0.2527354145422578,
+      "epoch": 1.3062787136294027,
+      "grad_norm": 0.095703125,
+      "learning_rate": 7.073170731707317e-05,
+      "loss": 0.0370502732694149,
+      "mean_token_accuracy": 0.987937405705452,
+      "num_tokens": 734117.0,
+      "step": 107
+    },
+    {
+      "entropy": 0.2708861446008086,
+      "epoch": 1.318529862174579,
+      "grad_norm": 0.1328125,
+      "learning_rate": 6.951219512195122e-05,
+      "loss": 0.02873871847987175,
+      "mean_token_accuracy": 0.9898889176547527,
+      "num_tokens": 739843.0,
+      "step": 108
+    },
+    {
+      "entropy": 0.24508011247962713,
+      "epoch": 1.3307810107197549,
+      "grad_norm": 0.142578125,
+      "learning_rate": 6.829268292682928e-05,
+      "loss": 0.04992092400789261,
+      "mean_token_accuracy": 0.985265351831913,
+      "num_tokens": 747120.0,
+      "step": 109
+    },
+    {
+      "entropy": 0.24371083825826645,
+      "epoch": 1.343032159264931,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 6.707317073170732e-05,
+      "loss": 0.04801744595170021,
+      "mean_token_accuracy": 0.9857638366520405,
+      "num_tokens": 754708.0,
+      "step": 110
+    },
+    {
+      "entropy": 0.24870567955076694,
+      "epoch": 1.3552833078101072,
+      "grad_norm": 0.12109375,
+      "learning_rate": 6.585365853658538e-05,
+      "loss": 0.04191367328166962,
+      "mean_token_accuracy": 0.9862709790468216,
+      "num_tokens": 760898.0,
+      "step": 111
+    },
+    {
+      "entropy": 0.2612510984763503,
+      "epoch": 1.3675344563552834,
+      "grad_norm": 0.103515625,
+      "learning_rate": 6.463414634146342e-05,
+      "loss": 0.05326298251748085,
+      "mean_token_accuracy": 0.9800548776984215,
+      "num_tokens": 767451.0,
+      "step": 112
+    },
+    {
+      "entropy": 0.2545614130795002,
+      "epoch": 1.3797856049004595,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 6.341463414634146e-05,
+      "loss": 0.040731754153966904,
+      "mean_token_accuracy": 0.9870966114103794,
+      "num_tokens": 775160.0,
+      "step": 113
+    },
+    {
+      "entropy": 0.24170197267085314,
+      "epoch": 1.3920367534456355,
+      "grad_norm": 0.1328125,
+      "learning_rate": 6.219512195121952e-05,
+      "loss": 0.040866196155548096,
+      "mean_token_accuracy": 0.9861873909831047,
+      "num_tokens": 782453.0,
+      "step": 114
+    },
+    {
+      "entropy": 0.26318133249878883,
+      "epoch": 1.4042879019908117,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 6.097560975609756e-05,
+      "loss": 0.03041079081594944,
+      "mean_token_accuracy": 0.989543404430151,
+      "num_tokens": 789405.0,
+      "step": 115
+    },
+    {
+      "entropy": 0.2495063878595829,
+      "epoch": 1.4165390505359878,
+      "grad_norm": 0.12109375,
+      "learning_rate": 5.975609756097561e-05,
+      "loss": 0.03743698075413704,
+      "mean_token_accuracy": 0.9890936017036438,
+      "num_tokens": 795520.0,
+      "step": 116
+    },
+    {
+      "entropy": 0.23341670911759138,
+      "epoch": 1.4287901990811638,
+      "grad_norm": 0.11328125,
+      "learning_rate": 5.853658536585366e-05,
+      "loss": 0.052408114075660706,
+      "mean_token_accuracy": 0.9838856235146523,
+      "num_tokens": 802060.0,
+      "step": 117
+    },
+    {
+      "entropy": 0.2609324613586068,
+      "epoch": 1.44104134762634,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 5.731707317073171e-05,
+      "loss": 0.04312039911746979,
+      "mean_token_accuracy": 0.985971175134182,
+      "num_tokens": 808902.0,
+      "step": 118
+    },
+    {
+      "entropy": 0.2670856877230108,
+      "epoch": 1.4532924961715161,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 5.6097560975609764e-05,
+      "loss": 0.045985687524080276,
+      "mean_token_accuracy": 0.9871717020869255,
+      "num_tokens": 816206.0,
+      "step": 119
+    },
+    {
+      "entropy": 0.26661152858287096,
+      "epoch": 1.4655436447166923,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 5.487804878048781e-05,
+      "loss": 0.04714817553758621,
+      "mean_token_accuracy": 0.9872265867888927,
+      "num_tokens": 822262.0,
+      "step": 120
+    },
+    {
+      "entropy": 0.23576788790524006,
+      "epoch": 1.4777947932618682,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 5.365853658536586e-05,
+      "loss": 0.042331770062446594,
+      "mean_token_accuracy": 0.9874267354607582,
+      "num_tokens": 828344.0,
+      "step": 121
+    },
+    {
+      "entropy": 0.23678721394389868,
+      "epoch": 1.4900459418070444,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 5.2439024390243904e-05,
+      "loss": 0.04589102789759636,
+      "mean_token_accuracy": 0.983386930078268,
+      "num_tokens": 835172.0,
+      "step": 122
+    },
+    {
+      "entropy": 0.2346462979912758,
+      "epoch": 1.5022970903522204,
+      "grad_norm": 0.150390625,
+      "learning_rate": 5.121951219512195e-05,
+      "loss": 0.04002859443426132,
+      "mean_token_accuracy": 0.9853599444031715,
+      "num_tokens": 841225.0,
+      "step": 123
+    },
+    {
+      "entropy": 0.23265949822962284,
+      "epoch": 1.5145482388973965,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 5e-05,
+      "loss": 0.05799969285726547,
+      "mean_token_accuracy": 0.9821320809423923,
+      "num_tokens": 849059.0,
+      "step": 124
+    },
+    {
+      "entropy": 0.24194937106221914,
+      "epoch": 1.5267993874425727,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 4.878048780487805e-05,
+      "loss": 0.05711432918906212,
+      "mean_token_accuracy": 0.9829593598842621,
+      "num_tokens": 857164.0,
+      "step": 125
+    },
+    {
+      "entropy": 0.23494063317775726,
+      "epoch": 1.5390505359877489,
+      "grad_norm": 0.13671875,
+      "learning_rate": 4.75609756097561e-05,
+      "loss": 0.06452606618404388,
+      "mean_token_accuracy": 0.9841964319348335,
+      "num_tokens": 864463.0,
+      "step": 126
+    },
+    {
+      "entropy": 0.2896596472710371,
+      "epoch": 1.551301684532925,
+      "grad_norm": 0.125,
+      "learning_rate": 4.634146341463415e-05,
+      "loss": 0.05577927827835083,
+      "mean_token_accuracy": 0.9853286854922771,
+      "num_tokens": 871759.0,
+      "step": 127
+    },
+    {
+      "entropy": 0.2581626093015075,
+      "epoch": 1.5635528330781012,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 4.51219512195122e-05,
+      "loss": 0.03707893565297127,
+      "mean_token_accuracy": 0.9873116910457611,
+      "num_tokens": 878778.0,
+      "step": 128
+    },
+    {
+      "entropy": 0.22061639744788408,
+      "epoch": 1.5758039816232772,
+      "grad_norm": 0.130859375,
+      "learning_rate": 4.390243902439025e-05,
+      "loss": 0.03345699608325958,
+      "mean_token_accuracy": 0.9894634075462818,
+      "num_tokens": 884824.0,
+      "step": 129
+    },
+    {
+      "entropy": 0.22001465130597353,
+      "epoch": 1.5880551301684533,
+      "grad_norm": 0.115234375,
+      "learning_rate": 4.26829268292683e-05,
+      "loss": 0.033830057829618454,
+      "mean_token_accuracy": 0.9893862381577492,
+      "num_tokens": 891884.0,
+      "step": 130
+    },
+    {
+      "entropy": 0.24686648417264223,
+      "epoch": 1.6003062787136293,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 4.146341463414634e-05,
+      "loss": 0.03980773314833641,
+      "mean_token_accuracy": 0.9880322627723217,
+      "num_tokens": 898482.0,
+      "step": 131
+    },
+    {
+      "entropy": 0.24617432616651058,
+      "epoch": 1.6125574272588055,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 4.0243902439024395e-05,
+      "loss": 0.05706659331917763,
+      "mean_token_accuracy": 0.9778756387531757,
+      "num_tokens": 905832.0,
+      "step": 132
+    },
+    {
+      "entropy": 0.23000300489366055,
+      "epoch": 1.6248085758039816,
+      "grad_norm": 0.11328125,
+      "learning_rate": 3.9024390243902444e-05,
+      "loss": 0.04501105844974518,
+      "mean_token_accuracy": 0.9868872575461864,
+      "num_tokens": 912020.0,
+      "step": 133
+    },
+    {
+      "entropy": 0.23596840538084507,
+      "epoch": 1.6370597243491578,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 3.780487804878049e-05,
+      "loss": 0.057706866413354874,
+      "mean_token_accuracy": 0.9847077578306198,
+      "num_tokens": 918405.0,
+      "step": 134
+    },
+    {
+      "entropy": 0.25483656115829945,
+      "epoch": 1.649310872894334,
+      "grad_norm": 0.13671875,
+      "learning_rate": 3.6585365853658535e-05,
+      "loss": 0.06121227145195007,
+      "mean_token_accuracy": 0.9820427037775517,
+      "num_tokens": 925425.0,
+      "step": 135
+    },
+    {
+      "entropy": 0.2879916010424495,
+      "epoch": 1.66156202143951,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 3.5365853658536584e-05,
+      "loss": 0.03920814394950867,
+      "mean_token_accuracy": 0.9877970777451992,
+      "num_tokens": 930895.0,
+      "step": 136
+    },
+    {
+      "entropy": 0.2634068289771676,
+      "epoch": 1.673813169984686,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 3.414634146341464e-05,
+      "loss": 0.0386468879878521,
+      "mean_token_accuracy": 0.9877878725528717,
+      "num_tokens": 937389.0,
+      "step": 137
+    },
+    {
+      "entropy": 0.2587460493668914,
+      "epoch": 1.686064318529862,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 3.292682926829269e-05,
+      "loss": 0.034930720925331116,
+      "mean_token_accuracy": 0.9874115809798241,
+      "num_tokens": 945016.0,
+      "step": 138
+    },
+    {
+      "entropy": 0.26116301491856575,
+      "epoch": 1.6983154670750382,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 3.170731707317073e-05,
+      "loss": 0.033916398882865906,
+      "mean_token_accuracy": 0.9879581108689308,
+      "num_tokens": 951475.0,
+      "step": 139
+    },
+    {
+      "entropy": 0.2920989394187927,
+      "epoch": 1.7105666156202144,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 3.048780487804878e-05,
+      "loss": 0.03983481973409653,
+      "mean_token_accuracy": 0.9893253818154335,
+      "num_tokens": 957845.0,
+      "step": 140
+    },
+    {
+      "entropy": 0.23128231521695852,
+      "epoch": 1.7228177641653906,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 2.926829268292683e-05,
+      "loss": 0.051980044692754745,
+      "mean_token_accuracy": 0.9810708239674568,
+      "num_tokens": 964800.0,
+      "step": 141
+    },
+    {
+      "entropy": 0.25590753462165594,
+      "epoch": 1.7350689127105667,
+      "grad_norm": 0.171875,
+      "learning_rate": 2.8048780487804882e-05,
+      "loss": 0.04797530174255371,
+      "mean_token_accuracy": 0.98531124740839,
+      "num_tokens": 970216.0,
+      "step": 142
+    },
+    {
+      "entropy": 0.24975391291081905,
+      "epoch": 1.7473200612557427,
+      "grad_norm": 0.125,
+      "learning_rate": 2.682926829268293e-05,
+      "loss": 0.04290381073951721,
+      "mean_token_accuracy": 0.9855599775910378,
+      "num_tokens": 976676.0,
+      "step": 143
+    },
+    {
+      "entropy": 0.2620975775644183,
+      "epoch": 1.7595712098009189,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 2.5609756097560977e-05,
+      "loss": 0.04051404073834419,
+      "mean_token_accuracy": 0.9880265817046165,
+      "num_tokens": 984515.0,
+      "step": 144
+    },
+    {
+      "entropy": 0.2606674963608384,
+      "epoch": 1.7718223583460948,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 2.4390243902439026e-05,
+      "loss": 0.040870461612939835,
+      "mean_token_accuracy": 0.9857275933027267,
+      "num_tokens": 991262.0,
+      "step": 145
+    },
+    {
+      "entropy": 0.28423305694013834,
+      "epoch": 1.784073506891271,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 2.3170731707317075e-05,
+      "loss": 0.033665239810943604,
+      "mean_token_accuracy": 0.9889882653951645,
+      "num_tokens": 997598.0,
+      "step": 146
+    },
+    {
+      "entropy": 0.2581148808822036,
+      "epoch": 1.7963246554364471,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 2.1951219512195124e-05,
+      "loss": 0.04215285927057266,
+      "mean_token_accuracy": 0.985553503036499,
+      "num_tokens": 1005517.0,
+      "step": 147
+    },
+    {
+      "entropy": 0.27665852196514606,
+      "epoch": 1.8085758039816233,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 2.073170731707317e-05,
+      "loss": 0.036060549318790436,
+      "mean_token_accuracy": 0.9891492277383804,
+      "num_tokens": 1012208.0,
+      "step": 148
+    },
+    {
+      "entropy": 0.2916540242731571,
+      "epoch": 1.8208269525267995,
+      "grad_norm": 0.111328125,
+      "learning_rate": 1.9512195121951222e-05,
+      "loss": 0.03392140194773674,
+      "mean_token_accuracy": 0.9888332188129425,
+      "num_tokens": 1018126.0,
+      "step": 149
+    },
+    {
+      "entropy": 0.2635590499266982,
+      "epoch": 1.8330781010719757,
+      "grad_norm": 0.107421875,
+      "learning_rate": 1.8292682926829268e-05,
+      "loss": 0.03831010311841965,
+      "mean_token_accuracy": 0.9885501191020012,
+      "num_tokens": 1025618.0,
+      "step": 150
+    },
+    {
+      "epoch": 1.8330781010719757,
+      "eval_entropy": 0.2679487613664157,
+      "eval_loss": 0.06371253728866577,
+      "eval_mean_token_accuracy": 0.9774324289266614,
+      "eval_num_tokens": 1025618.0,
+      "eval_runtime": 61.7245,
+      "eval_samples_per_second": 1.118,
+      "eval_steps_per_second": 1.118,
+      "step": 150
+    },
+    {
+      "entropy": 0.24828598741441965,
+      "epoch": 1.8453292496171516,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 1.707317073170732e-05,
+      "loss": 0.04995449632406235,
+      "mean_token_accuracy": 0.9845831654965878,
+      "num_tokens": 1031490.0,
+      "step": 151
+    },
+    {
+      "entropy": 0.2742726355791092,
+      "epoch": 1.8575803981623276,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 1.5853658536585366e-05,
+      "loss": 0.03595530614256859,
+      "mean_token_accuracy": 0.9909501671791077,
+      "num_tokens": 1039322.0,
+      "step": 152
+    },
+    {
+      "entropy": 0.262394018471241,
+      "epoch": 1.8698315467075037,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 1.4634146341463415e-05,
+      "loss": 0.04870224744081497,
+      "mean_token_accuracy": 0.981577530503273,
+      "num_tokens": 1047516.0,
+      "step": 153
+    },
+    {
+      "entropy": 0.2567757312208414,
+      "epoch": 1.88208269525268,
+      "grad_norm": 0.130859375,
+      "learning_rate": 1.3414634146341466e-05,
+      "loss": 0.05070454254746437,
+      "mean_token_accuracy": 0.9822338744997978,
+      "num_tokens": 1055758.0,
+      "step": 154
+    },
+    {
+      "entropy": 0.27977011259645224,
+      "epoch": 1.894333843797856,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 1.2195121951219513e-05,
+      "loss": 0.048383209854364395,
+      "mean_token_accuracy": 0.9842112176120281,
+      "num_tokens": 1062986.0,
+      "step": 155
+    },
+    {
+      "entropy": 0.27395711559802294,
+      "epoch": 1.9065849923430322,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 1.0975609756097562e-05,
+      "loss": 0.03616052120923996,
+      "mean_token_accuracy": 0.9874880388379097,
+      "num_tokens": 1069566.0,
+      "step": 156
+    },
+    {
+      "entropy": 0.2608653483912349,
+      "epoch": 1.9188361408882084,
+      "grad_norm": 0.11328125,
+      "learning_rate": 9.756097560975611e-06,
+      "loss": 0.0475415475666523,
+      "mean_token_accuracy": 0.9874128215014935,
+      "num_tokens": 1077129.0,
+      "step": 157
+    },
+    {
+      "entropy": 0.2818295741453767,
+      "epoch": 1.9310872894333844,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 8.53658536585366e-06,
+      "loss": 0.03717351332306862,
+      "mean_token_accuracy": 0.9891585931181908,
+      "num_tokens": 1083892.0,
+      "step": 158
+    },
+    {
+      "entropy": 0.25744000263512135,
+      "epoch": 1.9433384379785605,
+      "grad_norm": 0.091796875,
+      "learning_rate": 7.317073170731707e-06,
+      "loss": 0.033981338143348694,
+      "mean_token_accuracy": 0.9918354228138924,
+      "num_tokens": 1089984.0,
+      "step": 159
+    },
+    {
+      "entropy": 0.2630167668685317,
+      "epoch": 1.9555895865237365,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 6.0975609756097564e-06,
+      "loss": 0.03182641789317131,
+      "mean_token_accuracy": 0.9890161380171776,
+      "num_tokens": 1096216.0,
+      "step": 160
+    },
+    {
+      "entropy": 0.2613411508500576,
+      "epoch": 1.9678407350689127,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 4.8780487804878055e-06,
+      "loss": 0.04600515589118004,
+      "mean_token_accuracy": 0.986037340015173,
+      "num_tokens": 1104876.0,
+      "step": 161
+    },
+    {
+      "entropy": 0.263193734921515,
+      "epoch": 1.9800918836140888,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 3.6585365853658537e-06,
+      "loss": 0.04238295182585716,
+      "mean_token_accuracy": 0.9874502159655094,
+      "num_tokens": 1111427.0,
+      "step": 162
+    },
+    {
+      "entropy": 0.2944263191893697,
+      "epoch": 1.992343032159265,
+      "grad_norm": 0.078125,
+      "learning_rate": 2.4390243902439027e-06,
+      "loss": 0.02235364355146885,
+      "mean_token_accuracy": 0.9943292774260044,
+      "num_tokens": 1118364.0,
+      "step": 163
+    },
+    {
+      "entropy": 0.2737159162759781,
+      "epoch": 2.0,
+      "grad_norm": 0.16796875,
+      "learning_rate": 1.2195121951219514e-06,
+      "loss": 0.05316146835684776,
+      "mean_token_accuracy": 0.985767412185669,
+      "num_tokens": 1123100.0,
+      "step": 164
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 164,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.08553227640832e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-164/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f82b2b4e6dbb457dcbeae9dcaee24c59b51d560b24513b0a9e0c20c6b4ff0bda
+size 5585