stefanocarrera committed on Feb 9

Commit

0a52a0c

verified ·

1 Parent(s): f2a3aaa

Upload folder using huggingface_hub

Browse files

Files changed (29) hide show

.gitattributes +3 -0
README.md +62 -0
adapter_config.json +46 -0
adapter_model.safetensors +3 -0
chat_template.jinja +109 -0
checkpoint-150/README.md +209 -0
checkpoint-150/adapter_config.json +46 -0
checkpoint-150/adapter_model.safetensors +3 -0
checkpoint-150/chat_template.jinja +109 -0
checkpoint-150/optimizer.pt +3 -0
checkpoint-150/rng_state.pth +3 -0
checkpoint-150/scheduler.pt +3 -0
checkpoint-150/tokenizer.json +3 -0
checkpoint-150/tokenizer_config.json +14 -0
checkpoint-150/trainer_state.json +1567 -0
checkpoint-150/training_args.bin +3 -0
checkpoint-164/README.md +209 -0
checkpoint-164/adapter_config.json +46 -0
checkpoint-164/adapter_model.safetensors +3 -0
checkpoint-164/chat_template.jinja +109 -0
checkpoint-164/optimizer.pt +3 -0
checkpoint-164/rng_state.pth +3 -0
checkpoint-164/scheduler.pt +3 -0
checkpoint-164/tokenizer.json +3 -0
checkpoint-164/tokenizer_config.json +14 -0
checkpoint-164/trainer_state.json +1707 -0
checkpoint-164/training_args.bin +3 -0
tokenizer.json +3 -0
tokenizer_config.json +14 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-150/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-164/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,62 @@

+---
+base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
+library_name: peft
+model_name: adapters
+tags:
+- base_model:adapter:meta-llama/Meta-Llama-3.1-8B-Instruct
+- lora
+- sft
+- transformers
+- trl
+license: llama3.1
+pipeline_tag: text-generation
+---
+# Model Card for adapters
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="<hub-username>/<adapter-repo>", device="cuda")  # replace the placeholder with this repository's Hub id
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+This model was trained with SFT.
+### Framework versions
+- PEFT: 0.18.1
+- TRL: 0.27.2
+- Transformers: 5.0.0
+- Pytorch: 2.7.1+cu118
+- Datasets: 4.5.0
+- Tokenizers: 0.22.2
+## Citations
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "up_proj",
+    "o_proj",
+    "down_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e1cfd52fdf227b7e674c9a251aff7c304abf9a7a8919fce7857b076b93ca8e43
+size 83946192

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,109 @@

+{{- bos_token }}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+    {%- set date_string = "26 Jul 2024" %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+{#- System message + builtin tools #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if builtin_tools is defined or tools is not none %}
+    {{- "Environment: ipython\n" }}
+{%- endif %}
+{%- if builtin_tools is defined %}
+    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content']|trim %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+            {%- for arg_name, arg_val in tool_call.arguments | items %}
+                {{- arg_name + '="' + arg_val + '"' }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+                {%- endfor %}
+            {{- ")" }}
+        {%- else  %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- '{"name": "' + tool_call.name + '", ' }}
+            {{- '"parameters": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- "}" }}
+        {%- endif %}
+        {%- if builtin_tools is defined %}
+            {#- This means we're in ipython mode #}
+            {{- "<|eom_id|>" }}
+        {%- else %}
+            {{- "<|eot_id|>" }}
+        {%- endif %}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+        {%- if message.content is mapping or message.content is iterable %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}

checkpoint-150/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:meta-llama/Meta-Llama-3.1-8B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoint-150/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "up_proj",
+    "o_proj",
+    "down_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-150/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e14cd85efc6181de7bf67dfba627ecc3a8e85902faa37a8e88f8f7ad6ddaf4c4
+size 83946192

checkpoint-150/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,109 @@

+{{- bos_token }}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+    {%- set date_string = "26 Jul 2024" %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+{#- System message + builtin tools #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if builtin_tools is defined or tools is not none %}
+    {{- "Environment: ipython\n" }}
+{%- endif %}
+{%- if builtin_tools is defined %}
+    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content']|trim %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+            {%- for arg_name, arg_val in tool_call.arguments | items %}
+                {{- arg_name + '="' + arg_val + '"' }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+                {%- endfor %}
+            {{- ")" }}
+        {%- else  %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- '{"name": "' + tool_call.name + '", ' }}
+            {{- '"parameters": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- "}" }}
+        {%- endif %}
+        {%- if builtin_tools is defined %}
+            {#- This means we're in ipython mode #}
+            {{- "<|eom_id|>" }}
+        {%- else %}
+            {{- "<|eot_id|>" }}
+        {%- endif %}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+        {%- if message.content is mapping or message.content is iterable %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}

checkpoint-150/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2971f43ea71fcc46b772be99c7c59c284402d1ee484c1b80d229b38c50998252
+size 85728997

checkpoint-150/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:718a0f3db00824213036a2c0441849791319b7d9cf189065873bb26a7020738e
+size 14645

checkpoint-150/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:969e38d9df417d5c2e205dc9478ec3ed9bbd0fb36ac227be3ef304a3ef428a63
+size 1465

checkpoint-150/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920

checkpoint-150/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eot_id|>",
+  "is_local": false,
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|eot_id|>",
+  "tokenizer_class": "TokenizersBackend"
+}

checkpoint-150/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1567 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.8330781010719757,
+  "eval_steps": 50,
+  "global_step": 150,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 0.32083135563880205,
+      "epoch": 0.01225114854517611,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0002,
+      "loss": 0.019214527681469917,
+      "mean_token_accuracy": 0.9918519593775272,
+      "num_tokens": 6092.0,
+      "step": 1
+    },
+    {
+      "entropy": 0.3576695416122675,
+      "epoch": 0.02450229709035222,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.00019878048780487805,
+      "loss": 0.03324645012617111,
+      "mean_token_accuracy": 0.988272774964571,
+      "num_tokens": 11535.0,
+      "step": 2
+    },
+    {
+      "entropy": 0.33352363388985395,
+      "epoch": 0.036753445635528334,
+      "grad_norm": 0.0272216796875,
+      "learning_rate": 0.0001975609756097561,
+      "loss": 0.0017091021873056889,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 16432.0,
+      "step": 3
+    },
+    {
+      "entropy": 0.35098350048065186,
+      "epoch": 0.04900459418070444,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.00019634146341463416,
+      "loss": 0.00414489908143878,
+      "mean_token_accuracy": 0.9985632188618183,
+      "num_tokens": 20507.0,
+      "step": 4
+    },
+    {
+      "entropy": 0.3005372080951929,
+      "epoch": 0.06125574272588055,
+      "grad_norm": 0.01416015625,
+      "learning_rate": 0.0001951219512195122,
+      "loss": 0.0008560216519981623,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 26122.0,
+      "step": 5
+    },
+    {
+      "entropy": 0.3177621979266405,
+      "epoch": 0.07350689127105667,
+      "grad_norm": 0.008544921875,
+      "learning_rate": 0.00019390243902439025,
+      "loss": 0.0005585744511336088,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 30847.0,
+      "step": 6
+    },
+    {
+      "entropy": 0.27754624653607607,
+      "epoch": 0.08575803981623277,
+      "grad_norm": 0.019775390625,
+      "learning_rate": 0.0001926829268292683,
+      "loss": 0.0012820134870707989,
+      "mean_token_accuracy": 0.9998413696885109,
+      "num_tokens": 36541.0,
+      "step": 7
+    },
+    {
+      "entropy": 0.30307829193770885,
+      "epoch": 0.09800918836140889,
+      "grad_norm": 0.004364013671875,
+      "learning_rate": 0.00019146341463414633,
+      "loss": 0.0003136860905215144,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 41001.0,
+      "step": 8
+    },
+    {
+      "entropy": 0.31226138956844807,
+      "epoch": 0.11026033690658499,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001902439024390244,
+      "loss": 0.006275261752307415,
+      "mean_token_accuracy": 0.9993216060101986,
+      "num_tokens": 45467.0,
+      "step": 9
+    },
+    {
+      "entropy": 0.2779384208843112,
+      "epoch": 0.1225114854517611,
+      "grad_norm": 0.011474609375,
+      "learning_rate": 0.00018902439024390244,
+      "loss": 0.0006869531353004277,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 50478.0,
+      "step": 10
+    },
+    {
+      "entropy": 0.27587867714464664,
+      "epoch": 0.13476263399693722,
+      "grad_norm": 0.00188446044921875,
+      "learning_rate": 0.0001878048780487805,
+      "loss": 0.0001916390028782189,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 56181.0,
+      "step": 11
+    },
+    {
+      "entropy": 0.2948900917544961,
+      "epoch": 0.14701378254211334,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.00018658536585365856,
+      "loss": 0.001886777114123106,
+      "mean_token_accuracy": 0.9998650103807449,
+      "num_tokens": 62946.0,
+      "step": 12
+    },
+    {
+      "entropy": 0.29555963445454836,
+      "epoch": 0.15926493108728942,
+      "grad_norm": 0.005523681640625,
+      "learning_rate": 0.0001853658536585366,
+      "loss": 0.00017441912495996803,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 68436.0,
+      "step": 13
+    },
+    {
+      "entropy": 0.287986209616065,
+      "epoch": 0.17151607963246554,
+      "grad_norm": 0.02001953125,
+      "learning_rate": 0.00018414634146341464,
+      "loss": 0.00017802949878387153,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 73603.0,
+      "step": 14
+    },
+    {
+      "entropy": 0.3127295421436429,
+      "epoch": 0.18376722817764166,
+      "grad_norm": 0.06787109375,
+      "learning_rate": 0.0001829268292682927,
+      "loss": 0.0010371531825512648,
+      "mean_token_accuracy": 0.9995941556990147,
+      "num_tokens": 77845.0,
+      "step": 15
+    },
+    {
+      "entropy": 0.2922206539660692,
+      "epoch": 0.19601837672281777,
+      "grad_norm": 0.00118255615234375,
+      "learning_rate": 0.00018170731707317075,
+      "loss": 0.00011905122664757073,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 82744.0,
+      "step": 16
+    },
+    {
+      "entropy": 0.2928574001416564,
+      "epoch": 0.2082695252679939,
+      "grad_norm": 0.0003719329833984375,
+      "learning_rate": 0.0001804878048780488,
+      "loss": 7.616190850967541e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 87453.0,
+      "step": 17
+    },
+    {
+      "entropy": 0.2979039028286934,
+      "epoch": 0.22052067381316998,
+      "grad_norm": 0.0026702880859375,
+      "learning_rate": 0.00017926829268292684,
+      "loss": 0.00012367898307275027,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 92321.0,
+      "step": 18
+    },
+    {
+      "entropy": 0.31858293898403645,
+      "epoch": 0.2327718223583461,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.00017804878048780488,
+      "loss": 0.0006579139153473079,
+      "mean_token_accuracy": 0.9997499994933605,
+      "num_tokens": 97146.0,
+      "step": 19
+    },
+    {
+      "entropy": 0.30853591673076153,
+      "epoch": 0.2450229709035222,
+      "grad_norm": 0.004364013671875,
+      "learning_rate": 0.00017682926829268295,
+      "loss": 0.00014281428593676537,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 101943.0,
+      "step": 20
+    },
+    {
+      "entropy": 0.34037051256746054,
+      "epoch": 0.2572741194486983,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.000175609756097561,
+      "loss": 0.011726096272468567,
+      "mean_token_accuracy": 0.9993422217667103,
+      "num_tokens": 106772.0,
+      "step": 21
+    },
+    {
+      "entropy": 0.29644382931292057,
+      "epoch": 0.26952526799387444,
+      "grad_norm": 0.0023193359375,
+      "learning_rate": 0.00017439024390243903,
+      "loss": 0.00010672100324882194,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 112558.0,
+      "step": 22
+    },
+    {
+      "entropy": 0.3180191367864609,
+      "epoch": 0.28177641653905056,
+      "grad_norm": 0.000675201416015625,
+      "learning_rate": 0.00017317073170731708,
+      "loss": 9.894849790725857e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 117489.0,
+      "step": 23
+    },
+    {
+      "entropy": 0.32946281880140305,
+      "epoch": 0.29402756508422667,
+      "grad_norm": 0.0242919921875,
+      "learning_rate": 0.00017195121951219512,
+      "loss": 0.0029232932720333338,
+      "mean_token_accuracy": 0.9996279776096344,
+      "num_tokens": 123010.0,
+      "step": 24
+    },
+    {
+      "entropy": 0.3180750487372279,
+      "epoch": 0.30627871362940273,
+      "grad_norm": 0.038330078125,
+      "learning_rate": 0.0001707317073170732,
+      "loss": 0.0015810562763363123,
+      "mean_token_accuracy": 0.9990344606339931,
+      "num_tokens": 127716.0,
+      "step": 25
+    },
+    {
+      "entropy": 0.31262985058128834,
+      "epoch": 0.31852986217457885,
+      "grad_norm": 0.0027313232421875,
+      "learning_rate": 0.00016951219512195123,
+      "loss": 0.00019670175970532,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 132372.0,
+      "step": 26
+    },
+    {
+      "entropy": 0.2831157138571143,
+      "epoch": 0.33078101071975496,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.00016829268292682927,
+      "loss": 0.003187144873663783,
+      "mean_token_accuracy": 0.9994877055287361,
+      "num_tokens": 137028.0,
+      "step": 27
+    },
+    {
+      "entropy": 0.3106652954593301,
+      "epoch": 0.3430321592649311,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.00016707317073170731,
+      "loss": 0.004998125601559877,
+      "mean_token_accuracy": 0.9980670101940632,
+      "num_tokens": 142088.0,
+      "step": 28
+    },
+    {
+      "entropy": 0.31454288959503174,
+      "epoch": 0.3552833078101072,
+      "grad_norm": 0.0306396484375,
+      "learning_rate": 0.00016585365853658536,
+      "loss": 0.000461318384623155,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 147481.0,
+      "step": 29
+    },
+    {
+      "entropy": 0.33650430012494326,
+      "epoch": 0.3675344563552833,
+      "grad_norm": 0.0238037109375,
+      "learning_rate": 0.00016463414634146343,
+      "loss": 0.0005614800029434264,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 152973.0,
+      "step": 30
+    },
+    {
+      "entropy": 0.33513325452804565,
+      "epoch": 0.37978560490045943,
+      "grad_norm": 0.00604248046875,
+      "learning_rate": 0.00016341463414634147,
+      "loss": 0.00020872258755844086,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 156786.0,
+      "step": 31
+    },
+    {
+      "entropy": 0.34442581795156,
+      "epoch": 0.39203675344563554,
+      "grad_norm": 0.0159912109375,
+      "learning_rate": 0.00016219512195121954,
+      "loss": 0.00043797443504445255,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 162859.0,
+      "step": 32
+    },
+    {
+      "entropy": 0.34709672816097736,
+      "epoch": 0.40428790199081166,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.00016097560975609758,
+      "loss": 0.0008612321689724922,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 167969.0,
+      "step": 33
+    },
+    {
+      "entropy": 0.31636961828917265,
+      "epoch": 0.4165390505359878,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.00015975609756097562,
+      "loss": 0.001623529358766973,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 172518.0,
+      "step": 34
+    },
+    {
+      "entropy": 0.341240718960762,
+      "epoch": 0.42879019908116384,
+      "grad_norm": 0.0089111328125,
+      "learning_rate": 0.00015853658536585366,
+      "loss": 0.0004598334198817611,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 177085.0,
+      "step": 35
+    },
+    {
+      "entropy": 0.3331515807658434,
+      "epoch": 0.44104134762633995,
+      "grad_norm": 0.0137939453125,
+      "learning_rate": 0.00015731707317073173,
+      "loss": 0.00047711117076687515,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 181617.0,
+      "step": 36
+    },
+    {
+      "entropy": 0.2969168536365032,
+      "epoch": 0.45329249617151607,
+      "grad_norm": 0.0296630859375,
+      "learning_rate": 0.00015609756097560978,
+      "loss": 0.0018673602025955915,
+      "mean_token_accuracy": 0.9982142858207226,
+      "num_tokens": 186836.0,
+      "step": 37
+    },
+    {
+      "entropy": 0.3208611598238349,
+      "epoch": 0.4655436447166922,
+      "grad_norm": 0.0034027099609375,
+      "learning_rate": 0.00015487804878048782,
+      "loss": 0.00018661899957805872,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 191224.0,
+      "step": 38
+    },
+    {
+      "entropy": 0.296407300978899,
+      "epoch": 0.4777947932618683,
+      "grad_norm": 0.003570556640625,
+      "learning_rate": 0.00015365853658536586,
+      "loss": 0.0001632017083466053,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 195926.0,
+      "step": 39
+    },
+    {
+      "entropy": 0.32142599392682314,
+      "epoch": 0.4900459418070444,
+      "grad_norm": 0.0277099609375,
+      "learning_rate": 0.0001524390243902439,
+      "loss": 0.0039696223102509975,
+      "mean_token_accuracy": 0.9992866478860378,
+      "num_tokens": 200772.0,
+      "step": 40
+    },
+    {
+      "entropy": 0.3037592498585582,
+      "epoch": 0.5022970903522205,
+      "grad_norm": 0.0026092529296875,
+      "learning_rate": 0.00015121951219512197,
+      "loss": 0.00013867147208657116,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 204499.0,
+      "step": 41
+    },
+    {
+      "entropy": 0.31665132474154234,
+      "epoch": 0.5145482388973966,
+      "grad_norm": 0.004730224609375,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.00025882094632834196,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 208814.0,
+      "step": 42
+    },
+    {
+      "entropy": 0.33023010194301605,
+      "epoch": 0.5267993874425727,
+      "grad_norm": 0.001922607421875,
+      "learning_rate": 0.00014878048780487806,
+      "loss": 0.00019074659212492406,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 213907.0,
+      "step": 43
+    },
+    {
+      "entropy": 0.334543508477509,
+      "epoch": 0.5390505359877489,
+      "grad_norm": 0.0018157958984375,
+      "learning_rate": 0.0001475609756097561,
+      "loss": 0.00011566472676349804,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 218988.0,
+      "step": 44
+    },
+    {
+      "entropy": 0.3078083451837301,
+      "epoch": 0.5513016845329249,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.00014634146341463414,
+      "loss": 0.0022110757417976856,
+      "mean_token_accuracy": 0.9987903237342834,
+      "num_tokens": 223595.0,
+      "step": 45
+    },
+    {
+      "entropy": 0.32667472772300243,
+      "epoch": 0.5635528330781011,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001451219512195122,
+      "loss": 0.0010719874408096075,
+      "mean_token_accuracy": 0.9991953931748867,
+      "num_tokens": 228244.0,
+      "step": 46
+    },
+    {
+      "entropy": 0.3273861287161708,
+      "epoch": 0.5758039816232772,
+      "grad_norm": 0.00057220458984375,
+      "learning_rate": 0.00014390243902439025,
+      "loss": 6.594268779736012e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 232606.0,
+      "step": 47
+    },
+    {
+      "entropy": 0.31728990003466606,
+      "epoch": 0.5880551301684533,
+      "grad_norm": 0.0003185272216796875,
+      "learning_rate": 0.0001426829268292683,
+      "loss": 8.574798266636208e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 236563.0,
+      "step": 48
+    },
+    {
+      "entropy": 0.34826087579131126,
+      "epoch": 0.6003062787136294,
+      "grad_norm": 0.00390625,
+      "learning_rate": 0.00014146341463414634,
+      "loss": 0.00015243196685332805,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 241214.0,
+      "step": 49
+    },
+    {
+      "entropy": 0.3367287954315543,
+      "epoch": 0.6125574272588055,
+      "grad_norm": 0.003265380859375,
+      "learning_rate": 0.00014024390243902438,
+      "loss": 0.0001341242023045197,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 245200.0,
+      "step": 50
+    },
+    {
+      "epoch": 0.6125574272588055,
+      "eval_entropy": 0.3212364659361217,
+      "eval_loss": 0.0014040147652849555,
+      "eval_mean_token_accuracy": 0.9998166846192401,
+      "eval_num_tokens": 245200.0,
+      "eval_runtime": 51.1353,
+      "eval_samples_per_second": 1.349,
+      "eval_steps_per_second": 1.349,
+      "step": 50
+    },
+    {
+      "entropy": 0.3274610061198473,
+      "epoch": 0.6248085758039816,
+      "grad_norm": 0.000518798828125,
+      "learning_rate": 0.00013902439024390245,
+      "loss": 6.213193410076201e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 249761.0,
+      "step": 51
+    },
+    {
+      "entropy": 0.3302043145522475,
+      "epoch": 0.6370597243491577,
+      "grad_norm": 0.00067901611328125,
+      "learning_rate": 0.0001378048780487805,
+      "loss": 7.391967665171251e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 254787.0,
+      "step": 52
+    },
+    {
+      "entropy": 0.3345805983990431,
+      "epoch": 0.6493108728943339,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.00013658536585365856,
+      "loss": 0.008045142516493797,
+      "mean_token_accuracy": 0.9975476562976837,
+      "num_tokens": 260287.0,
+      "step": 53
+    },
+    {
+      "entropy": 0.3093695640563965,
+      "epoch": 0.6615620214395099,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001353658536585366,
+      "loss": 0.0016300748102366924,
+      "mean_token_accuracy": 0.9998249299824238,
+      "num_tokens": 264810.0,
+      "step": 54
+    },
+    {
+      "entropy": 0.33090174850076437,
+      "epoch": 0.6738131699846861,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.00013414634146341464,
+      "loss": 0.0037348291371017694,
+      "mean_token_accuracy": 0.9990433678030968,
+      "num_tokens": 270386.0,
+      "step": 55
+    },
+    {
+      "entropy": 0.3455248447135091,
+      "epoch": 0.6860643185298622,
+      "grad_norm": 0.0301513671875,
+      "learning_rate": 0.0001329268292682927,
+      "loss": 0.0006253286846913397,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 274391.0,
+      "step": 56
+    },
+    {
+      "entropy": 0.3408086858689785,
+      "epoch": 0.6983154670750383,
+      "grad_norm": 0.0033111572265625,
+      "learning_rate": 0.00013170731707317076,
+      "loss": 0.00020847572886850685,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 279716.0,
+      "step": 57
+    },
+    {
+      "entropy": 0.29423840064555407,
+      "epoch": 0.7105666156202144,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001304878048780488,
+      "loss": 0.005600863602012396,
+      "mean_token_accuracy": 0.998680267482996,
+      "num_tokens": 285404.0,
+      "step": 58
+    },
+    {
+      "entropy": 0.33689095824956894,
+      "epoch": 0.7228177641653905,
+      "grad_norm": 0.057861328125,
+      "learning_rate": 0.00012926829268292684,
+      "loss": 0.009100214578211308,
+      "mean_token_accuracy": 0.9967310577630997,
+      "num_tokens": 290021.0,
+      "step": 59
+    },
+    {
+      "entropy": 0.3336018780246377,
+      "epoch": 0.7350689127105666,
+      "grad_norm": 0.005889892578125,
+      "learning_rate": 0.00012804878048780488,
+      "loss": 0.00015729578444734216,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 294890.0,
+      "step": 60
+    },
+    {
+      "entropy": 0.30060291569679976,
+      "epoch": 0.7473200612557427,
+      "grad_norm": 0.0172119140625,
+      "learning_rate": 0.00012682926829268293,
+      "loss": 0.00039864826248958707,
+      "mean_token_accuracy": 0.9993686862289906,
+      "num_tokens": 300384.0,
+      "step": 61
+    },
+    {
+      "entropy": 0.36021818965673447,
+      "epoch": 0.7595712098009189,
+      "grad_norm": 0.0025634765625,
+      "learning_rate": 0.000125609756097561,
+      "loss": 0.00016568033606745303,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 305805.0,
+      "step": 62
+    },
+    {
+      "entropy": 0.32536453381180763,
+      "epoch": 0.7718223583460949,
+      "grad_norm": 0.001800537109375,
+      "learning_rate": 0.00012439024390243904,
+      "loss": 0.00014585268218070269,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 310233.0,
+      "step": 63
+    },
+    {
+      "entropy": 0.31967335008084774,
+      "epoch": 0.7840735068912711,
+      "grad_norm": 0.0010223388671875,
+      "learning_rate": 0.00012317073170731708,
+      "loss": 0.00010060967179015279,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 314234.0,
+      "step": 64
+    },
+    {
+      "entropy": 0.34358128905296326,
+      "epoch": 0.7963246554364471,
+      "grad_norm": 0.000743865966796875,
+      "learning_rate": 0.00012195121951219512,
+      "loss": 9.478208812652156e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 319186.0,
+      "step": 65
+    },
+    {
+      "entropy": 0.33988895174115896,
+      "epoch": 0.8085758039816233,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.00012073170731707318,
+      "loss": 0.0011607923079282045,
+      "mean_token_accuracy": 0.9995629377663136,
+      "num_tokens": 324710.0,
+      "step": 66
+    },
+    {
+      "entropy": 0.3078791871666908,
+      "epoch": 0.8208269525267994,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.00011951219512195122,
+      "loss": 0.016102174296975136,
+      "mean_token_accuracy": 0.9935315921902657,
+      "num_tokens": 329942.0,
+      "step": 67
+    },
+    {
+      "entropy": 0.3587793167680502,
+      "epoch": 0.8330781010719756,
+      "grad_norm": 0.002716064453125,
+      "learning_rate": 0.00011829268292682926,
+      "loss": 0.0001911829021992162,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 334487.0,
+      "step": 68
+    },
+    {
+      "entropy": 0.360817888751626,
+      "epoch": 0.8453292496171516,
+      "grad_norm": 0.003753662109375,
+      "learning_rate": 0.00011707317073170732,
+      "loss": 0.00026575953233987093,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 338184.0,
+      "step": 69
+    },
+    {
+      "entropy": 0.3788213599473238,
+      "epoch": 0.8575803981623277,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.00011585365853658536,
+      "loss": 0.007251895032823086,
+      "mean_token_accuracy": 0.997805867344141,
+      "num_tokens": 342594.0,
+      "step": 70
+    },
+    {
+      "entropy": 0.37989665009081364,
+      "epoch": 0.8698315467075038,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.00011463414634146342,
+      "loss": 0.001519644632935524,
+      "mean_token_accuracy": 0.9997807033360004,
+      "num_tokens": 347798.0,
+      "step": 71
+    },
+    {
+      "entropy": 0.35538383200764656,
+      "epoch": 0.8820826952526799,
+      "grad_norm": 0.0038604736328125,
+      "learning_rate": 0.00011341463414634146,
+      "loss": 0.00030194621649570763,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 352122.0,
+      "step": 72
+    },
+    {
+      "entropy": 0.36578258499503136,
+      "epoch": 0.8943338437978561,
+      "grad_norm": 0.02001953125,
+      "learning_rate": 0.00011219512195121953,
+      "loss": 0.0018432819051668048,
+      "mean_token_accuracy": 0.9997568093240261,
+      "num_tokens": 357944.0,
+      "step": 73
+    },
+    {
+      "entropy": 0.3363148244097829,
+      "epoch": 0.9065849923430321,
+      "grad_norm": 0.01214599609375,
+      "learning_rate": 0.00011097560975609757,
+      "loss": 0.0004945008549839258,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 363815.0,
+      "step": 74
+    },
+    {
+      "entropy": 0.3567014401778579,
+      "epoch": 0.9188361408882083,
+      "grad_norm": 0.00160980224609375,
+      "learning_rate": 0.00010975609756097563,
+      "loss": 0.0002087215252686292,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 368871.0,
+      "step": 75
+    },
+    {
+      "entropy": 0.3798025632277131,
+      "epoch": 0.9310872894333844,
+      "grad_norm": 0.0242919921875,
+      "learning_rate": 0.00010853658536585367,
+      "loss": 0.0011810146970674396,
+      "mean_token_accuracy": 0.999143835157156,
+      "num_tokens": 373671.0,
+      "step": 76
+    },
+    {
+      "entropy": 0.3385667558759451,
+      "epoch": 0.9433384379785605,
+      "grad_norm": 0.00164031982421875,
+      "learning_rate": 0.00010731707317073172,
+      "loss": 0.00021391667542047799,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 379038.0,
+      "step": 77
+    },
+    {
+      "entropy": 0.37137152813374996,
+      "epoch": 0.9555895865237366,
+      "grad_norm": 0.0194091796875,
+      "learning_rate": 0.00010609756097560977,
+      "loss": 0.0009015509858727455,
+      "mean_token_accuracy": 0.9992977529764175,
+      "num_tokens": 384253.0,
+      "step": 78
+    },
+    {
+      "entropy": 0.35634181648492813,
+      "epoch": 0.9678407350689127,
+      "grad_norm": 0.002349853515625,
+      "learning_rate": 0.00010487804878048781,
+      "loss": 0.0003007323248311877,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 388348.0,
+      "step": 79
+    },
+    {
+      "entropy": 0.3363165808841586,
+      "epoch": 0.9800918836140888,
+      "grad_norm": 0.013916015625,
+      "learning_rate": 0.00010365853658536586,
+      "loss": 0.0015124119818210602,
+      "mean_token_accuracy": 0.999507874250412,
+      "num_tokens": 394214.0,
+      "step": 80
+    },
+    {
+      "entropy": 0.34769035689532757,
+      "epoch": 0.9923430321592649,
+      "grad_norm": 0.0264892578125,
+      "learning_rate": 0.0001024390243902439,
+      "loss": 0.0008837911300361156,
+      "mean_token_accuracy": 0.9992187507450581,
+      "num_tokens": 399114.0,
+      "step": 81
+    },
+    {
+      "entropy": 0.34723484665155413,
+      "epoch": 1.0,
+      "grad_norm": 0.002288818359375,
+      "learning_rate": 0.00010121951219512196,
+      "loss": 0.0002318796032341197,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 402130.0,
+      "step": 82
+    },
+    {
+      "entropy": 0.3677198924124241,
+      "epoch": 1.0122511485451762,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0028767124749720097,
+      "mean_token_accuracy": 0.9997509978711605,
+      "num_tokens": 406761.0,
+      "step": 83
+    },
+    {
+      "entropy": 0.3296260507777333,
+      "epoch": 1.0245022970903521,
+      "grad_norm": 0.0016326904296875,
+      "learning_rate": 9.878048780487805e-05,
+      "loss": 0.00020801745995413512,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 411367.0,
+      "step": 84
+    },
+    {
+      "entropy": 0.36815651040524244,
+      "epoch": 1.0367534456355283,
+      "grad_norm": 0.00299072265625,
+      "learning_rate": 9.75609756097561e-05,
+      "loss": 0.00034169916762039065,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 417768.0,
+      "step": 85
+    },
+    {
+      "entropy": 0.33015719801187515,
+      "epoch": 1.0490045941807045,
+      "grad_norm": 0.0019683837890625,
+      "learning_rate": 9.634146341463415e-05,
+      "loss": 0.0002285851223859936,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 421738.0,
+      "step": 86
+    },
+    {
+      "entropy": 0.33297139778733253,
+      "epoch": 1.0612557427258806,
+      "grad_norm": 0.0003604888916015625,
+      "learning_rate": 9.51219512195122e-05,
+      "loss": 0.00012145948858233169,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 426854.0,
+      "step": 87
+    },
+    {
+      "entropy": 0.4070947393774986,
+      "epoch": 1.0735068912710566,
+      "grad_norm": 0.017333984375,
+      "learning_rate": 9.390243902439024e-05,
+      "loss": 0.0016109611606225371,
+      "mean_token_accuracy": 0.9998486675322056,
+      "num_tokens": 431083.0,
+      "step": 88
+    },
+    {
+      "entropy": 0.3781026881188154,
+      "epoch": 1.0857580398162328,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 9.26829268292683e-05,
+      "loss": 0.003159651067107916,
+      "mean_token_accuracy": 0.9989801794290543,
+      "num_tokens": 435694.0,
+      "step": 89
+    },
+    {
+      "entropy": 0.3439221568405628,
+      "epoch": 1.098009188361409,
+      "grad_norm": 0.000949859619140625,
+      "learning_rate": 9.146341463414635e-05,
+      "loss": 0.00018103225738741457,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 440578.0,
+      "step": 90
+    },
+    {
+      "entropy": 0.38779534585773945,
+      "epoch": 1.110260336906585,
+      "grad_norm": 0.0142822265625,
+      "learning_rate": 9.02439024390244e-05,
+      "loss": 0.002015941310673952,
+      "mean_token_accuracy": 0.9984939768910408,
+      "num_tokens": 445238.0,
+      "step": 91
+    },
+    {
+      "entropy": 0.3697750475257635,
+      "epoch": 1.122511485451761,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 8.902439024390244e-05,
+      "loss": 0.006127167027443647,
+      "mean_token_accuracy": 0.9989957921206951,
+      "num_tokens": 449993.0,
+      "step": 92
+    },
+    {
+      "entropy": 0.34917816519737244,
+      "epoch": 1.1347626339969372,
+      "grad_norm": 0.0037384033203125,
+      "learning_rate": 8.78048780487805e-05,
+      "loss": 0.00024314325128216296,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 454976.0,
+      "step": 93
+    },
+    {
+      "entropy": 0.3524725306779146,
+      "epoch": 1.1470137825421134,
+      "grad_norm": 0.00104522705078125,
+      "learning_rate": 8.658536585365854e-05,
+      "loss": 0.00014462518447544426,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 459671.0,
+      "step": 94
+    },
+    {
+      "entropy": 0.3524913527071476,
+      "epoch": 1.1592649310872893,
+      "grad_norm": 0.000782012939453125,
+      "learning_rate": 8.53658536585366e-05,
+      "loss": 0.0001363266637781635,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 464310.0,
+      "step": 95
+    },
+    {
+      "entropy": 0.33474782202392817,
+      "epoch": 1.1715160796324655,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 8.414634146341464e-05,
+      "loss": 0.006995758973062038,
+      "mean_token_accuracy": 0.997385773807764,
+      "num_tokens": 468855.0,
+      "step": 96
+    },
+    {
+      "entropy": 0.34024662896990776,
+      "epoch": 1.1837672281776417,
+      "grad_norm": 0.000762939453125,
+      "learning_rate": 8.292682926829268e-05,
+      "loss": 0.00012206919927848503,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 473729.0,
+      "step": 97
+    },
+    {
+      "entropy": 0.35474758967757225,
+      "epoch": 1.1960183767228179,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 8.170731707317073e-05,
+      "loss": 0.0028819667641073465,
+      "mean_token_accuracy": 0.9993131868541241,
+      "num_tokens": 479034.0,
+      "step": 98
+    },
+    {
+      "entropy": 0.3854726795107126,
+      "epoch": 1.2082695252679938,
+      "grad_norm": 0.00046539306640625,
+      "learning_rate": 8.048780487804879e-05,
+      "loss": 9.724850679049268e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 484808.0,
+      "step": 99
+    },
+    {
+      "entropy": 0.31455889251083136,
+      "epoch": 1.22052067381317,
+      "grad_norm": 0.00958251953125,
+      "learning_rate": 7.926829268292683e-05,
+      "loss": 0.0009833230869844556,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 489519.0,
+      "step": 100
+    },
+    {
+      "epoch": 1.22052067381317,
+      "eval_entropy": 0.3496412036643512,
+      "eval_loss": 0.0005010219174437225,
+      "eval_mean_token_accuracy": 0.9998490343923154,
+      "eval_num_tokens": 489519.0,
+      "eval_runtime": 51.1698,
+      "eval_samples_per_second": 1.348,
+      "eval_steps_per_second": 1.348,
+      "step": 100
+    },
+    {
+      "entropy": 0.36140021588653326,
+      "epoch": 1.2327718223583461,
+      "grad_norm": 0.0006256103515625,
+      "learning_rate": 7.804878048780489e-05,
+      "loss": 0.00011641360470093787,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 494754.0,
+      "step": 101
+    },
+    {
+      "entropy": 0.33879768289625645,
+      "epoch": 1.245022970903522,
+      "grad_norm": 0.00037384033203125,
+      "learning_rate": 7.682926829268293e-05,
+      "loss": 0.00010185636347159743,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 499834.0,
+      "step": 102
+    },
+    {
+      "entropy": 0.36160764284431934,
+      "epoch": 1.2572741194486983,
+      "grad_norm": 0.00103759765625,
+      "learning_rate": 7.560975609756099e-05,
+      "loss": 0.00012021363363601267,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 505264.0,
+      "step": 103
+    },
+    {
+      "entropy": 0.3344170628115535,
+      "epoch": 1.2695252679938744,
+      "grad_norm": 0.06787109375,
+      "learning_rate": 7.439024390243903e-05,
+      "loss": 0.00044063289533369243,
+      "mean_token_accuracy": 0.9995915032923222,
+      "num_tokens": 510257.0,
+      "step": 104
+    },
+    {
+      "entropy": 0.36058457661420107,
+      "epoch": 1.2817764165390506,
+      "grad_norm": 0.01336669921875,
+      "learning_rate": 7.317073170731707e-05,
+      "loss": 0.0015127016231417656,
+      "mean_token_accuracy": 0.9993556700646877,
+      "num_tokens": 514490.0,
+      "step": 105
+    },
+    {
+      "entropy": 0.33314079977571964,
+      "epoch": 1.2940275650842268,
+      "grad_norm": 0.0011749267578125,
+      "learning_rate": 7.195121951219513e-05,
+      "loss": 0.00011071039625676349,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 519508.0,
+      "step": 106
+    },
+    {
+      "entropy": 0.3573821699246764,
+      "epoch": 1.3062787136294027,
+      "grad_norm": 0.0003986358642578125,
+      "learning_rate": 7.073170731707317e-05,
+      "loss": 0.00011713722778949887,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 524370.0,
+      "step": 107
+    },
+    {
+      "entropy": 0.3524222169071436,
+      "epoch": 1.318529862174579,
+      "grad_norm": 0.0003108978271484375,
+      "learning_rate": 6.951219512195122e-05,
+      "loss": 9.721294190967456e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 528970.0,
+      "step": 108
+    },
+    {
+      "entropy": 0.3544369339942932,
+      "epoch": 1.3307810107197549,
+      "grad_norm": 0.005950927734375,
+      "learning_rate": 6.829268292682928e-05,
+      "loss": 0.0003032644744962454,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 533938.0,
+      "step": 109
+    },
+    {
+      "entropy": 0.3304135613143444,
+      "epoch": 1.343032159264931,
+      "grad_norm": 0.000965118408203125,
+      "learning_rate": 6.707317073170732e-05,
+      "loss": 0.00012454115494620055,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 539360.0,
+      "step": 110
+    },
+    {
+      "entropy": 0.3306180518120527,
+      "epoch": 1.3552833078101072,
+      "grad_norm": 0.0011444091796875,
+      "learning_rate": 6.585365853658538e-05,
+      "loss": 0.00013282139843795449,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 543728.0,
+      "step": 111
+    },
+    {
+      "entropy": 0.3708817586302757,
+      "epoch": 1.3675344563552834,
+      "grad_norm": 0.0218505859375,
+      "learning_rate": 6.463414634146342e-05,
+      "loss": 0.004361060913652182,
+      "mean_token_accuracy": 0.9983282573521137,
+      "num_tokens": 548161.0,
+      "step": 112
+    },
+    {
+      "entropy": 0.35475645773112774,
+      "epoch": 1.3797856049004595,
+      "grad_norm": 0.01361083984375,
+      "learning_rate": 6.341463414634146e-05,
+      "loss": 0.0014049941673874855,
+      "mean_token_accuracy": 0.998511902987957,
+      "num_tokens": 553690.0,
+      "step": 113
+    },
+    {
+      "entropy": 0.3360502114519477,
+      "epoch": 1.3920367534456355,
+      "grad_norm": 0.00023746490478515625,
+      "learning_rate": 6.219512195121952e-05,
+      "loss": 8.739449549466372e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 558474.0,
+      "step": 114
+    },
+    {
+      "entropy": 0.35608484130352736,
+      "epoch": 1.4042879019908117,
+      "grad_norm": 0.0009765625,
+      "learning_rate": 6.097560975609756e-05,
+      "loss": 0.00013572419993579388,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 563962.0,
+      "step": 115
+    },
+    {
+      "entropy": 0.3591584851965308,
+      "epoch": 1.4165390505359878,
+      "grad_norm": 0.00103759765625,
+      "learning_rate": 5.975609756097561e-05,
+      "loss": 0.0001251319336006418,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 568300.0,
+      "step": 116
+    },
+    {
+      "entropy": 0.32333058025687933,
+      "epoch": 1.4287901990811638,
+      "grad_norm": 0.0002803802490234375,
+      "learning_rate": 5.853658536585366e-05,
+      "loss": 8.771298598730937e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 572892.0,
+      "step": 117
+    },
+    {
+      "entropy": 0.3675775118172169,
+      "epoch": 1.44104134762634,
+      "grad_norm": 0.0014495849609375,
+      "learning_rate": 5.731707317073171e-05,
+      "loss": 0.00014175268006511033,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 577889.0,
+      "step": 118
+    },
+    {
+      "entropy": 0.37294205371290445,
+      "epoch": 1.4532924961715161,
+      "grad_norm": 0.00099945068359375,
+      "learning_rate": 5.6097560975609764e-05,
+      "loss": 8.949499897425994e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 583125.0,
+      "step": 119
+    },
+    {
+      "entropy": 0.3598701385781169,
+      "epoch": 1.4655436447166923,
+      "grad_norm": 0.006011962890625,
+      "learning_rate": 5.487804878048781e-05,
+      "loss": 0.00018555490532889962,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 587853.0,
+      "step": 120
+    },
+    {
+      "entropy": 0.3222861588001251,
+      "epoch": 1.4777947932618682,
+      "grad_norm": 0.0174560546875,
+      "learning_rate": 5.365853658536586e-05,
+      "loss": 0.0032859183847904205,
+      "mean_token_accuracy": 0.9993932023644447,
+      "num_tokens": 592286.0,
+      "step": 121
+    },
+    {
+      "entropy": 0.3423085901886225,
+      "epoch": 1.4900459418070444,
+      "grad_norm": 0.000354766845703125,
+      "learning_rate": 5.2439024390243904e-05,
+      "loss": 9.40198588068597e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 597048.0,
+      "step": 122
+    },
+    {
+      "entropy": 0.3356657065451145,
+      "epoch": 1.5022970903522204,
+      "grad_norm": 0.00177764892578125,
+      "learning_rate": 5.121951219512195e-05,
+      "loss": 0.00018200451449956745,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 601352.0,
+      "step": 123
+    },
+    {
+      "entropy": 0.34760472923517227,
+      "epoch": 1.5145482388973965,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 5e-05,
+      "loss": 0.0016977301565930247,
+      "mean_token_accuracy": 0.9993686862289906,
+      "num_tokens": 606645.0,
+      "step": 124
+    },
+    {
+      "entropy": 0.34292006585747004,
+      "epoch": 1.5267993874425727,
+      "grad_norm": 0.00048828125,
+      "learning_rate": 4.878048780487805e-05,
+      "loss": 0.00011081612319685519,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 612407.0,
+      "step": 125
+    },
+    {
+      "entropy": 0.3405891256406903,
+      "epoch": 1.5390505359877489,
+      "grad_norm": 0.0098876953125,
+      "learning_rate": 4.75609756097561e-05,
+      "loss": 0.0002546444011386484,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 617229.0,
+      "step": 126
+    },
+    {
+      "entropy": 0.39804220758378506,
+      "epoch": 1.551301684532925,
+      "grad_norm": 0.00177001953125,
+      "learning_rate": 4.634146341463415e-05,
+      "loss": 0.00020191296061966568,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 622355.0,
+      "step": 127
+    },
+    {
+      "entropy": 0.38183566741645336,
+      "epoch": 1.5635528330781012,
+      "grad_norm": 0.0020294189453125,
+      "learning_rate": 4.51219512195122e-05,
+      "loss": 0.0002027210284722969,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 627269.0,
+      "step": 128
+    },
+    {
+      "entropy": 0.32283751480281353,
+      "epoch": 1.5758039816232772,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 4.390243902439025e-05,
+      "loss": 0.0007472627912648022,
+      "mean_token_accuracy": 0.9991987161338329,
+      "num_tokens": 631454.0,
+      "step": 129
+    },
+    {
+      "entropy": 0.31161691434681416,
+      "epoch": 1.5880551301684533,
+      "grad_norm": 0.00174713134765625,
+      "learning_rate": 4.26829268292683e-05,
+      "loss": 0.0001439937186660245,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 636502.0,
+      "step": 130
+    },
+    {
+      "entropy": 0.3435830660164356,
+      "epoch": 1.6003062787136293,
+      "grad_norm": 0.0308837890625,
+      "learning_rate": 4.146341463414634e-05,
+      "loss": 0.004759644623845816,
+      "mean_token_accuracy": 0.9986401423811913,
+      "num_tokens": 641264.0,
+      "step": 131
+    },
+    {
+      "entropy": 0.35103026777505875,
+      "epoch": 1.6125574272588055,
+      "grad_norm": 0.014404296875,
+      "learning_rate": 4.0243902439024395e-05,
+      "loss": 0.002162080956622958,
+      "mean_token_accuracy": 0.9997351691126823,
+      "num_tokens": 646377.0,
+      "step": 132
+    },
+    {
+      "entropy": 0.2977801924571395,
+      "epoch": 1.6248085758039816,
+      "grad_norm": 0.0003948211669921875,
+      "learning_rate": 3.9024390243902444e-05,
+      "loss": 0.00010242296411888674,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 650767.0,
+      "step": 133
+    },
+    {
+      "entropy": 0.3230333384126425,
+      "epoch": 1.6370597243491578,
+      "grad_norm": 0.00138092041015625,
+      "learning_rate": 3.780487804878049e-05,
+      "loss": 0.00015076796989887953,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 655169.0,
+      "step": 134
+    },
+    {
+      "entropy": 0.341650640591979,
+      "epoch": 1.649310872894334,
+      "grad_norm": 0.000942230224609375,
+      "learning_rate": 3.6585365853658535e-05,
+      "loss": 0.00014208458014763892,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 660290.0,
+      "step": 135
+    },
+    {
+      "entropy": 0.3829786740243435,
+      "epoch": 1.66156202143951,
+      "grad_norm": 0.00069427490234375,
+      "learning_rate": 3.5365853658536584e-05,
+      "loss": 0.00014442864630836993,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 664473.0,
+      "step": 136
+    },
+    {
+      "entropy": 0.36254822462797165,
+      "epoch": 1.673813169984686,
+      "grad_norm": 0.000873565673828125,
+      "learning_rate": 3.414634146341464e-05,
+      "loss": 0.00012407865142449737,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 669356.0,
+      "step": 137
+    },
+    {
+      "entropy": 0.3526885788887739,
+      "epoch": 1.686064318529862,
+      "grad_norm": 0.01544189453125,
+      "learning_rate": 3.292682926829269e-05,
+      "loss": 0.0013645780272781849,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 674911.0,
+      "step": 138
+    },
+    {
+      "entropy": 0.3426882065832615,
+      "epoch": 1.6983154670750382,
+      "grad_norm": 0.00136566162109375,
+      "learning_rate": 3.170731707317073e-05,
+      "loss": 0.00017942595877684653,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 679692.0,
+      "step": 139
+    },
+    {
+      "entropy": 0.36831479519605637,
+      "epoch": 1.7105666156202144,
+      "grad_norm": 0.01104736328125,
+      "learning_rate": 3.048780487804878e-05,
+      "loss": 0.00024098601716104895,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 685048.0,
+      "step": 140
+    },
+    {
+      "entropy": 0.3340944442898035,
+      "epoch": 1.7228177641653906,
+      "grad_norm": 0.000865936279296875,
+      "learning_rate": 2.926829268292683e-05,
+      "loss": 0.00013921498612035066,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 689396.0,
+      "step": 141
+    },
+    {
+      "entropy": 0.34801830537617207,
+      "epoch": 1.7350689127105667,
+      "grad_norm": 0.000965118408203125,
+      "learning_rate": 2.8048780487804882e-05,
+      "loss": 0.0001655905944062397,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 693189.0,
+      "step": 142
+    },
+    {
+      "entropy": 0.35556044429540634,
+      "epoch": 1.7473200612557427,
+      "grad_norm": 0.0019073486328125,
+      "learning_rate": 2.682926829268293e-05,
+      "loss": 0.00019044376676902175,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 697603.0,
+      "step": 143
+    },
+    {
+      "entropy": 0.3632572125643492,
+      "epoch": 1.7595712098009189,
+      "grad_norm": 0.00106048583984375,
+      "learning_rate": 2.5609756097560977e-05,
+      "loss": 0.00017029076116159558,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 703050.0,
+      "step": 144
+    },
+    {
+      "entropy": 0.35750158317387104,
+      "epoch": 1.7718223583460948,
+      "grad_norm": 0.0130615234375,
+      "learning_rate": 2.4390243902439026e-05,
+      "loss": 0.0015582278138026595,
+      "mean_token_accuracy": 0.999015748500824,
+      "num_tokens": 707862.0,
+      "step": 145
+    },
+    {
+      "entropy": 0.36597106605768204,
+      "epoch": 1.784073506891271,
+      "grad_norm": 0.000583648681640625,
+      "learning_rate": 2.3170731707317075e-05,
+      "loss": 0.00013483221118804067,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 712821.0,
+      "step": 146
+    },
+    {
+      "entropy": 0.35171396005898714,
+      "epoch": 1.7963246554364471,
+      "grad_norm": 0.00157928466796875,
+      "learning_rate": 2.1951219512195124e-05,
+      "loss": 0.00012708510621450841,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 718453.0,
+      "step": 147
+    },
+    {
+      "entropy": 0.3596025314182043,
+      "epoch": 1.8085758039816233,
+      "grad_norm": 0.000762939453125,
+      "learning_rate": 2.073170731707317e-05,
+      "loss": 0.00011665250349324197,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 723810.0,
+      "step": 148
+    },
+    {
+      "entropy": 0.3876404408365488,
+      "epoch": 1.8208269525267995,
+      "grad_norm": 0.00170135498046875,
+      "learning_rate": 1.9512195121951222e-05,
+      "loss": 0.00014468679728452116,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 728126.0,
+      "step": 149
+    },
+    {
+      "entropy": 0.3599753547459841,
+      "epoch": 1.8330781010719757,
+      "grad_norm": 0.00982666015625,
+      "learning_rate": 1.8292682926829268e-05,
+      "loss": 0.0008729367982596159,
+      "mean_token_accuracy": 0.9996936284005642,
+      "num_tokens": 733917.0,
+      "step": 150
+    },
+    {
+      "epoch": 1.8330781010719757,
+      "eval_entropy": 0.3504998228256253,
+      "eval_loss": 0.0005272864946164191,
+      "eval_mean_token_accuracy": 0.9998166846192401,
+      "eval_num_tokens": 733917.0,
+      "eval_runtime": 51.0847,
+      "eval_samples_per_second": 1.351,
+      "eval_steps_per_second": 1.351,
+      "step": 150
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 164,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.3232647063527424e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-150/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e57c6e38c07ad1986e976ce1287e590ea69e06de9f14255a4dbbd33296bac1cc
+size 5585

checkpoint-164/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:meta-llama/Meta-Llama-3.1-8B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** meta-llama/Meta-Llama-3.1-8B-Instruct
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoint-164/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "up_proj",
+    "o_proj",
+    "down_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-164/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e1cfd52fdf227b7e674c9a251aff7c304abf9a7a8919fce7857b076b93ca8e43
+size 83946192

checkpoint-164/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,109 @@

+{#- Chat template preamble: emit the BOS token, normalise the optional template inputs, and pull an optional leading system message out of `messages`. #}
+{#- The beginning-of-sequence token is always the first thing rendered. #}
+{{- bos_token }}
+{#- `custom_tools`, when defined, takes precedence over any `tools` value. #}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{#- Default: tool definitions are rendered inside the first user turn rather than the system turn. #}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{#- Default date used in the "Today Date" line when the caller does not pass `date_string`. #}
+{%- if not date_string is defined %}
+    {%- set date_string = "26 Jul 2024" %}
+{%- endif %}
+{#- Normalise a missing `tools` to none so later checks can use `tools is none` / `tools is not none`. #}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{#- When the first message has role 'system', its trimmed content becomes `system_message` and the message is dropped from `messages`; otherwise `system_message` is the empty string. #}
+{#- NOTE(review): messages[0] is read without a length check; behaviour for an empty `messages` list depends on the rendering environment - confirm. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+{#- System message + builtin tools #}
+{#- The system turn is rendered unconditionally, even when `system_message` is empty. #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{#- Any kind of tool use (builtin or user-supplied) adds the "Environment: ipython" line. #}
+{%- if builtin_tools is defined or tools is not none %}
+    {{- "Environment: ipython\n" }}
+{%- endif %}
+{#- List the builtin tools, comma-separated, leaving out 'code_interpreter'. The filters apply to `builtin_tools` before the surrounding string concatenation. #}
+{%- if builtin_tools is defined %}
+    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
+{%- endif %}
+{#- Fixed knowledge-cutoff line followed by the (possibly defaulted) `date_string`. #}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{#- Only when the caller set `tools_in_user_message` to false: put the calling instructions and each tool definition (pretty-printed JSON) in the system turn. #}
+{%- if tools is not none and not tools_in_user_message %}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{#- The extracted system message (possibly empty) comes last, then the end-of-turn marker closes the system turn. #}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+{#- Custom tools are passed in a user message with some extra guidance #}
+{#- Runs only when tools were supplied and `tools_in_user_message` is true (the default). #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {#- The first remaining message is consumed (trimmed content only) and removed from `messages`, so it is not rendered a second time by the per-message loop. #}
+    {#- NOTE(review): the role of that message is not checked; it is assumed to be the user's first message - confirm against callers. #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content']|trim %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {#- `raise_exception` is not defined in this template; presumably provided by the rendering environment - confirm. #}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+    {#- Render a user turn: calling instructions, each tool definition as pretty-printed JSON, then the original user text and the end-of-turn marker. #}
+    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+{#- Render every remaining message. Three cases: plain text turns, assistant tool calls, and tool results (role 'tool' or 'ipython'). #}
+{%- for message in messages %}
+    {#- Plain turn: role header, trimmed content, end-of-turn marker. #}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {#- Exactly one tool call per message is accepted; any other count raises. #}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
+            {#- Builtin tool: rendered as <|python_tag|>name.call(arg="value", ...) inside an assistant turn. #}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+            {#- NOTE(review): arg_val is concatenated directly with string literals, so non-string argument values are presumably unsupported here - confirm. #}
+            {%- for arg_name, arg_val in tool_call.arguments | items %}
+                {{- arg_name + '="' + arg_val + '"' }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+                {%- endfor %}
+            {{- ")" }}
+        {%- else  %}
+            {#- Any other tool: rendered as a JSON object {"name": ..., "parameters": ...} inside an assistant turn. #}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- '{"name": "' + tool_call.name + '", ' }}
+            {{- '"parameters": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- "}" }}
+        {%- endif %}
+        {#- A tool-call turn ends with <|eom_id|> whenever builtin tools are defined, otherwise with <|eot_id|>. #}
+        {%- if builtin_tools is defined %}
+            {#- This means we're in ipython mode #}
+            {{- "<|eom_id|>" }}
+        {%- else %}
+            {{- "<|eot_id|>" }}
+        {%- endif %}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {#- Tool results are always rendered under the 'ipython' header, whichever of the two roles was used. #}
+        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+        {#- NOTE(review): Jinja's `iterable` test also holds for strings, so string content appears to take the tojson branch - confirm this is intended. #}
+        {%- if message.content is mapping or message.content is iterable %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{#- When `add_generation_prompt` is truthy, finish with an open assistant header (no end-of-turn marker) so generation continues as the assistant. #}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}

checkpoint-164/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:790ca2f2587106fb193d92d7f10c85139892f97a9ddd96f10bc3dd3b1f58f028
+size 85728997

checkpoint-164/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:718a0f3db00824213036a2c0441849791319b7d9cf189065873bb26a7020738e
+size 14645

checkpoint-164/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8706973e007590bfa836db471dff71dd712c2b8887a568e7dc03dcbf8f4f93e7
+size 1465

checkpoint-164/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920

checkpoint-164/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eot_id|>",
+  "is_local": false,
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|eot_id|>",
+  "tokenizer_class": "TokenizersBackend"
+}

checkpoint-164/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1707 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 50,
+  "global_step": 164,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 0.32083135563880205,
+      "epoch": 0.01225114854517611,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0002,
+      "loss": 0.019214527681469917,
+      "mean_token_accuracy": 0.9918519593775272,
+      "num_tokens": 6092.0,
+      "step": 1
+    },
+    {
+      "entropy": 0.3576695416122675,
+      "epoch": 0.02450229709035222,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.00019878048780487805,
+      "loss": 0.03324645012617111,
+      "mean_token_accuracy": 0.988272774964571,
+      "num_tokens": 11535.0,
+      "step": 2
+    },
+    {
+      "entropy": 0.33352363388985395,
+      "epoch": 0.036753445635528334,
+      "grad_norm": 0.0272216796875,
+      "learning_rate": 0.0001975609756097561,
+      "loss": 0.0017091021873056889,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 16432.0,
+      "step": 3
+    },
+    {
+      "entropy": 0.35098350048065186,
+      "epoch": 0.04900459418070444,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.00019634146341463416,
+      "loss": 0.00414489908143878,
+      "mean_token_accuracy": 0.9985632188618183,
+      "num_tokens": 20507.0,
+      "step": 4
+    },
+    {
+      "entropy": 0.3005372080951929,
+      "epoch": 0.06125574272588055,
+      "grad_norm": 0.01416015625,
+      "learning_rate": 0.0001951219512195122,
+      "loss": 0.0008560216519981623,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 26122.0,
+      "step": 5
+    },
+    {
+      "entropy": 0.3177621979266405,
+      "epoch": 0.07350689127105667,
+      "grad_norm": 0.008544921875,
+      "learning_rate": 0.00019390243902439025,
+      "loss": 0.0005585744511336088,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 30847.0,
+      "step": 6
+    },
+    {
+      "entropy": 0.27754624653607607,
+      "epoch": 0.08575803981623277,
+      "grad_norm": 0.019775390625,
+      "learning_rate": 0.0001926829268292683,
+      "loss": 0.0012820134870707989,
+      "mean_token_accuracy": 0.9998413696885109,
+      "num_tokens": 36541.0,
+      "step": 7
+    },
+    {
+      "entropy": 0.30307829193770885,
+      "epoch": 0.09800918836140889,
+      "grad_norm": 0.004364013671875,
+      "learning_rate": 0.00019146341463414633,
+      "loss": 0.0003136860905215144,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 41001.0,
+      "step": 8
+    },
+    {
+      "entropy": 0.31226138956844807,
+      "epoch": 0.11026033690658499,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001902439024390244,
+      "loss": 0.006275261752307415,
+      "mean_token_accuracy": 0.9993216060101986,
+      "num_tokens": 45467.0,
+      "step": 9
+    },
+    {
+      "entropy": 0.2779384208843112,
+      "epoch": 0.1225114854517611,
+      "grad_norm": 0.011474609375,
+      "learning_rate": 0.00018902439024390244,
+      "loss": 0.0006869531353004277,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 50478.0,
+      "step": 10
+    },
+    {
+      "entropy": 0.27587867714464664,
+      "epoch": 0.13476263399693722,
+      "grad_norm": 0.00188446044921875,
+      "learning_rate": 0.0001878048780487805,
+      "loss": 0.0001916390028782189,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 56181.0,
+      "step": 11
+    },
+    {
+      "entropy": 0.2948900917544961,
+      "epoch": 0.14701378254211334,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.00018658536585365856,
+      "loss": 0.001886777114123106,
+      "mean_token_accuracy": 0.9998650103807449,
+      "num_tokens": 62946.0,
+      "step": 12
+    },
+    {
+      "entropy": 0.29555963445454836,
+      "epoch": 0.15926493108728942,
+      "grad_norm": 0.005523681640625,
+      "learning_rate": 0.0001853658536585366,
+      "loss": 0.00017441912495996803,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 68436.0,
+      "step": 13
+    },
+    {
+      "entropy": 0.287986209616065,
+      "epoch": 0.17151607963246554,
+      "grad_norm": 0.02001953125,
+      "learning_rate": 0.00018414634146341464,
+      "loss": 0.00017802949878387153,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 73603.0,
+      "step": 14
+    },
+    {
+      "entropy": 0.3127295421436429,
+      "epoch": 0.18376722817764166,
+      "grad_norm": 0.06787109375,
+      "learning_rate": 0.0001829268292682927,
+      "loss": 0.0010371531825512648,
+      "mean_token_accuracy": 0.9995941556990147,
+      "num_tokens": 77845.0,
+      "step": 15
+    },
+    {
+      "entropy": 0.2922206539660692,
+      "epoch": 0.19601837672281777,
+      "grad_norm": 0.00118255615234375,
+      "learning_rate": 0.00018170731707317075,
+      "loss": 0.00011905122664757073,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 82744.0,
+      "step": 16
+    },
+    {
+      "entropy": 0.2928574001416564,
+      "epoch": 0.2082695252679939,
+      "grad_norm": 0.0003719329833984375,
+      "learning_rate": 0.0001804878048780488,
+      "loss": 7.616190850967541e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 87453.0,
+      "step": 17
+    },
+    {
+      "entropy": 0.2979039028286934,
+      "epoch": 0.22052067381316998,
+      "grad_norm": 0.0026702880859375,
+      "learning_rate": 0.00017926829268292684,
+      "loss": 0.00012367898307275027,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 92321.0,
+      "step": 18
+    },
+    {
+      "entropy": 0.31858293898403645,
+      "epoch": 0.2327718223583461,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.00017804878048780488,
+      "loss": 0.0006579139153473079,
+      "mean_token_accuracy": 0.9997499994933605,
+      "num_tokens": 97146.0,
+      "step": 19
+    },
+    {
+      "entropy": 0.30853591673076153,
+      "epoch": 0.2450229709035222,
+      "grad_norm": 0.004364013671875,
+      "learning_rate": 0.00017682926829268295,
+      "loss": 0.00014281428593676537,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 101943.0,
+      "step": 20
+    },
+    {
+      "entropy": 0.34037051256746054,
+      "epoch": 0.2572741194486983,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.000175609756097561,
+      "loss": 0.011726096272468567,
+      "mean_token_accuracy": 0.9993422217667103,
+      "num_tokens": 106772.0,
+      "step": 21
+    },
+    {
+      "entropy": 0.29644382931292057,
+      "epoch": 0.26952526799387444,
+      "grad_norm": 0.0023193359375,
+      "learning_rate": 0.00017439024390243903,
+      "loss": 0.00010672100324882194,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 112558.0,
+      "step": 22
+    },
+    {
+      "entropy": 0.3180191367864609,
+      "epoch": 0.28177641653905056,
+      "grad_norm": 0.000675201416015625,
+      "learning_rate": 0.00017317073170731708,
+      "loss": 9.894849790725857e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 117489.0,
+      "step": 23
+    },
+    {
+      "entropy": 0.32946281880140305,
+      "epoch": 0.29402756508422667,
+      "grad_norm": 0.0242919921875,
+      "learning_rate": 0.00017195121951219512,
+      "loss": 0.0029232932720333338,
+      "mean_token_accuracy": 0.9996279776096344,
+      "num_tokens": 123010.0,
+      "step": 24
+    },
+    {
+      "entropy": 0.3180750487372279,
+      "epoch": 0.30627871362940273,
+      "grad_norm": 0.038330078125,
+      "learning_rate": 0.0001707317073170732,
+      "loss": 0.0015810562763363123,
+      "mean_token_accuracy": 0.9990344606339931,
+      "num_tokens": 127716.0,
+      "step": 25
+    },
+    {
+      "entropy": 0.31262985058128834,
+      "epoch": 0.31852986217457885,
+      "grad_norm": 0.0027313232421875,
+      "learning_rate": 0.00016951219512195123,
+      "loss": 0.00019670175970532,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 132372.0,
+      "step": 26
+    },
+    {
+      "entropy": 0.2831157138571143,
+      "epoch": 0.33078101071975496,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.00016829268292682927,
+      "loss": 0.003187144873663783,
+      "mean_token_accuracy": 0.9994877055287361,
+      "num_tokens": 137028.0,
+      "step": 27
+    },
+    {
+      "entropy": 0.3106652954593301,
+      "epoch": 0.3430321592649311,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.00016707317073170731,
+      "loss": 0.004998125601559877,
+      "mean_token_accuracy": 0.9980670101940632,
+      "num_tokens": 142088.0,
+      "step": 28
+    },
+    {
+      "entropy": 0.31454288959503174,
+      "epoch": 0.3552833078101072,
+      "grad_norm": 0.0306396484375,
+      "learning_rate": 0.00016585365853658536,
+      "loss": 0.000461318384623155,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 147481.0,
+      "step": 29
+    },
+    {
+      "entropy": 0.33650430012494326,
+      "epoch": 0.3675344563552833,
+      "grad_norm": 0.0238037109375,
+      "learning_rate": 0.00016463414634146343,
+      "loss": 0.0005614800029434264,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 152973.0,
+      "step": 30
+    },
+    {
+      "entropy": 0.33513325452804565,
+      "epoch": 0.37978560490045943,
+      "grad_norm": 0.00604248046875,
+      "learning_rate": 0.00016341463414634147,
+      "loss": 0.00020872258755844086,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 156786.0,
+      "step": 31
+    },
+    {
+      "entropy": 0.34442581795156,
+      "epoch": 0.39203675344563554,
+      "grad_norm": 0.0159912109375,
+      "learning_rate": 0.00016219512195121954,
+      "loss": 0.00043797443504445255,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 162859.0,
+      "step": 32
+    },
+    {
+      "entropy": 0.34709672816097736,
+      "epoch": 0.40428790199081166,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.00016097560975609758,
+      "loss": 0.0008612321689724922,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 167969.0,
+      "step": 33
+    },
+    {
+      "entropy": 0.31636961828917265,
+      "epoch": 0.4165390505359878,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.00015975609756097562,
+      "loss": 0.001623529358766973,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 172518.0,
+      "step": 34
+    },
+    {
+      "entropy": 0.341240718960762,
+      "epoch": 0.42879019908116384,
+      "grad_norm": 0.0089111328125,
+      "learning_rate": 0.00015853658536585366,
+      "loss": 0.0004598334198817611,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 177085.0,
+      "step": 35
+    },
+    {
+      "entropy": 0.3331515807658434,
+      "epoch": 0.44104134762633995,
+      "grad_norm": 0.0137939453125,
+      "learning_rate": 0.00015731707317073173,
+      "loss": 0.00047711117076687515,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 181617.0,
+      "step": 36
+    },
+    {
+      "entropy": 0.2969168536365032,
+      "epoch": 0.45329249617151607,
+      "grad_norm": 0.0296630859375,
+      "learning_rate": 0.00015609756097560978,
+      "loss": 0.0018673602025955915,
+      "mean_token_accuracy": 0.9982142858207226,
+      "num_tokens": 186836.0,
+      "step": 37
+    },
+    {
+      "entropy": 0.3208611598238349,
+      "epoch": 0.4655436447166922,
+      "grad_norm": 0.0034027099609375,
+      "learning_rate": 0.00015487804878048782,
+      "loss": 0.00018661899957805872,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 191224.0,
+      "step": 38
+    },
+    {
+      "entropy": 0.296407300978899,
+      "epoch": 0.4777947932618683,
+      "grad_norm": 0.003570556640625,
+      "learning_rate": 0.00015365853658536586,
+      "loss": 0.0001632017083466053,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 195926.0,
+      "step": 39
+    },
+    {
+      "entropy": 0.32142599392682314,
+      "epoch": 0.4900459418070444,
+      "grad_norm": 0.0277099609375,
+      "learning_rate": 0.0001524390243902439,
+      "loss": 0.0039696223102509975,
+      "mean_token_accuracy": 0.9992866478860378,
+      "num_tokens": 200772.0,
+      "step": 40
+    },
+    {
+      "entropy": 0.3037592498585582,
+      "epoch": 0.5022970903522205,
+      "grad_norm": 0.0026092529296875,
+      "learning_rate": 0.00015121951219512197,
+      "loss": 0.00013867147208657116,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 204499.0,
+      "step": 41
+    },
+    {
+      "entropy": 0.31665132474154234,
+      "epoch": 0.5145482388973966,
+      "grad_norm": 0.004730224609375,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.00025882094632834196,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 208814.0,
+      "step": 42
+    },
+    {
+      "entropy": 0.33023010194301605,
+      "epoch": 0.5267993874425727,
+      "grad_norm": 0.001922607421875,
+      "learning_rate": 0.00014878048780487806,
+      "loss": 0.00019074659212492406,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 213907.0,
+      "step": 43
+    },
+    {
+      "entropy": 0.334543508477509,
+      "epoch": 0.5390505359877489,
+      "grad_norm": 0.0018157958984375,
+      "learning_rate": 0.0001475609756097561,
+      "loss": 0.00011566472676349804,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 218988.0,
+      "step": 44
+    },
+    {
+      "entropy": 0.3078083451837301,
+      "epoch": 0.5513016845329249,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.00014634146341463414,
+      "loss": 0.0022110757417976856,
+      "mean_token_accuracy": 0.9987903237342834,
+      "num_tokens": 223595.0,
+      "step": 45
+    },
+    {
+      "entropy": 0.32667472772300243,
+      "epoch": 0.5635528330781011,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001451219512195122,
+      "loss": 0.0010719874408096075,
+      "mean_token_accuracy": 0.9991953931748867,
+      "num_tokens": 228244.0,
+      "step": 46
+    },
+    {
+      "entropy": 0.3273861287161708,
+      "epoch": 0.5758039816232772,
+      "grad_norm": 0.00057220458984375,
+      "learning_rate": 0.00014390243902439025,
+      "loss": 6.594268779736012e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 232606.0,
+      "step": 47
+    },
+    {
+      "entropy": 0.31728990003466606,
+      "epoch": 0.5880551301684533,
+      "grad_norm": 0.0003185272216796875,
+      "learning_rate": 0.0001426829268292683,
+      "loss": 8.574798266636208e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 236563.0,
+      "step": 48
+    },
+    {
+      "entropy": 0.34826087579131126,
+      "epoch": 0.6003062787136294,
+      "grad_norm": 0.00390625,
+      "learning_rate": 0.00014146341463414634,
+      "loss": 0.00015243196685332805,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 241214.0,
+      "step": 49
+    },
+    {
+      "entropy": 0.3367287954315543,
+      "epoch": 0.6125574272588055,
+      "grad_norm": 0.003265380859375,
+      "learning_rate": 0.00014024390243902438,
+      "loss": 0.0001341242023045197,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 245200.0,
+      "step": 50
+    },
+    {
+      "epoch": 0.6125574272588055,
+      "eval_entropy": 0.3212364659361217,
+      "eval_loss": 0.0014040147652849555,
+      "eval_mean_token_accuracy": 0.9998166846192401,
+      "eval_num_tokens": 245200.0,
+      "eval_runtime": 51.1353,
+      "eval_samples_per_second": 1.349,
+      "eval_steps_per_second": 1.349,
+      "step": 50
+    },
+    {
+      "entropy": 0.3274610061198473,
+      "epoch": 0.6248085758039816,
+      "grad_norm": 0.000518798828125,
+      "learning_rate": 0.00013902439024390245,
+      "loss": 6.213193410076201e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 249761.0,
+      "step": 51
+    },
+    {
+      "entropy": 0.3302043145522475,
+      "epoch": 0.6370597243491577,
+      "grad_norm": 0.00067901611328125,
+      "learning_rate": 0.0001378048780487805,
+      "loss": 7.391967665171251e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 254787.0,
+      "step": 52
+    },
+    {
+      "entropy": 0.3345805983990431,
+      "epoch": 0.6493108728943339,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.00013658536585365856,
+      "loss": 0.008045142516493797,
+      "mean_token_accuracy": 0.9975476562976837,
+      "num_tokens": 260287.0,
+      "step": 53
+    },
+    {
+      "entropy": 0.3093695640563965,
+      "epoch": 0.6615620214395099,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001353658536585366,
+      "loss": 0.0016300748102366924,
+      "mean_token_accuracy": 0.9998249299824238,
+      "num_tokens": 264810.0,
+      "step": 54
+    },
+    {
+      "entropy": 0.33090174850076437,
+      "epoch": 0.6738131699846861,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.00013414634146341464,
+      "loss": 0.0037348291371017694,
+      "mean_token_accuracy": 0.9990433678030968,
+      "num_tokens": 270386.0,
+      "step": 55
+    },
+    {
+      "entropy": 0.3455248447135091,
+      "epoch": 0.6860643185298622,
+      "grad_norm": 0.0301513671875,
+      "learning_rate": 0.0001329268292682927,
+      "loss": 0.0006253286846913397,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 274391.0,
+      "step": 56
+    },
+    {
+      "entropy": 0.3408086858689785,
+      "epoch": 0.6983154670750383,
+      "grad_norm": 0.0033111572265625,
+      "learning_rate": 0.00013170731707317076,
+      "loss": 0.00020847572886850685,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 279716.0,
+      "step": 57
+    },
+    {
+      "entropy": 0.29423840064555407,
+      "epoch": 0.7105666156202144,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001304878048780488,
+      "loss": 0.005600863602012396,
+      "mean_token_accuracy": 0.998680267482996,
+      "num_tokens": 285404.0,
+      "step": 58
+    },
+    {
+      "entropy": 0.33689095824956894,
+      "epoch": 0.7228177641653905,
+      "grad_norm": 0.057861328125,
+      "learning_rate": 0.00012926829268292684,
+      "loss": 0.009100214578211308,
+      "mean_token_accuracy": 0.9967310577630997,
+      "num_tokens": 290021.0,
+      "step": 59
+    },
+    {
+      "entropy": 0.3336018780246377,
+      "epoch": 0.7350689127105666,
+      "grad_norm": 0.005889892578125,
+      "learning_rate": 0.00012804878048780488,
+      "loss": 0.00015729578444734216,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 294890.0,
+      "step": 60
+    },
+    {
+      "entropy": 0.30060291569679976,
+      "epoch": 0.7473200612557427,
+      "grad_norm": 0.0172119140625,
+      "learning_rate": 0.00012682926829268293,
+      "loss": 0.00039864826248958707,
+      "mean_token_accuracy": 0.9993686862289906,
+      "num_tokens": 300384.0,
+      "step": 61
+    },
+    {
+      "entropy": 0.36021818965673447,
+      "epoch": 0.7595712098009189,
+      "grad_norm": 0.0025634765625,
+      "learning_rate": 0.000125609756097561,
+      "loss": 0.00016568033606745303,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 305805.0,
+      "step": 62
+    },
+    {
+      "entropy": 0.32536453381180763,
+      "epoch": 0.7718223583460949,
+      "grad_norm": 0.001800537109375,
+      "learning_rate": 0.00012439024390243904,
+      "loss": 0.00014585268218070269,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 310233.0,
+      "step": 63
+    },
+    {
+      "entropy": 0.31967335008084774,
+      "epoch": 0.7840735068912711,
+      "grad_norm": 0.0010223388671875,
+      "learning_rate": 0.00012317073170731708,
+      "loss": 0.00010060967179015279,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 314234.0,
+      "step": 64
+    },
+    {
+      "entropy": 0.34358128905296326,
+      "epoch": 0.7963246554364471,
+      "grad_norm": 0.000743865966796875,
+      "learning_rate": 0.00012195121951219512,
+      "loss": 9.478208812652156e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 319186.0,
+      "step": 65
+    },
+    {
+      "entropy": 0.33988895174115896,
+      "epoch": 0.8085758039816233,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.00012073170731707318,
+      "loss": 0.0011607923079282045,
+      "mean_token_accuracy": 0.9995629377663136,
+      "num_tokens": 324710.0,
+      "step": 66
+    },
+    {
+      "entropy": 0.3078791871666908,
+      "epoch": 0.8208269525267994,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.00011951219512195122,
+      "loss": 0.016102174296975136,
+      "mean_token_accuracy": 0.9935315921902657,
+      "num_tokens": 329942.0,
+      "step": 67
+    },
+    {
+      "entropy": 0.3587793167680502,
+      "epoch": 0.8330781010719756,
+      "grad_norm": 0.002716064453125,
+      "learning_rate": 0.00011829268292682926,
+      "loss": 0.0001911829021992162,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 334487.0,
+      "step": 68
+    },
+    {
+      "entropy": 0.360817888751626,
+      "epoch": 0.8453292496171516,
+      "grad_norm": 0.003753662109375,
+      "learning_rate": 0.00011707317073170732,
+      "loss": 0.00026575953233987093,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 338184.0,
+      "step": 69
+    },
+    {
+      "entropy": 0.3788213599473238,
+      "epoch": 0.8575803981623277,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.00011585365853658536,
+      "loss": 0.007251895032823086,
+      "mean_token_accuracy": 0.997805867344141,
+      "num_tokens": 342594.0,
+      "step": 70
+    },
+    {
+      "entropy": 0.37989665009081364,
+      "epoch": 0.8698315467075038,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.00011463414634146342,
+      "loss": 0.001519644632935524,
+      "mean_token_accuracy": 0.9997807033360004,
+      "num_tokens": 347798.0,
+      "step": 71
+    },
+    {
+      "entropy": 0.35538383200764656,
+      "epoch": 0.8820826952526799,
+      "grad_norm": 0.0038604736328125,
+      "learning_rate": 0.00011341463414634146,
+      "loss": 0.00030194621649570763,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 352122.0,
+      "step": 72
+    },
+    {
+      "entropy": 0.36578258499503136,
+      "epoch": 0.8943338437978561,
+      "grad_norm": 0.02001953125,
+      "learning_rate": 0.00011219512195121953,
+      "loss": 0.0018432819051668048,
+      "mean_token_accuracy": 0.9997568093240261,
+      "num_tokens": 357944.0,
+      "step": 73
+    },
+    {
+      "entropy": 0.3363148244097829,
+      "epoch": 0.9065849923430321,
+      "grad_norm": 0.01214599609375,
+      "learning_rate": 0.00011097560975609757,
+      "loss": 0.0004945008549839258,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 363815.0,
+      "step": 74
+    },
+    {
+      "entropy": 0.3567014401778579,
+      "epoch": 0.9188361408882083,
+      "grad_norm": 0.00160980224609375,
+      "learning_rate": 0.00010975609756097563,
+      "loss": 0.0002087215252686292,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 368871.0,
+      "step": 75
+    },
+    {
+      "entropy": 0.3798025632277131,
+      "epoch": 0.9310872894333844,
+      "grad_norm": 0.0242919921875,
+      "learning_rate": 0.00010853658536585367,
+      "loss": 0.0011810146970674396,
+      "mean_token_accuracy": 0.999143835157156,
+      "num_tokens": 373671.0,
+      "step": 76
+    },
+    {
+      "entropy": 0.3385667558759451,
+      "epoch": 0.9433384379785605,
+      "grad_norm": 0.00164031982421875,
+      "learning_rate": 0.00010731707317073172,
+      "loss": 0.00021391667542047799,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 379038.0,
+      "step": 77
+    },
+    {
+      "entropy": 0.37137152813374996,
+      "epoch": 0.9555895865237366,
+      "grad_norm": 0.0194091796875,
+      "learning_rate": 0.00010609756097560977,
+      "loss": 0.0009015509858727455,
+      "mean_token_accuracy": 0.9992977529764175,
+      "num_tokens": 384253.0,
+      "step": 78
+    },
+    {
+      "entropy": 0.35634181648492813,
+      "epoch": 0.9678407350689127,
+      "grad_norm": 0.002349853515625,
+      "learning_rate": 0.00010487804878048781,
+      "loss": 0.0003007323248311877,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 388348.0,
+      "step": 79
+    },
+    {
+      "entropy": 0.3363165808841586,
+      "epoch": 0.9800918836140888,
+      "grad_norm": 0.013916015625,
+      "learning_rate": 0.00010365853658536586,
+      "loss": 0.0015124119818210602,
+      "mean_token_accuracy": 0.999507874250412,
+      "num_tokens": 394214.0,
+      "step": 80
+    },
+    {
+      "entropy": 0.34769035689532757,
+      "epoch": 0.9923430321592649,
+      "grad_norm": 0.0264892578125,
+      "learning_rate": 0.0001024390243902439,
+      "loss": 0.0008837911300361156,
+      "mean_token_accuracy": 0.9992187507450581,
+      "num_tokens": 399114.0,
+      "step": 81
+    },
+    {
+      "entropy": 0.34723484665155413,
+      "epoch": 1.0,
+      "grad_norm": 0.002288818359375,
+      "learning_rate": 0.00010121951219512196,
+      "loss": 0.0002318796032341197,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 402130.0,
+      "step": 82
+    },
+    {
+      "entropy": 0.3677198924124241,
+      "epoch": 1.0122511485451762,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0028767124749720097,
+      "mean_token_accuracy": 0.9997509978711605,
+      "num_tokens": 406761.0,
+      "step": 83
+    },
+    {
+      "entropy": 0.3296260507777333,
+      "epoch": 1.0245022970903521,
+      "grad_norm": 0.0016326904296875,
+      "learning_rate": 9.878048780487805e-05,
+      "loss": 0.00020801745995413512,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 411367.0,
+      "step": 84
+    },
+    {
+      "entropy": 0.36815651040524244,
+      "epoch": 1.0367534456355283,
+      "grad_norm": 0.00299072265625,
+      "learning_rate": 9.75609756097561e-05,
+      "loss": 0.00034169916762039065,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 417768.0,
+      "step": 85
+    },
+    {
+      "entropy": 0.33015719801187515,
+      "epoch": 1.0490045941807045,
+      "grad_norm": 0.0019683837890625,
+      "learning_rate": 9.634146341463415e-05,
+      "loss": 0.0002285851223859936,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 421738.0,
+      "step": 86
+    },
+    {
+      "entropy": 0.33297139778733253,
+      "epoch": 1.0612557427258806,
+      "grad_norm": 0.0003604888916015625,
+      "learning_rate": 9.51219512195122e-05,
+      "loss": 0.00012145948858233169,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 426854.0,
+      "step": 87
+    },
+    {
+      "entropy": 0.4070947393774986,
+      "epoch": 1.0735068912710566,
+      "grad_norm": 0.017333984375,
+      "learning_rate": 9.390243902439024e-05,
+      "loss": 0.0016109611606225371,
+      "mean_token_accuracy": 0.9998486675322056,
+      "num_tokens": 431083.0,
+      "step": 88
+    },
+    {
+      "entropy": 0.3781026881188154,
+      "epoch": 1.0857580398162328,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 9.26829268292683e-05,
+      "loss": 0.003159651067107916,
+      "mean_token_accuracy": 0.9989801794290543,
+      "num_tokens": 435694.0,
+      "step": 89
+    },
+    {
+      "entropy": 0.3439221568405628,
+      "epoch": 1.098009188361409,
+      "grad_norm": 0.000949859619140625,
+      "learning_rate": 9.146341463414635e-05,
+      "loss": 0.00018103225738741457,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 440578.0,
+      "step": 90
+    },
+    {
+      "entropy": 0.38779534585773945,
+      "epoch": 1.110260336906585,
+      "grad_norm": 0.0142822265625,
+      "learning_rate": 9.02439024390244e-05,
+      "loss": 0.002015941310673952,
+      "mean_token_accuracy": 0.9984939768910408,
+      "num_tokens": 445238.0,
+      "step": 91
+    },
+    {
+      "entropy": 0.3697750475257635,
+      "epoch": 1.122511485451761,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 8.902439024390244e-05,
+      "loss": 0.006127167027443647,
+      "mean_token_accuracy": 0.9989957921206951,
+      "num_tokens": 449993.0,
+      "step": 92
+    },
+    {
+      "entropy": 0.34917816519737244,
+      "epoch": 1.1347626339969372,
+      "grad_norm": 0.0037384033203125,
+      "learning_rate": 8.78048780487805e-05,
+      "loss": 0.00024314325128216296,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 454976.0,
+      "step": 93
+    },
+    {
+      "entropy": 0.3524725306779146,
+      "epoch": 1.1470137825421134,
+      "grad_norm": 0.00104522705078125,
+      "learning_rate": 8.658536585365854e-05,
+      "loss": 0.00014462518447544426,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 459671.0,
+      "step": 94
+    },
+    {
+      "entropy": 0.3524913527071476,
+      "epoch": 1.1592649310872893,
+      "grad_norm": 0.000782012939453125,
+      "learning_rate": 8.53658536585366e-05,
+      "loss": 0.0001363266637781635,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 464310.0,
+      "step": 95
+    },
+    {
+      "entropy": 0.33474782202392817,
+      "epoch": 1.1715160796324655,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 8.414634146341464e-05,
+      "loss": 0.006995758973062038,
+      "mean_token_accuracy": 0.997385773807764,
+      "num_tokens": 468855.0,
+      "step": 96
+    },
+    {
+      "entropy": 0.34024662896990776,
+      "epoch": 1.1837672281776417,
+      "grad_norm": 0.000762939453125,
+      "learning_rate": 8.292682926829268e-05,
+      "loss": 0.00012206919927848503,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 473729.0,
+      "step": 97
+    },
+    {
+      "entropy": 0.35474758967757225,
+      "epoch": 1.1960183767228179,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 8.170731707317073e-05,
+      "loss": 0.0028819667641073465,
+      "mean_token_accuracy": 0.9993131868541241,
+      "num_tokens": 479034.0,
+      "step": 98
+    },
+    {
+      "entropy": 0.3854726795107126,
+      "epoch": 1.2082695252679938,
+      "grad_norm": 0.00046539306640625,
+      "learning_rate": 8.048780487804879e-05,
+      "loss": 9.724850679049268e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 484808.0,
+      "step": 99
+    },
+    {
+      "entropy": 0.31455889251083136,
+      "epoch": 1.22052067381317,
+      "grad_norm": 0.00958251953125,
+      "learning_rate": 7.926829268292683e-05,
+      "loss": 0.0009833230869844556,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 489519.0,
+      "step": 100
+    },
+    {
+      "epoch": 1.22052067381317,
+      "eval_entropy": 0.3496412036643512,
+      "eval_loss": 0.0005010219174437225,
+      "eval_mean_token_accuracy": 0.9998490343923154,
+      "eval_num_tokens": 489519.0,
+      "eval_runtime": 51.1698,
+      "eval_samples_per_second": 1.348,
+      "eval_steps_per_second": 1.348,
+      "step": 100
+    },
+    {
+      "entropy": 0.36140021588653326,
+      "epoch": 1.2327718223583461,
+      "grad_norm": 0.0006256103515625,
+      "learning_rate": 7.804878048780489e-05,
+      "loss": 0.00011641360470093787,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 494754.0,
+      "step": 101
+    },
+    {
+      "entropy": 0.33879768289625645,
+      "epoch": 1.245022970903522,
+      "grad_norm": 0.00037384033203125,
+      "learning_rate": 7.682926829268293e-05,
+      "loss": 0.00010185636347159743,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 499834.0,
+      "step": 102
+    },
+    {
+      "entropy": 0.36160764284431934,
+      "epoch": 1.2572741194486983,
+      "grad_norm": 0.00103759765625,
+      "learning_rate": 7.560975609756099e-05,
+      "loss": 0.00012021363363601267,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 505264.0,
+      "step": 103
+    },
+    {
+      "entropy": 0.3344170628115535,
+      "epoch": 1.2695252679938744,
+      "grad_norm": 0.06787109375,
+      "learning_rate": 7.439024390243903e-05,
+      "loss": 0.00044063289533369243,
+      "mean_token_accuracy": 0.9995915032923222,
+      "num_tokens": 510257.0,
+      "step": 104
+    },
+    {
+      "entropy": 0.36058457661420107,
+      "epoch": 1.2817764165390506,
+      "grad_norm": 0.01336669921875,
+      "learning_rate": 7.317073170731707e-05,
+      "loss": 0.0015127016231417656,
+      "mean_token_accuracy": 0.9993556700646877,
+      "num_tokens": 514490.0,
+      "step": 105
+    },
+    {
+      "entropy": 0.33314079977571964,
+      "epoch": 1.2940275650842268,
+      "grad_norm": 0.0011749267578125,
+      "learning_rate": 7.195121951219513e-05,
+      "loss": 0.00011071039625676349,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 519508.0,
+      "step": 106
+    },
+    {
+      "entropy": 0.3573821699246764,
+      "epoch": 1.3062787136294027,
+      "grad_norm": 0.0003986358642578125,
+      "learning_rate": 7.073170731707317e-05,
+      "loss": 0.00011713722778949887,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 524370.0,
+      "step": 107
+    },
+    {
+      "entropy": 0.3524222169071436,
+      "epoch": 1.318529862174579,
+      "grad_norm": 0.0003108978271484375,
+      "learning_rate": 6.951219512195122e-05,
+      "loss": 9.721294190967456e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 528970.0,
+      "step": 108
+    },
+    {
+      "entropy": 0.3544369339942932,
+      "epoch": 1.3307810107197549,
+      "grad_norm": 0.005950927734375,
+      "learning_rate": 6.829268292682928e-05,
+      "loss": 0.0003032644744962454,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 533938.0,
+      "step": 109
+    },
+    {
+      "entropy": 0.3304135613143444,
+      "epoch": 1.343032159264931,
+      "grad_norm": 0.000965118408203125,
+      "learning_rate": 6.707317073170732e-05,
+      "loss": 0.00012454115494620055,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 539360.0,
+      "step": 110
+    },
+    {
+      "entropy": 0.3306180518120527,
+      "epoch": 1.3552833078101072,
+      "grad_norm": 0.0011444091796875,
+      "learning_rate": 6.585365853658538e-05,
+      "loss": 0.00013282139843795449,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 543728.0,
+      "step": 111
+    },
+    {
+      "entropy": 0.3708817586302757,
+      "epoch": 1.3675344563552834,
+      "grad_norm": 0.0218505859375,
+      "learning_rate": 6.463414634146342e-05,
+      "loss": 0.004361060913652182,
+      "mean_token_accuracy": 0.9983282573521137,
+      "num_tokens": 548161.0,
+      "step": 112
+    },
+    {
+      "entropy": 0.35475645773112774,
+      "epoch": 1.3797856049004595,
+      "grad_norm": 0.01361083984375,
+      "learning_rate": 6.341463414634146e-05,
+      "loss": 0.0014049941673874855,
+      "mean_token_accuracy": 0.998511902987957,
+      "num_tokens": 553690.0,
+      "step": 113
+    },
+    {
+      "entropy": 0.3360502114519477,
+      "epoch": 1.3920367534456355,
+      "grad_norm": 0.00023746490478515625,
+      "learning_rate": 6.219512195121952e-05,
+      "loss": 8.739449549466372e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 558474.0,
+      "step": 114
+    },
+    {
+      "entropy": 0.35608484130352736,
+      "epoch": 1.4042879019908117,
+      "grad_norm": 0.0009765625,
+      "learning_rate": 6.097560975609756e-05,
+      "loss": 0.00013572419993579388,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 563962.0,
+      "step": 115
+    },
+    {
+      "entropy": 0.3591584851965308,
+      "epoch": 1.4165390505359878,
+      "grad_norm": 0.00103759765625,
+      "learning_rate": 5.975609756097561e-05,
+      "loss": 0.0001251319336006418,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 568300.0,
+      "step": 116
+    },
+    {
+      "entropy": 0.32333058025687933,
+      "epoch": 1.4287901990811638,
+      "grad_norm": 0.0002803802490234375,
+      "learning_rate": 5.853658536585366e-05,
+      "loss": 8.771298598730937e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 572892.0,
+      "step": 117
+    },
+    {
+      "entropy": 0.3675775118172169,
+      "epoch": 1.44104134762634,
+      "grad_norm": 0.0014495849609375,
+      "learning_rate": 5.731707317073171e-05,
+      "loss": 0.00014175268006511033,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 577889.0,
+      "step": 118
+    },
+    {
+      "entropy": 0.37294205371290445,
+      "epoch": 1.4532924961715161,
+      "grad_norm": 0.00099945068359375,
+      "learning_rate": 5.6097560975609764e-05,
+      "loss": 8.949499897425994e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 583125.0,
+      "step": 119
+    },
+    {
+      "entropy": 0.3598701385781169,
+      "epoch": 1.4655436447166923,
+      "grad_norm": 0.006011962890625,
+      "learning_rate": 5.487804878048781e-05,
+      "loss": 0.00018555490532889962,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 587853.0,
+      "step": 120
+    },
+    {
+      "entropy": 0.3222861588001251,
+      "epoch": 1.4777947932618682,
+      "grad_norm": 0.0174560546875,
+      "learning_rate": 5.365853658536586e-05,
+      "loss": 0.0032859183847904205,
+      "mean_token_accuracy": 0.9993932023644447,
+      "num_tokens": 592286.0,
+      "step": 121
+    },
+    {
+      "entropy": 0.3423085901886225,
+      "epoch": 1.4900459418070444,
+      "grad_norm": 0.000354766845703125,
+      "learning_rate": 5.2439024390243904e-05,
+      "loss": 9.40198588068597e-05,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 597048.0,
+      "step": 122
+    },
+    {
+      "entropy": 0.3356657065451145,
+      "epoch": 1.5022970903522204,
+      "grad_norm": 0.00177764892578125,
+      "learning_rate": 5.121951219512195e-05,
+      "loss": 0.00018200451449956745,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 601352.0,
+      "step": 123
+    },
+    {
+      "entropy": 0.34760472923517227,
+      "epoch": 1.5145482388973965,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 5e-05,
+      "loss": 0.0016977301565930247,
+      "mean_token_accuracy": 0.9993686862289906,
+      "num_tokens": 606645.0,
+      "step": 124
+    },
+    {
+      "entropy": 0.34292006585747004,
+      "epoch": 1.5267993874425727,
+      "grad_norm": 0.00048828125,
+      "learning_rate": 4.878048780487805e-05,
+      "loss": 0.00011081612319685519,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 612407.0,
+      "step": 125
+    },
+    {
+      "entropy": 0.3405891256406903,
+      "epoch": 1.5390505359877489,
+      "grad_norm": 0.0098876953125,
+      "learning_rate": 4.75609756097561e-05,
+      "loss": 0.0002546444011386484,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 617229.0,
+      "step": 126
+    },
+    {
+      "entropy": 0.39804220758378506,
+      "epoch": 1.551301684532925,
+      "grad_norm": 0.00177001953125,
+      "learning_rate": 4.634146341463415e-05,
+      "loss": 0.00020191296061966568,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 622355.0,
+      "step": 127
+    },
+    {
+      "entropy": 0.38183566741645336,
+      "epoch": 1.5635528330781012,
+      "grad_norm": 0.0020294189453125,
+      "learning_rate": 4.51219512195122e-05,
+      "loss": 0.0002027210284722969,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 627269.0,
+      "step": 128
+    },
+    {
+      "entropy": 0.32283751480281353,
+      "epoch": 1.5758039816232772,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 4.390243902439025e-05,
+      "loss": 0.0007472627912648022,
+      "mean_token_accuracy": 0.9991987161338329,
+      "num_tokens": 631454.0,
+      "step": 129
+    },
+    {
+      "entropy": 0.31161691434681416,
+      "epoch": 1.5880551301684533,
+      "grad_norm": 0.00174713134765625,
+      "learning_rate": 4.26829268292683e-05,
+      "loss": 0.0001439937186660245,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 636502.0,
+      "step": 130
+    },
+    {
+      "entropy": 0.3435830660164356,
+      "epoch": 1.6003062787136293,
+      "grad_norm": 0.0308837890625,
+      "learning_rate": 4.146341463414634e-05,
+      "loss": 0.004759644623845816,
+      "mean_token_accuracy": 0.9986401423811913,
+      "num_tokens": 641264.0,
+      "step": 131
+    },
+    {
+      "entropy": 0.35103026777505875,
+      "epoch": 1.6125574272588055,
+      "grad_norm": 0.014404296875,
+      "learning_rate": 4.0243902439024395e-05,
+      "loss": 0.002162080956622958,
+      "mean_token_accuracy": 0.9997351691126823,
+      "num_tokens": 646377.0,
+      "step": 132
+    },
+    {
+      "entropy": 0.2977801924571395,
+      "epoch": 1.6248085758039816,
+      "grad_norm": 0.0003948211669921875,
+      "learning_rate": 3.9024390243902444e-05,
+      "loss": 0.00010242296411888674,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 650767.0,
+      "step": 133
+    },
+    {
+      "entropy": 0.3230333384126425,
+      "epoch": 1.6370597243491578,
+      "grad_norm": 0.00138092041015625,
+      "learning_rate": 3.780487804878049e-05,
+      "loss": 0.00015076796989887953,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 655169.0,
+      "step": 134
+    },
+    {
+      "entropy": 0.341650640591979,
+      "epoch": 1.649310872894334,
+      "grad_norm": 0.000942230224609375,
+      "learning_rate": 3.6585365853658535e-05,
+      "loss": 0.00014208458014763892,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 660290.0,
+      "step": 135
+    },
+    {
+      "entropy": 0.3829786740243435,
+      "epoch": 1.66156202143951,
+      "grad_norm": 0.00069427490234375,
+      "learning_rate": 3.5365853658536584e-05,
+      "loss": 0.00014442864630836993,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 664473.0,
+      "step": 136
+    },
+    {
+      "entropy": 0.36254822462797165,
+      "epoch": 1.673813169984686,
+      "grad_norm": 0.000873565673828125,
+      "learning_rate": 3.414634146341464e-05,
+      "loss": 0.00012407865142449737,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 669356.0,
+      "step": 137
+    },
+    {
+      "entropy": 0.3526885788887739,
+      "epoch": 1.686064318529862,
+      "grad_norm": 0.01544189453125,
+      "learning_rate": 3.292682926829269e-05,
+      "loss": 0.0013645780272781849,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 674911.0,
+      "step": 138
+    },
+    {
+      "entropy": 0.3426882065832615,
+      "epoch": 1.6983154670750382,
+      "grad_norm": 0.00136566162109375,
+      "learning_rate": 3.170731707317073e-05,
+      "loss": 0.00017942595877684653,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 679692.0,
+      "step": 139
+    },
+    {
+      "entropy": 0.36831479519605637,
+      "epoch": 1.7105666156202144,
+      "grad_norm": 0.01104736328125,
+      "learning_rate": 3.048780487804878e-05,
+      "loss": 0.00024098601716104895,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 685048.0,
+      "step": 140
+    },
+    {
+      "entropy": 0.3340944442898035,
+      "epoch": 1.7228177641653906,
+      "grad_norm": 0.000865936279296875,
+      "learning_rate": 2.926829268292683e-05,
+      "loss": 0.00013921498612035066,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 689396.0,
+      "step": 141
+    },
+    {
+      "entropy": 0.34801830537617207,
+      "epoch": 1.7350689127105667,
+      "grad_norm": 0.000965118408203125,
+      "learning_rate": 2.8048780487804882e-05,
+      "loss": 0.0001655905944062397,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 693189.0,
+      "step": 142
+    },
+    {
+      "entropy": 0.35556044429540634,
+      "epoch": 1.7473200612557427,
+      "grad_norm": 0.0019073486328125,
+      "learning_rate": 2.682926829268293e-05,
+      "loss": 0.00019044376676902175,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 697603.0,
+      "step": 143
+    },
+    {
+      "entropy": 0.3632572125643492,
+      "epoch": 1.7595712098009189,
+      "grad_norm": 0.00106048583984375,
+      "learning_rate": 2.5609756097560977e-05,
+      "loss": 0.00017029076116159558,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 703050.0,
+      "step": 144
+    },
+    {
+      "entropy": 0.35750158317387104,
+      "epoch": 1.7718223583460948,
+      "grad_norm": 0.0130615234375,
+      "learning_rate": 2.4390243902439026e-05,
+      "loss": 0.0015582278138026595,
+      "mean_token_accuracy": 0.999015748500824,
+      "num_tokens": 707862.0,
+      "step": 145
+    },
+    {
+      "entropy": 0.36597106605768204,
+      "epoch": 1.784073506891271,
+      "grad_norm": 0.000583648681640625,
+      "learning_rate": 2.3170731707317075e-05,
+      "loss": 0.00013483221118804067,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 712821.0,
+      "step": 146
+    },
+    {
+      "entropy": 0.35171396005898714,
+      "epoch": 1.7963246554364471,
+      "grad_norm": 0.00157928466796875,
+      "learning_rate": 2.1951219512195124e-05,
+      "loss": 0.00012708510621450841,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 718453.0,
+      "step": 147
+    },
+    {
+      "entropy": 0.3596025314182043,
+      "epoch": 1.8085758039816233,
+      "grad_norm": 0.000762939453125,
+      "learning_rate": 2.073170731707317e-05,
+      "loss": 0.00011665250349324197,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 723810.0,
+      "step": 148
+    },
+    {
+      "entropy": 0.3876404408365488,
+      "epoch": 1.8208269525267995,
+      "grad_norm": 0.00170135498046875,
+      "learning_rate": 1.9512195121951222e-05,
+      "loss": 0.00014468679728452116,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 728126.0,
+      "step": 149
+    },
+    {
+      "entropy": 0.3599753547459841,
+      "epoch": 1.8330781010719757,
+      "grad_norm": 0.00982666015625,
+      "learning_rate": 1.8292682926829268e-05,
+      "loss": 0.0008729367982596159,
+      "mean_token_accuracy": 0.9996936284005642,
+      "num_tokens": 733917.0,
+      "step": 150
+    },
+    {
+      "epoch": 1.8330781010719757,
+      "eval_entropy": 0.3504998228256253,
+      "eval_loss": 0.0005272864946164191,
+      "eval_mean_token_accuracy": 0.9998166846192401,
+      "eval_num_tokens": 733917.0,
+      "eval_runtime": 51.0847,
+      "eval_samples_per_second": 1.351,
+      "eval_steps_per_second": 1.351,
+      "step": 150
+    },
+    {
+      "entropy": 0.33359322790056467,
+      "epoch": 1.8453292496171516,
+      "grad_norm": 0.025146484375,
+      "learning_rate": 1.707317073170732e-05,
+      "loss": 0.0006189637933857739,
+      "mean_token_accuracy": 0.9997438527643681,
+      "num_tokens": 738160.0,
+      "step": 151
+    },
+    {
+      "entropy": 0.3766339849680662,
+      "epoch": 1.8575803981623276,
+      "grad_norm": 0.00141143798828125,
+      "learning_rate": 1.5853658536585366e-05,
+      "loss": 0.0001552235771669075,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 743916.0,
+      "step": 152
+    },
+    {
+      "entropy": 0.3593102600425482,
+      "epoch": 1.8698315467075037,
+      "grad_norm": 0.0005035400390625,
+      "learning_rate": 1.4634146341463415e-05,
+      "loss": 0.00010784749611048028,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 749557.0,
+      "step": 153
+    },
+    {
+      "entropy": 0.35354321263730526,
+      "epoch": 1.88208269525268,
+      "grad_norm": 0.006622314453125,
+      "learning_rate": 1.3414634146341466e-05,
+      "loss": 0.00018536817515268922,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 755349.0,
+      "step": 154
+    },
+    {
+      "entropy": 0.38418914191424847,
+      "epoch": 1.894333843797856,
+      "grad_norm": 0.0048828125,
+      "learning_rate": 1.2195121951219513e-05,
+      "loss": 0.00016054415027610958,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 760507.0,
+      "step": 155
+    },
+    {
+      "entropy": 0.36554452031850815,
+      "epoch": 1.9065849923430322,
+      "grad_norm": 0.000640869140625,
+      "learning_rate": 1.0975609756097562e-05,
+      "loss": 0.0001385942887281999,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 765415.0,
+      "step": 156
+    },
+    {
+      "entropy": 0.3568859798833728,
+      "epoch": 1.9188361408882084,
+      "grad_norm": 0.005401611328125,
+      "learning_rate": 9.756097560975611e-06,
+      "loss": 0.00035842141369357705,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 770652.0,
+      "step": 157
+    },
+    {
+      "entropy": 0.3693056581541896,
+      "epoch": 1.9310872894333844,
+      "grad_norm": 0.0006561279296875,
+      "learning_rate": 8.53658536585366e-06,
+      "loss": 0.00012641935609281063,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 775652.0,
+      "step": 158
+    },
+    {
+      "entropy": 0.3441598182544112,
+      "epoch": 1.9433384379785605,
+      "grad_norm": 0.0030364990234375,
+      "learning_rate": 7.317073170731707e-06,
+      "loss": 0.00021011351782362908,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 779850.0,
+      "step": 159
+    },
+    {
+      "entropy": 0.3504209266975522,
+      "epoch": 1.9555895865237365,
+      "grad_norm": 0.0021514892578125,
+      "learning_rate": 6.0975609756097564e-06,
+      "loss": 0.0001835815783124417,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 784524.0,
+      "step": 160
+    },
+    {
+      "entropy": 0.3750592265278101,
+      "epoch": 1.9678407350689127,
+      "grad_norm": 0.000492095947265625,
+      "learning_rate": 4.8780487804878055e-06,
+      "loss": 0.00013111173757351935,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 790219.0,
+      "step": 161
+    },
+    {
+      "entropy": 0.35673493705689907,
+      "epoch": 1.9800918836140888,
+      "grad_norm": 0.003204345703125,
+      "learning_rate": 3.6585365853658537e-06,
+      "loss": 0.00022010535758454353,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 794984.0,
+      "step": 162
+    },
+    {
+      "entropy": 0.3832458099350333,
+      "epoch": 1.992343032159265,
+      "grad_norm": 0.00061798095703125,
+      "learning_rate": 2.4390243902439027e-06,
+      "loss": 0.00012069179501850158,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 800604.0,
+      "step": 163
+    },
+    {
+      "entropy": 0.37493912875652313,
+      "epoch": 2.0,
+      "grad_norm": 0.023193359375,
+      "learning_rate": 1.2195121951219514e-06,
+      "loss": 0.002225137548521161,
+      "mean_token_accuracy": 0.999763035774231,
+      "num_tokens": 804260.0,
+      "step": 164
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 164,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.641786295631872e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-164/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e57c6e38c07ad1986e976ce1287e590ea69e06de9f14255a4dbbd33296bac1cc
+size 5585

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eot_id|>",
+  "is_local": false,
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|eot_id|>",
+  "tokenizer_class": "TokenizersBackend"
+}