SpireLab committed on
Commit
1692e82
·
verified ·
1 Parent(s): 1ab19de

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +17 -0
  2. train_data/.gitattributes +35 -0
  3. train_data/.gitignore +1 -0
  4. train_data/README.md +101 -0
  5. train_data/muril_bh_domain/checkpoint-4000/README.md +202 -0
  6. train_data/muril_bh_domain/checkpoint-4000/adapter_config.json +32 -0
  7. train_data/muril_bh_domain/checkpoint-4000/adapter_model.safetensors +3 -0
  8. train_data/muril_bh_domain/checkpoint-4000/optimizer.pt +3 -0
  9. train_data/muril_bh_domain/checkpoint-4000/rng_state.pth +3 -0
  10. train_data/muril_bh_domain/checkpoint-4000/scheduler.pt +3 -0
  11. train_data/muril_bh_domain/checkpoint-4000/trainer_state.json +341 -0
  12. train_data/muril_bh_domain/checkpoint-4000/training_args.bin +3 -0
  13. train_data/muril_bh_domain/checkpoint-4740/README.md +202 -0
  14. train_data/muril_bh_domain/checkpoint-4740/adapter_config.json +32 -0
  15. train_data/muril_bh_domain/checkpoint-4740/adapter_model.safetensors +3 -0
  16. train_data/muril_bh_domain/checkpoint-4740/optimizer.pt +3 -0
  17. train_data/muril_bh_domain/checkpoint-4740/rng_state.pth +3 -0
  18. train_data/muril_bh_domain/checkpoint-4740/scheduler.pt +3 -0
  19. train_data/muril_bh_domain/checkpoint-4740/trainer_state.json +390 -0
  20. train_data/muril_bh_domain/checkpoint-4740/training_args.bin +3 -0
  21. train_data/muril_bh_domain/config.json +26 -0
  22. train_data/muril_bh_domain/generation_config.json +5 -0
  23. train_data/muril_bh_domain/model.safetensors +3 -0
  24. train_data/muril_bh_domain/special_tokens_map.json +7 -0
  25. train_data/muril_bh_domain/tokenizer.json +0 -0
  26. train_data/muril_bh_domain/tokenizer_config.json +58 -0
  27. train_data/muril_bh_domain/vocab.txt +0 -0
  28. train_data/muril_bn_domain/config.json +26 -0
  29. train_data/muril_bn_domain/generation_config.json +5 -0
  30. train_data/muril_bn_domain/model.safetensors +3 -0
  31. train_data/muril_bn_domain/special_tokens_map.json +7 -0
  32. train_data/muril_bn_domain/tokenizer.json +0 -0
  33. train_data/muril_bn_domain/tokenizer_config.json +58 -0
  34. train_data/muril_bn_domain/training_args.bin +3 -0
  35. train_data/muril_bn_domain/vocab.txt +0 -0
  36. train_data/muril_ch_domain/checkpoint-30500/README.md +202 -0
  37. train_data/muril_ch_domain/checkpoint-30500/adapter_config.json +34 -0
  38. train_data/muril_ch_domain/checkpoint-30500/adapter_model.safetensors +3 -0
  39. train_data/muril_ch_domain/checkpoint-30500/optimizer.pt +3 -0
  40. train_data/muril_ch_domain/checkpoint-30500/rng_state.pth +3 -0
  41. train_data/muril_ch_domain/checkpoint-30500/scheduler.pt +3 -0
  42. train_data/muril_ch_domain/checkpoint-30500/trainer_state.json +2595 -0
  43. train_data/muril_ch_domain/checkpoint-30500/training_args.bin +3 -0
  44. train_data/muril_ch_domain/checkpoint-30663/README.md +202 -0
  45. train_data/muril_ch_domain/checkpoint-30663/adapter_config.json +34 -0
  46. train_data/muril_ch_domain/checkpoint-30663/adapter_model.safetensors +3 -0
  47. train_data/muril_ch_domain/checkpoint-30663/optimizer.pt +3 -0
  48. train_data/muril_ch_domain/checkpoint-30663/rng_state.pth +3 -0
  49. train_data/muril_ch_domain/checkpoint-30663/scheduler.pt +3 -0
  50. train_data/muril_ch_domain/checkpoint-30663/trainer_state.json +2602 -0
.gitattributes CHANGED
@@ -33,3 +33,20 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ train_data/train_data/bn/clean_scraped_3.txt filter=lfs diff=lfs merge=lfs -text
37
+ train_data/train_data/bn/merge_bn.txt filter=lfs diff=lfs merge=lfs -text
38
+ train_data/train_data/bn/ocr_newspaper.txt filter=lfs diff=lfs merge=lfs -text
39
+ train_data/train_data/ch/raw_corpus.txt filter=lfs diff=lfs merge=lfs -text
40
+ train_data/train_data/hi/clean_scraped.txt filter=lfs diff=lfs merge=lfs -text
41
+ train_data/train_data/hi/clean_scraped_2.txt filter=lfs diff=lfs merge=lfs -text
42
+ train_data/train_data/hi/clean_scraped_3.txt filter=lfs diff=lfs merge=lfs -text
43
+ train_data/train_data/hi/merge_hi.txt filter=lfs diff=lfs merge=lfs -text
44
+ train_data/train_data/hi/ocr_newspapers.txt filter=lfs diff=lfs merge=lfs -text
45
+ train_data/train_data/kn/clean_scraped_2.txt filter=lfs diff=lfs merge=lfs -text
46
+ train_data/train_data/kn/merge_kn.txt filter=lfs diff=lfs merge=lfs -text
47
+ train_data/train_data/mag/raw_corpus_mag.txt filter=lfs diff=lfs merge=lfs -text
48
+ train_data/train_data/mr/clean_scraped.txt filter=lfs diff=lfs merge=lfs -text
49
+ train_data/train_data/mr/clean_scraped_3.txt filter=lfs diff=lfs merge=lfs -text
50
+ train_data/train_data/mr/merge_mr.txt filter=lfs diff=lfs merge=lfs -text
51
+ train_data/train_data/te/clean_scraped.txt filter=lfs diff=lfs merge=lfs -text
52
+ train_data/train_data/te/ocr_newspaper.txt filter=lfs diff=lfs merge=lfs -text
train_data/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
train_data/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .cache/
train_data/README.md ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - hi
4
+ - bn
5
+ - te
6
+ - mr
7
+ - kn
8
+ - bho
9
+ - mai
10
+ - mag
11
+ - hne
12
+ tags:
13
+ - muril
14
+ - bert
15
+ - mlm
16
+ - low-resource
17
+ - indic-languages
18
+ license: apache-2.0
19
+ library_name: transformers
20
+ pipeline_tag: fill-mask
21
+ ---
22
+
23
+ # Fine-Tuned MuRIL Model On Indic Languages (RESPIN)
24
+
25
+ ## 1. Abstract
26
+
27
+ This document details the evaluation and performance metrics of a fine-tuned MuRIL (Multilingual Representations for Indian Languages) model. The primary objective was to adapt the pre-trained MuRIL architecture to the Indic text domain using the RESPIN (REcognizing SPeech in INdian languages) corpus.
28
+
29
+ Performance was benchmarked against the original pre-trained `google/muril-base-cased` model, the multilingual industry standard `xlm-roberta-base`, and specialized monolingual architectures (e.g., L3Cube series). The evaluation demonstrates that the fine-tuned model establishes a new state-of-the-art for low-resource languages such as Chhattisgarhi and Magahi.
30
+
31
+ ## 2. Methodology
32
+
33
+ ### 2.1 Training Corpus
34
+
35
+ The model was fine-tuned utilizing the RESPIN dataset. This corpus comprises text data acquired through extensive web crawling, alongside content derived from newspapers and books processed via Optical Character Recognition (OCR). The heterogeneous nature of these sources necessitated domain-specific adaptation to minimize perplexity (PPL) and enhance model robustness across varied linguistic contexts.
36
+
37
+ ### 2.2 Model Architecture
38
+
39
+ - **Base Architecture:** MuRIL (BERT-based encoder).
40
+ - **Modification:** Fine-tuned on the RESPIN corpus using Masked Language Modeling (MLM) objectives to align the vector space with the target distribution.
41
+
42
+ ### 2.3 Evaluation Baselines
43
+
44
+ To provide a rigorous assessment of the model's efficacy, the following comparative baselines were utilized:
45
+
46
+ - **Base MuRIL (`google/muril-base-cased`):** Used to quantify the net improvement (gain) achieved strictly through the fine-tuning process.
47
+ - **XLM-RoBERTa (`xlm-roberta-base`):** Selected as the high-capacity multilingual baseline to evaluate zero-shot performance on low-resource languages.
48
+ - **Specialist Models (`l3cube-pune`):** Monolingual models, each trained specifically on a single language. These serve as a strong reference point for the performance attainable in major languages.
49
+
50
+ ## 3. Evaluation Datasets
51
+
52
+ Testing was conducted on a diverse suite of held-out evaluation sets to ensure validity. The evaluation suite includes:
53
+
54
+ - **Benchmark Testing (Legal):** A high-complexity dataset comprising legal documents and bail applications (Available for HI, BN, TE, MR).
55
+ - **Samanantar:** A general-domain parallel corpus utilized for translation benchmarks (Available for HI, BN, KN, TE, MR).
56
+ - **RESPIN (Held-Out):** A specific split from the transcription corpus to test retention of the training distribution (Available for all languages).
57
+ - **Rural Women:** A dialect-rich dataset utilized to test robustness in Bhojpuri (BH).
58
+ - **NanoBEIR:** A retrieval benchmark dataset, cleaned for non-Devanagari artifacts, used for Maithili (MT) and Magahi (MAG).
59
+ - **Chhattisgarh TTS:** A transcription dataset used for Chhattisgarhi (CH).
60
+ - **IISc-MILE:** A speech transcription corpus used for Kannada (KN).
61
+
62
+ ## 4. Empirical Results
63
+
64
+ The following table presents the Average Perplexity (PPL) scores across the test files for each language. Perplexity is defined as the exponential of the cross-entropy loss; lower values indicate superior predictive performance.
65
+
66
+ ### Table 1: Comparative Average Perplexity (Lower is Better)
67
+
68
+ | Language Code | Language | Base MuRIL | Fine-Tuned MuRIL (Ours) | XLM-RoBERTa | Specialist Model (L3Cube) |
69
+ | :-: | :-: | :-: | :-: | :-: | :-: |
70
+ | CH | Chhattisgarhi | 764.36 | **21.73** | 169.96 | N/A |
71
+ | MAG | Magahi | 279.29 | **34.35** | 67.77 | N/A |
72
+ | BH | Bhojpuri | 444.35 | **116.41** | 122.09 | N/A |
73
+ | MT | Maithili | 598.79 | 141.09 | **90.83** | N/A |
74
+ | HI | Hindi | 27.58 | 15.86 | **10.29** | N/A |
75
+ | TE | Telugu | 62.98 | 15.41 | 12.07 | **7.43** |
76
+ | MR | Marathi | 85.71 | 25.20 | **17.71** | 19.26 |
77
+ | KN | Kannada | 172.59 | 28.87 | 15.85 | **13.23** |
78
+ | BN | Bengali | 92.62 | 207.80 | **15.96** | 33.43 |
79
+
80
+ ## 5. Analysis and Conclusion
81
+
82
+ ### 5.1 Performance on Low-Resource Languages
83
+
84
+ The most significant observation is the model's performance on extremely low-resource languages (Chhattisgarhi, Magahi, Bhojpuri), where standard multilingual models typically fail.
85
+
86
+ - **Chhattisgarhi (CH):** The Base MuRIL model exhibited a perplexity of 764.36, indicating a lack of comprehension. The Fine-Tuned model reduced this drastically to 21.73, outperforming the much larger XLM-R (169.96) by nearly an order of magnitude (roughly 8×).
87
+ - **Magahi (MAG):** The Fine-Tuned model achieved a perplexity of 34.35, surpassing both the Base MuRIL (279.29) and the XLM-R baseline (67.77).
88
+ - **Bhojpuri (BH):** The model demonstrated superior robustness, achieving a score of 116.41, edging out XLM-R (122.09) and vastly improving upon the Base MuRIL (444.35).
89
+
90
+ This validates the efficacy of the RESPIN dataset for adapting encoders to under-represented Indic dialects.
91
+
92
+ ### 5.2 Performance on Major Languages
93
+
94
+ For widely spoken languages (Hindi, Telugu, Marathi), the fine-tuning process yielded substantial improvements over the Base MuRIL architecture.
95
+
96
+ - **Telugu (TE) & Marathi (MR):** The Fine-Tuned model reduced perplexity by approximately 75% and 70% respectively compared to the Base model. The specialized monolingual L3Cube model remains the leader for Telugu (7.43), as expected for a single-language specialist, while XLM-R achieves the lowest perplexity for Marathi (17.71 vs. 19.26 for L3Cube); in both cases the Fine-Tuned MuRIL is a competitive multilingual alternative.
97
+ - **Bengali (BN):** The model regressed relative to Base MuRIL in average perplexity (92.62 → 207.80). Detailed analysis reveals that while the model performed well on the `respin_bn` split (68.14), it struggled to generalize to the `benchmark_testing` and `spring_inx` datasets.
98
+
99
+ ### 5.3 Summary
100
+
101
+ The Fine-Tuned MuRIL model establishes a new benchmark for Chhattisgarhi and Magahi text encoding, significantly surpassing existing open-source multilingual alternatives.
train_data/muril_bh_domain/checkpoint-4000/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/muril-base-cased
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.13.2
train_data/muril_bh_domain/checkpoint-4000/adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": {
4
+ "base_model_class": "BertForMaskedLM",
5
+ "parent_library": "transformers.models.bert.modeling_bert"
6
+ },
7
+ "base_model_name_or_path": "google/muril-base-cased",
8
+ "bias": "none",
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_dropout": 0.1,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 16,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "query",
27
+ "value"
28
+ ],
29
+ "task_type": null,
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
train_data/muril_bh_domain/checkpoint-4000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:deb73549679fd91090a64a88d75c687dde7b28b08950efa95d45de74872e7203
3
+ size 2366064
train_data/muril_bh_domain/checkpoint-4000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1daa4eb30785be66449e5d62b8e08a5908afcf2aa7986702f353f32bc8b42d3
3
+ size 4759290
train_data/muril_bh_domain/checkpoint-4000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26e24e2231a33bdad5887754a5b6968219ef12961d04369739f1350efee44b72
3
+ size 14244
train_data/muril_bh_domain/checkpoint-4000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edbdfc0163382607dc91e6b9699f3f3a2c3a204482ef6360e91ff42dd0f5b83a
3
+ size 1064
train_data/muril_bh_domain/checkpoint-4000/trainer_state.json ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.530844669408415,
5
+ "eval_steps": 1000,
6
+ "global_step": 4000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06327111673521038,
13
+ "grad_norm": 4.5707688331604,
14
+ "learning_rate": 1.0548523206751056e-05,
15
+ "loss": 6.0536,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.12654223347042076,
20
+ "grad_norm": 7.803924083709717,
21
+ "learning_rate": 2.1097046413502112e-05,
22
+ "loss": 5.8177,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.18981335020563114,
27
+ "grad_norm": 8.050126075744629,
28
+ "learning_rate": 3.1645569620253167e-05,
29
+ "loss": 5.0311,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.2530844669408415,
34
+ "grad_norm": 8.803975105285645,
35
+ "learning_rate": 4.2194092827004224e-05,
36
+ "loss": 4.7351,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.3163555836760519,
41
+ "grad_norm": 12.846040725708008,
42
+ "learning_rate": 4.96952648851383e-05,
43
+ "loss": 4.5453,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.3796267004112623,
48
+ "grad_norm": 15.128095626831055,
49
+ "learning_rate": 4.852320675105486e-05,
50
+ "loss": 4.3979,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.44289781714647264,
55
+ "grad_norm": 10.33956527709961,
56
+ "learning_rate": 4.7351148616971405e-05,
57
+ "loss": 4.4056,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.506168933881683,
62
+ "grad_norm": 23.287179946899414,
63
+ "learning_rate": 4.617909048288795e-05,
64
+ "loss": 4.2787,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.5694400506168934,
69
+ "grad_norm": 12.955565452575684,
70
+ "learning_rate": 4.50070323488045e-05,
71
+ "loss": 4.3504,
72
+ "step": 900
73
+ },
74
+ {
75
+ "epoch": 0.6327111673521038,
76
+ "grad_norm": 10.07479476928711,
77
+ "learning_rate": 4.3834974214721055e-05,
78
+ "loss": 4.2521,
79
+ "step": 1000
80
+ },
81
+ {
82
+ "epoch": 0.6327111673521038,
83
+ "eval_runtime": 33.0567,
84
+ "eval_samples_per_second": 95.593,
85
+ "eval_steps_per_second": 11.949,
86
+ "step": 1000
87
+ },
88
+ {
89
+ "epoch": 0.6959822840873141,
90
+ "grad_norm": 12.32780647277832,
91
+ "learning_rate": 4.26629160806376e-05,
92
+ "loss": 4.2314,
93
+ "step": 1100
94
+ },
95
+ {
96
+ "epoch": 0.7592534008225246,
97
+ "grad_norm": 15.515801429748535,
98
+ "learning_rate": 4.149085794655415e-05,
99
+ "loss": 4.3058,
100
+ "step": 1200
101
+ },
102
+ {
103
+ "epoch": 0.8225245175577349,
104
+ "grad_norm": 12.472834587097168,
105
+ "learning_rate": 4.03187998124707e-05,
106
+ "loss": 4.2158,
107
+ "step": 1300
108
+ },
109
+ {
110
+ "epoch": 0.8857956342929453,
111
+ "grad_norm": 16.903112411499023,
112
+ "learning_rate": 3.914674167838725e-05,
113
+ "loss": 4.142,
114
+ "step": 1400
115
+ },
116
+ {
117
+ "epoch": 0.9490667510281556,
118
+ "grad_norm": 13.487791061401367,
119
+ "learning_rate": 3.79746835443038e-05,
120
+ "loss": 4.0534,
121
+ "step": 1500
122
+ },
123
+ {
124
+ "epoch": 1.012337867763366,
125
+ "grad_norm": 14.721494674682617,
126
+ "learning_rate": 3.680262541022035e-05,
127
+ "loss": 4.1405,
128
+ "step": 1600
129
+ },
130
+ {
131
+ "epoch": 1.0756089844985763,
132
+ "grad_norm": 16.011690139770508,
133
+ "learning_rate": 3.56305672761369e-05,
134
+ "loss": 4.1192,
135
+ "step": 1700
136
+ },
137
+ {
138
+ "epoch": 1.1388801012337868,
139
+ "grad_norm": 15.692846298217773,
140
+ "learning_rate": 3.445850914205345e-05,
141
+ "loss": 4.1407,
142
+ "step": 1800
143
+ },
144
+ {
145
+ "epoch": 1.2021512179689973,
146
+ "grad_norm": 13.71811294555664,
147
+ "learning_rate": 3.328645100797e-05,
148
+ "loss": 4.1839,
149
+ "step": 1900
150
+ },
151
+ {
152
+ "epoch": 1.2654223347042075,
153
+ "grad_norm": 13.474448204040527,
154
+ "learning_rate": 3.2114392873886545e-05,
155
+ "loss": 4.0813,
156
+ "step": 2000
157
+ },
158
+ {
159
+ "epoch": 1.2654223347042075,
160
+ "eval_runtime": 33.0889,
161
+ "eval_samples_per_second": 95.5,
162
+ "eval_steps_per_second": 11.938,
163
+ "step": 2000
164
+ },
165
+ {
166
+ "epoch": 1.328693451439418,
167
+ "grad_norm": 15.276654243469238,
168
+ "learning_rate": 3.09423347398031e-05,
169
+ "loss": 3.9624,
170
+ "step": 2100
171
+ },
172
+ {
173
+ "epoch": 1.3919645681746282,
174
+ "grad_norm": 13.796238899230957,
175
+ "learning_rate": 2.9770276605719643e-05,
176
+ "loss": 4.0653,
177
+ "step": 2200
178
+ },
179
+ {
180
+ "epoch": 1.4552356849098387,
181
+ "grad_norm": 16.452594757080078,
182
+ "learning_rate": 2.8598218471636194e-05,
183
+ "loss": 4.0338,
184
+ "step": 2300
185
+ },
186
+ {
187
+ "epoch": 1.518506801645049,
188
+ "grad_norm": 20.27753257751465,
189
+ "learning_rate": 2.7426160337552742e-05,
190
+ "loss": 4.0962,
191
+ "step": 2400
192
+ },
193
+ {
194
+ "epoch": 1.5817779183802594,
195
+ "grad_norm": 15.492554664611816,
196
+ "learning_rate": 2.6254102203469293e-05,
197
+ "loss": 4.057,
198
+ "step": 2500
199
+ },
200
+ {
201
+ "epoch": 1.6450490351154698,
202
+ "grad_norm": 15.298516273498535,
203
+ "learning_rate": 2.508204406938584e-05,
204
+ "loss": 4.0301,
205
+ "step": 2600
206
+ },
207
+ {
208
+ "epoch": 1.70832015185068,
209
+ "grad_norm": 15.670785903930664,
210
+ "learning_rate": 2.3909985935302392e-05,
211
+ "loss": 4.0129,
212
+ "step": 2700
213
+ },
214
+ {
215
+ "epoch": 1.7715912685858906,
216
+ "grad_norm": 18.541555404663086,
217
+ "learning_rate": 2.2737927801218943e-05,
218
+ "loss": 3.9724,
219
+ "step": 2800
220
+ },
221
+ {
222
+ "epoch": 1.834862385321101,
223
+ "grad_norm": 19.13411521911621,
224
+ "learning_rate": 2.156586966713549e-05,
225
+ "loss": 4.0044,
226
+ "step": 2900
227
+ },
228
+ {
229
+ "epoch": 1.8981335020563113,
230
+ "grad_norm": 14.532624244689941,
231
+ "learning_rate": 2.039381153305204e-05,
232
+ "loss": 3.9882,
233
+ "step": 3000
234
+ },
235
+ {
236
+ "epoch": 1.8981335020563113,
237
+ "eval_runtime": 33.1181,
238
+ "eval_samples_per_second": 95.416,
239
+ "eval_steps_per_second": 11.927,
240
+ "step": 3000
241
+ },
242
+ {
243
+ "epoch": 1.9614046187915217,
244
+ "grad_norm": 15.767202377319336,
245
+ "learning_rate": 1.922175339896859e-05,
246
+ "loss": 3.9372,
247
+ "step": 3100
248
+ },
249
+ {
250
+ "epoch": 2.024675735526732,
251
+ "grad_norm": 17.210546493530273,
252
+ "learning_rate": 1.804969526488514e-05,
253
+ "loss": 3.9757,
254
+ "step": 3200
255
+ },
256
+ {
257
+ "epoch": 2.0879468522619424,
258
+ "grad_norm": 15.209254264831543,
259
+ "learning_rate": 1.6877637130801688e-05,
260
+ "loss": 3.9668,
261
+ "step": 3300
262
+ },
263
+ {
264
+ "epoch": 2.1512179689971527,
265
+ "grad_norm": 16.821176528930664,
266
+ "learning_rate": 1.570557899671824e-05,
267
+ "loss": 3.9732,
268
+ "step": 3400
269
+ },
270
+ {
271
+ "epoch": 2.2144890857323634,
272
+ "grad_norm": 15.914960861206055,
273
+ "learning_rate": 1.4533520862634786e-05,
274
+ "loss": 3.9375,
275
+ "step": 3500
276
+ },
277
+ {
278
+ "epoch": 2.2777602024675736,
279
+ "grad_norm": 16.489627838134766,
280
+ "learning_rate": 1.3361462728551336e-05,
281
+ "loss": 3.9993,
282
+ "step": 3600
283
+ },
284
+ {
285
+ "epoch": 2.341031319202784,
286
+ "grad_norm": 17.943021774291992,
287
+ "learning_rate": 1.2189404594467887e-05,
288
+ "loss": 3.9889,
289
+ "step": 3700
290
+ },
291
+ {
292
+ "epoch": 2.4043024359379945,
293
+ "grad_norm": 14.150239944458008,
294
+ "learning_rate": 1.1017346460384436e-05,
295
+ "loss": 4.0,
296
+ "step": 3800
297
+ },
298
+ {
299
+ "epoch": 2.4675735526732048,
300
+ "grad_norm": 15.843707084655762,
301
+ "learning_rate": 9.845288326300985e-06,
302
+ "loss": 3.9527,
303
+ "step": 3900
304
+ },
305
+ {
306
+ "epoch": 2.530844669408415,
307
+ "grad_norm": 16.922142028808594,
308
+ "learning_rate": 8.673230192217533e-06,
309
+ "loss": 3.8801,
310
+ "step": 4000
311
+ },
312
+ {
313
+ "epoch": 2.530844669408415,
314
+ "eval_runtime": 33.0326,
315
+ "eval_samples_per_second": 95.663,
316
+ "eval_steps_per_second": 11.958,
317
+ "step": 4000
318
+ }
319
+ ],
320
+ "logging_steps": 100,
321
+ "max_steps": 4740,
322
+ "num_input_tokens_seen": 0,
323
+ "num_train_epochs": 3,
324
+ "save_steps": 1000,
325
+ "stateful_callbacks": {
326
+ "TrainerControl": {
327
+ "args": {
328
+ "should_epoch_stop": false,
329
+ "should_evaluate": false,
330
+ "should_log": false,
331
+ "should_save": true,
332
+ "should_training_stop": false
333
+ },
334
+ "attributes": {}
335
+ }
336
+ },
337
+ "total_flos": 1700301256769430.0,
338
+ "train_batch_size": 8,
339
+ "trial_name": null,
340
+ "trial_params": null
341
+ }
train_data/muril_bh_domain/checkpoint-4000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a22c26ec129497c28db8c0928a06ac51978ce151cffa0ea84f44917313aa4fe
3
+ size 5304
train_data/muril_bh_domain/checkpoint-4740/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/muril-base-cased
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.13.2
train_data/muril_bh_domain/checkpoint-4740/adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": {
4
+ "base_model_class": "BertForMaskedLM",
5
+ "parent_library": "transformers.models.bert.modeling_bert"
6
+ },
7
+ "base_model_name_or_path": "google/muril-base-cased",
8
+ "bias": "none",
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_dropout": 0.1,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 16,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "query",
27
+ "value"
28
+ ],
29
+ "task_type": null,
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
train_data/muril_bh_domain/checkpoint-4740/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc7f62c24737614448fd04808465491ac77f32241a0a2a9cab226b74deb95ad2
3
+ size 2366064
train_data/muril_bh_domain/checkpoint-4740/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e80f2c68afd84173016108d5470b712a09e93d00829013e7b7624072079df805
3
+ size 4759290
train_data/muril_bh_domain/checkpoint-4740/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c9afe1aba4f99f67ff47a476d35ef83b49ca75cc2da61962433d26d445986b7
3
+ size 14244
train_data/muril_bh_domain/checkpoint-4740/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45472c9e15a883df2692bf1340ef4c87defa24d90bfd8aefefaaecfb0aac3aa4
3
+ size 1064
train_data/muril_bh_domain/checkpoint-4740/trainer_state.json ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.999050933248972,
5
+ "eval_steps": 1000,
6
+ "global_step": 4740,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06327111673521038,
13
+ "grad_norm": 4.5707688331604,
14
+ "learning_rate": 1.0548523206751056e-05,
15
+ "loss": 6.0536,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.12654223347042076,
20
+ "grad_norm": 7.803924083709717,
21
+ "learning_rate": 2.1097046413502112e-05,
22
+ "loss": 5.8177,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.18981335020563114,
27
+ "grad_norm": 8.050126075744629,
28
+ "learning_rate": 3.1645569620253167e-05,
29
+ "loss": 5.0311,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.2530844669408415,
34
+ "grad_norm": 8.803975105285645,
35
+ "learning_rate": 4.2194092827004224e-05,
36
+ "loss": 4.7351,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.3163555836760519,
41
+ "grad_norm": 12.846040725708008,
42
+ "learning_rate": 4.96952648851383e-05,
43
+ "loss": 4.5453,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.3796267004112623,
48
+ "grad_norm": 15.128095626831055,
49
+ "learning_rate": 4.852320675105486e-05,
50
+ "loss": 4.3979,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.44289781714647264,
55
+ "grad_norm": 10.33956527709961,
56
+ "learning_rate": 4.7351148616971405e-05,
57
+ "loss": 4.4056,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.506168933881683,
62
+ "grad_norm": 23.287179946899414,
63
+ "learning_rate": 4.617909048288795e-05,
64
+ "loss": 4.2787,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.5694400506168934,
69
+ "grad_norm": 12.955565452575684,
70
+ "learning_rate": 4.50070323488045e-05,
71
+ "loss": 4.3504,
72
+ "step": 900
73
+ },
74
+ {
75
+ "epoch": 0.6327111673521038,
76
+ "grad_norm": 10.07479476928711,
77
+ "learning_rate": 4.3834974214721055e-05,
78
+ "loss": 4.2521,
79
+ "step": 1000
80
+ },
81
+ {
82
+ "epoch": 0.6327111673521038,
83
+ "eval_runtime": 33.0567,
84
+ "eval_samples_per_second": 95.593,
85
+ "eval_steps_per_second": 11.949,
86
+ "step": 1000
87
+ },
88
+ {
89
+ "epoch": 0.6959822840873141,
90
+ "grad_norm": 12.32780647277832,
91
+ "learning_rate": 4.26629160806376e-05,
92
+ "loss": 4.2314,
93
+ "step": 1100
94
+ },
95
+ {
96
+ "epoch": 0.7592534008225246,
97
+ "grad_norm": 15.515801429748535,
98
+ "learning_rate": 4.149085794655415e-05,
99
+ "loss": 4.3058,
100
+ "step": 1200
101
+ },
102
+ {
103
+ "epoch": 0.8225245175577349,
104
+ "grad_norm": 12.472834587097168,
105
+ "learning_rate": 4.03187998124707e-05,
106
+ "loss": 4.2158,
107
+ "step": 1300
108
+ },
109
+ {
110
+ "epoch": 0.8857956342929453,
111
+ "grad_norm": 16.903112411499023,
112
+ "learning_rate": 3.914674167838725e-05,
113
+ "loss": 4.142,
114
+ "step": 1400
115
+ },
116
+ {
117
+ "epoch": 0.9490667510281556,
118
+ "grad_norm": 13.487791061401367,
119
+ "learning_rate": 3.79746835443038e-05,
120
+ "loss": 4.0534,
121
+ "step": 1500
122
+ },
123
+ {
124
+ "epoch": 1.012337867763366,
125
+ "grad_norm": 14.721494674682617,
126
+ "learning_rate": 3.680262541022035e-05,
127
+ "loss": 4.1405,
128
+ "step": 1600
129
+ },
130
+ {
131
+ "epoch": 1.0756089844985763,
132
+ "grad_norm": 16.011690139770508,
133
+ "learning_rate": 3.56305672761369e-05,
134
+ "loss": 4.1192,
135
+ "step": 1700
136
+ },
137
+ {
138
+ "epoch": 1.1388801012337868,
139
+ "grad_norm": 15.692846298217773,
140
+ "learning_rate": 3.445850914205345e-05,
141
+ "loss": 4.1407,
142
+ "step": 1800
143
+ },
144
+ {
145
+ "epoch": 1.2021512179689973,
146
+ "grad_norm": 13.71811294555664,
147
+ "learning_rate": 3.328645100797e-05,
148
+ "loss": 4.1839,
149
+ "step": 1900
150
+ },
151
+ {
152
+ "epoch": 1.2654223347042075,
153
+ "grad_norm": 13.474448204040527,
154
+ "learning_rate": 3.2114392873886545e-05,
155
+ "loss": 4.0813,
156
+ "step": 2000
157
+ },
158
+ {
159
+ "epoch": 1.2654223347042075,
160
+ "eval_runtime": 33.0889,
161
+ "eval_samples_per_second": 95.5,
162
+ "eval_steps_per_second": 11.938,
163
+ "step": 2000
164
+ },
165
+ {
166
+ "epoch": 1.328693451439418,
167
+ "grad_norm": 15.276654243469238,
168
+ "learning_rate": 3.09423347398031e-05,
169
+ "loss": 3.9624,
170
+ "step": 2100
171
+ },
172
+ {
173
+ "epoch": 1.3919645681746282,
174
+ "grad_norm": 13.796238899230957,
175
+ "learning_rate": 2.9770276605719643e-05,
176
+ "loss": 4.0653,
177
+ "step": 2200
178
+ },
179
+ {
180
+ "epoch": 1.4552356849098387,
181
+ "grad_norm": 16.452594757080078,
182
+ "learning_rate": 2.8598218471636194e-05,
183
+ "loss": 4.0338,
184
+ "step": 2300
185
+ },
186
+ {
187
+ "epoch": 1.518506801645049,
188
+ "grad_norm": 20.27753257751465,
189
+ "learning_rate": 2.7426160337552742e-05,
190
+ "loss": 4.0962,
191
+ "step": 2400
192
+ },
193
+ {
194
+ "epoch": 1.5817779183802594,
195
+ "grad_norm": 15.492554664611816,
196
+ "learning_rate": 2.6254102203469293e-05,
197
+ "loss": 4.057,
198
+ "step": 2500
199
+ },
200
+ {
201
+ "epoch": 1.6450490351154698,
202
+ "grad_norm": 15.298516273498535,
203
+ "learning_rate": 2.508204406938584e-05,
204
+ "loss": 4.0301,
205
+ "step": 2600
206
+ },
207
+ {
208
+ "epoch": 1.70832015185068,
209
+ "grad_norm": 15.670785903930664,
210
+ "learning_rate": 2.3909985935302392e-05,
211
+ "loss": 4.0129,
212
+ "step": 2700
213
+ },
214
+ {
215
+ "epoch": 1.7715912685858906,
216
+ "grad_norm": 18.541555404663086,
217
+ "learning_rate": 2.2737927801218943e-05,
218
+ "loss": 3.9724,
219
+ "step": 2800
220
+ },
221
+ {
222
+ "epoch": 1.834862385321101,
223
+ "grad_norm": 19.13411521911621,
224
+ "learning_rate": 2.156586966713549e-05,
225
+ "loss": 4.0044,
226
+ "step": 2900
227
+ },
228
+ {
229
+ "epoch": 1.8981335020563113,
230
+ "grad_norm": 14.532624244689941,
231
+ "learning_rate": 2.039381153305204e-05,
232
+ "loss": 3.9882,
233
+ "step": 3000
234
+ },
235
+ {
236
+ "epoch": 1.8981335020563113,
237
+ "eval_runtime": 33.1181,
238
+ "eval_samples_per_second": 95.416,
239
+ "eval_steps_per_second": 11.927,
240
+ "step": 3000
241
+ },
242
+ {
243
+ "epoch": 1.9614046187915217,
244
+ "grad_norm": 15.767202377319336,
245
+ "learning_rate": 1.922175339896859e-05,
246
+ "loss": 3.9372,
247
+ "step": 3100
248
+ },
249
+ {
250
+ "epoch": 2.024675735526732,
251
+ "grad_norm": 17.210546493530273,
252
+ "learning_rate": 1.804969526488514e-05,
253
+ "loss": 3.9757,
254
+ "step": 3200
255
+ },
256
+ {
257
+ "epoch": 2.0879468522619424,
258
+ "grad_norm": 15.209254264831543,
259
+ "learning_rate": 1.6877637130801688e-05,
260
+ "loss": 3.9668,
261
+ "step": 3300
262
+ },
263
+ {
264
+ "epoch": 2.1512179689971527,
265
+ "grad_norm": 16.821176528930664,
266
+ "learning_rate": 1.570557899671824e-05,
267
+ "loss": 3.9732,
268
+ "step": 3400
269
+ },
270
+ {
271
+ "epoch": 2.2144890857323634,
272
+ "grad_norm": 15.914960861206055,
273
+ "learning_rate": 1.4533520862634786e-05,
274
+ "loss": 3.9375,
275
+ "step": 3500
276
+ },
277
+ {
278
+ "epoch": 2.2777602024675736,
279
+ "grad_norm": 16.489627838134766,
280
+ "learning_rate": 1.3361462728551336e-05,
281
+ "loss": 3.9993,
282
+ "step": 3600
283
+ },
284
+ {
285
+ "epoch": 2.341031319202784,
286
+ "grad_norm": 17.943021774291992,
287
+ "learning_rate": 1.2189404594467887e-05,
288
+ "loss": 3.9889,
289
+ "step": 3700
290
+ },
291
+ {
292
+ "epoch": 2.4043024359379945,
293
+ "grad_norm": 14.150239944458008,
294
+ "learning_rate": 1.1017346460384436e-05,
295
+ "loss": 4.0,
296
+ "step": 3800
297
+ },
298
+ {
299
+ "epoch": 2.4675735526732048,
300
+ "grad_norm": 15.843707084655762,
301
+ "learning_rate": 9.845288326300985e-06,
302
+ "loss": 3.9527,
303
+ "step": 3900
304
+ },
305
+ {
306
+ "epoch": 2.530844669408415,
307
+ "grad_norm": 16.922142028808594,
308
+ "learning_rate": 8.673230192217533e-06,
309
+ "loss": 3.8801,
310
+ "step": 4000
311
+ },
312
+ {
313
+ "epoch": 2.530844669408415,
314
+ "eval_runtime": 33.0326,
315
+ "eval_samples_per_second": 95.663,
316
+ "eval_steps_per_second": 11.958,
317
+ "step": 4000
318
+ },
319
+ {
320
+ "epoch": 2.5941157861436253,
321
+ "grad_norm": 19.046831130981445,
322
+ "learning_rate": 7.501172058134085e-06,
323
+ "loss": 3.9705,
324
+ "step": 4100
325
+ },
326
+ {
327
+ "epoch": 2.657386902878836,
328
+ "grad_norm": 15.516547203063965,
329
+ "learning_rate": 6.329113924050633e-06,
330
+ "loss": 3.9296,
331
+ "step": 4200
332
+ },
333
+ {
334
+ "epoch": 2.720658019614046,
335
+ "grad_norm": 16.0513916015625,
336
+ "learning_rate": 5.157055789967183e-06,
337
+ "loss": 4.0594,
338
+ "step": 4300
339
+ },
340
+ {
341
+ "epoch": 2.7839291363492564,
342
+ "grad_norm": 13.820748329162598,
343
+ "learning_rate": 3.984997655883732e-06,
344
+ "loss": 3.9353,
345
+ "step": 4400
346
+ },
347
+ {
348
+ "epoch": 2.847200253084467,
349
+ "grad_norm": 14.607758522033691,
350
+ "learning_rate": 2.8129395218002813e-06,
351
+ "loss": 4.0605,
352
+ "step": 4500
353
+ },
354
+ {
355
+ "epoch": 2.9104713698196774,
356
+ "grad_norm": 18.928266525268555,
357
+ "learning_rate": 1.6408813877168308e-06,
358
+ "loss": 3.9663,
359
+ "step": 4600
360
+ },
361
+ {
362
+ "epoch": 2.9737424865548876,
363
+ "grad_norm": 17.247835159301758,
364
+ "learning_rate": 4.688232536333803e-07,
365
+ "loss": 3.9057,
366
+ "step": 4700
367
+ }
368
+ ],
369
+ "logging_steps": 100,
370
+ "max_steps": 4740,
371
+ "num_input_tokens_seen": 0,
372
+ "num_train_epochs": 3,
373
+ "save_steps": 1000,
374
+ "stateful_callbacks": {
375
+ "TrainerControl": {
376
+ "args": {
377
+ "should_epoch_stop": false,
378
+ "should_evaluate": false,
379
+ "should_log": false,
380
+ "should_save": true,
381
+ "should_training_stop": true
382
+ },
383
+ "attributes": {}
384
+ }
385
+ },
386
+ "total_flos": 2016724755342822.0,
387
+ "train_batch_size": 8,
388
+ "trial_name": null,
389
+ "trial_params": null
390
+ }
train_data/muril_bh_domain/checkpoint-4740/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a22c26ec129497c28db8c0928a06ac51978ce151cffa0ea84f44917313aa4fe
3
+ size 5304
train_data/muril_bh_domain/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/muril-base-cased",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "embedding_size": 768,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.46.3",
23
+ "type_vocab_size": 2,
24
+ "use_cache": true,
25
+ "vocab_size": 197285
26
+ }
train_data/muril_bh_domain/generation_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "pad_token_id": 0,
4
+ "transformers_version": "4.46.3"
5
+ }
train_data/muril_bh_domain/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df8f3e88e0c7d9391d346c9abb7e6b7b0729bf480dc131d713e7822db8e5b2a1
3
+ size 951043900
train_data/muril_bh_domain/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
train_data/muril_bh_domain/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
train_data/muril_bh_domain/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "103": {
20
+ "content": "[MASK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "104": {
28
+ "content": "[CLS]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "105": {
36
+ "content": "[SEP]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "lowercase": false,
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 512,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": false,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
train_data/muril_bh_domain/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
train_data/muril_bn_domain/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/muril-base-cased",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "embedding_size": 768,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.46.3",
23
+ "type_vocab_size": 2,
24
+ "use_cache": true,
25
+ "vocab_size": 197285
26
+ }
train_data/muril_bn_domain/generation_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "pad_token_id": 0,
4
+ "transformers_version": "4.46.3"
5
+ }
train_data/muril_bn_domain/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec9f38111bd7b18e2536f6a68716b77d056876facaf70df2e15d69a6e775c40b
3
+ size 951043900
train_data/muril_bn_domain/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
train_data/muril_bn_domain/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
train_data/muril_bn_domain/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "103": {
20
+ "content": "[MASK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "104": {
28
+ "content": "[CLS]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "105": {
36
+ "content": "[SEP]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "lowercase": false,
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 512,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": false,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
train_data/muril_bn_domain/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7045f6bb9f738a9d8ede00a3453edc500b8eb579edea5aaeff971f5a4baffd29
3
+ size 5304
train_data/muril_bn_domain/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
train_data/muril_ch_domain/checkpoint-30500/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/muril-base-cased
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.13.2
train_data/muril_ch_domain/checkpoint-30500/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": {
4
+ "base_model_class": "BertForMaskedLM",
5
+ "parent_library": "transformers.models.bert.modeling_bert"
6
+ },
7
+ "base_model_name_or_path": "google/muril-base-cased",
8
+ "bias": "none",
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 128,
17
+ "lora_dropout": 0.1,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 64,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "key",
27
+ "query",
28
+ "value",
29
+ "dense"
30
+ ],
31
+ "task_type": null,
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
train_data/muril_ch_domain/checkpoint-30500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a355714bc7655686649d935524ea8155c31088d897e38f8e91446e8d3699105
3
+ size 42881168
train_data/muril_ch_domain/checkpoint-30500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e53fc7022d7bc695ba1e3ff7fdd1b3368c75e2346cdc9c402d6558e85d67284b
3
+ size 85843898
train_data/muril_ch_domain/checkpoint-30500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7dcba7c160327f0ad67913f451ea37b39145ec96b984b91f6a4a72b0a5056736
3
+ size 14244
train_data/muril_ch_domain/checkpoint-30500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e37c8f6a8cf0a15e09cca408bc24278664576b9301e4288b0b47f7768d94d82b
3
+ size 1064
train_data/muril_ch_domain/checkpoint-30500/trainer_state.json ADDED
@@ -0,0 +1,2595 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.9839064716528885,
5
+ "eval_steps": 500,
6
+ "global_step": 30500,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009783299907058651,
13
+ "grad_norm": 33.780609130859375,
14
+ "learning_rate": 1.6302575806977503e-06,
15
+ "loss": 6.574,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.019566599814117302,
20
+ "grad_norm": 33.439022064208984,
21
+ "learning_rate": 3.2605151613955006e-06,
22
+ "loss": 6.1653,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.02934989972117595,
27
+ "grad_norm": 27.25751304626465,
28
+ "learning_rate": 4.890772742093251e-06,
29
+ "loss": 5.5515,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.039133199628234604,
34
+ "grad_norm": 38.36979675292969,
35
+ "learning_rate": 6.521030322791001e-06,
36
+ "loss": 5.0531,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.048916499535293256,
41
+ "grad_norm": 29.350488662719727,
42
+ "learning_rate": 8.15128790348875e-06,
43
+ "loss": 4.9225,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.048916499535293256,
48
+ "eval_runtime": 181.5812,
49
+ "eval_samples_per_second": 112.578,
50
+ "eval_steps_per_second": 14.076,
51
+ "step": 500
52
+ },
53
+ {
54
+ "epoch": 0.0586997994423519,
55
+ "grad_norm": 33.02122497558594,
56
+ "learning_rate": 9.781545484186502e-06,
57
+ "loss": 4.8186,
58
+ "step": 600
59
+ },
60
+ {
61
+ "epoch": 0.06848309934941056,
62
+ "grad_norm": 42.41593933105469,
63
+ "learning_rate": 1.1411803064884251e-05,
64
+ "loss": 4.5769,
65
+ "step": 700
66
+ },
67
+ {
68
+ "epoch": 0.07826639925646921,
69
+ "grad_norm": 40.29044723510742,
70
+ "learning_rate": 1.3042060645582003e-05,
71
+ "loss": 4.3963,
72
+ "step": 800
73
+ },
74
+ {
75
+ "epoch": 0.08804969916352785,
76
+ "grad_norm": 38.0811653137207,
77
+ "learning_rate": 1.4672318226279752e-05,
78
+ "loss": 4.3393,
79
+ "step": 900
80
+ },
81
+ {
82
+ "epoch": 0.09783299907058651,
83
+ "grad_norm": 36.08370590209961,
84
+ "learning_rate": 1.63025758069775e-05,
85
+ "loss": 4.2421,
86
+ "step": 1000
87
+ },
88
+ {
89
+ "epoch": 0.09783299907058651,
90
+ "eval_runtime": 181.886,
91
+ "eval_samples_per_second": 112.389,
92
+ "eval_steps_per_second": 14.053,
93
+ "step": 1000
94
+ },
95
+ {
96
+ "epoch": 0.10761629897764516,
97
+ "grad_norm": 37.253684997558594,
98
+ "learning_rate": 1.7932833387675256e-05,
99
+ "loss": 4.1156,
100
+ "step": 1100
101
+ },
102
+ {
103
+ "epoch": 0.1173995988847038,
104
+ "grad_norm": 33.003475189208984,
105
+ "learning_rate": 1.9563090968373004e-05,
106
+ "loss": 4.0112,
107
+ "step": 1200
108
+ },
109
+ {
110
+ "epoch": 0.12718289879176245,
111
+ "grad_norm": 30.727867126464844,
112
+ "learning_rate": 2.1193348549070755e-05,
113
+ "loss": 3.9969,
114
+ "step": 1300
115
+ },
116
+ {
117
+ "epoch": 0.13696619869882112,
118
+ "grad_norm": 37.471092224121094,
119
+ "learning_rate": 2.2823606129768503e-05,
120
+ "loss": 3.874,
121
+ "step": 1400
122
+ },
123
+ {
124
+ "epoch": 0.14674949860587977,
125
+ "grad_norm": 42.32167434692383,
126
+ "learning_rate": 2.4453863710466254e-05,
127
+ "loss": 3.8518,
128
+ "step": 1500
129
+ },
130
+ {
131
+ "epoch": 0.14674949860587977,
132
+ "eval_runtime": 181.9332,
133
+ "eval_samples_per_second": 112.36,
134
+ "eval_steps_per_second": 14.049,
135
+ "step": 1500
136
+ },
137
+ {
138
+ "epoch": 0.15653279851293841,
139
+ "grad_norm": 38.00124740600586,
140
+ "learning_rate": 2.6084121291164005e-05,
141
+ "loss": 3.918,
142
+ "step": 1600
143
+ },
144
+ {
145
+ "epoch": 0.16631609841999706,
146
+ "grad_norm": 44.637386322021484,
147
+ "learning_rate": 2.7714378871861756e-05,
148
+ "loss": 3.9134,
149
+ "step": 1700
150
+ },
151
+ {
152
+ "epoch": 0.1760993983270557,
153
+ "grad_norm": 49.578609466552734,
154
+ "learning_rate": 2.9344636452559504e-05,
155
+ "loss": 3.7507,
156
+ "step": 1800
157
+ },
158
+ {
159
+ "epoch": 0.18588269823411438,
160
+ "grad_norm": 36.65715789794922,
161
+ "learning_rate": 3.0974894033257255e-05,
162
+ "loss": 3.7551,
163
+ "step": 1900
164
+ },
165
+ {
166
+ "epoch": 0.19566599814117303,
167
+ "grad_norm": 36.873443603515625,
168
+ "learning_rate": 3.2605151613955e-05,
169
+ "loss": 3.6951,
170
+ "step": 2000
171
+ },
172
+ {
173
+ "epoch": 0.19566599814117303,
174
+ "eval_runtime": 181.8273,
175
+ "eval_samples_per_second": 112.425,
176
+ "eval_steps_per_second": 14.057,
177
+ "step": 2000
178
+ },
179
+ {
180
+ "epoch": 0.20544929804823167,
181
+ "grad_norm": 33.025413513183594,
182
+ "learning_rate": 3.423540919465276e-05,
183
+ "loss": 3.6603,
184
+ "step": 2100
185
+ },
186
+ {
187
+ "epoch": 0.21523259795529032,
188
+ "grad_norm": 30.105051040649414,
189
+ "learning_rate": 3.586566677535051e-05,
190
+ "loss": 3.525,
191
+ "step": 2200
192
+ },
193
+ {
194
+ "epoch": 0.22501589786234896,
195
+ "grad_norm": 34.5129280090332,
196
+ "learning_rate": 3.749592435604825e-05,
197
+ "loss": 3.6454,
198
+ "step": 2300
199
+ },
200
+ {
201
+ "epoch": 0.2347991977694076,
202
+ "grad_norm": 33.16934585571289,
203
+ "learning_rate": 3.912618193674601e-05,
204
+ "loss": 3.6356,
205
+ "step": 2400
206
+ },
207
+ {
208
+ "epoch": 0.24458249767646628,
209
+ "grad_norm": 33.5789794921875,
210
+ "learning_rate": 4.0756439517443756e-05,
211
+ "loss": 3.5605,
212
+ "step": 2500
213
+ },
214
+ {
215
+ "epoch": 0.24458249767646628,
216
+ "eval_runtime": 181.7254,
217
+ "eval_samples_per_second": 112.488,
218
+ "eval_steps_per_second": 14.065,
219
+ "step": 2500
220
+ },
221
+ {
222
+ "epoch": 0.2543657975835249,
223
+ "grad_norm": 34.30876159667969,
224
+ "learning_rate": 4.238669709814151e-05,
225
+ "loss": 3.5447,
226
+ "step": 2600
227
+ },
228
+ {
229
+ "epoch": 0.2641490974905836,
230
+ "grad_norm": 29.907989501953125,
231
+ "learning_rate": 4.401695467883926e-05,
232
+ "loss": 3.5116,
233
+ "step": 2700
234
+ },
235
+ {
236
+ "epoch": 0.27393239739764225,
237
+ "grad_norm": 34.08231735229492,
238
+ "learning_rate": 4.5647212259537006e-05,
239
+ "loss": 3.4941,
240
+ "step": 2800
241
+ },
242
+ {
243
+ "epoch": 0.28371569730470086,
244
+ "grad_norm": 25.034149169921875,
245
+ "learning_rate": 4.727746984023476e-05,
246
+ "loss": 3.4863,
247
+ "step": 2900
248
+ },
249
+ {
250
+ "epoch": 0.29349899721175954,
251
+ "grad_norm": 32.21685028076172,
252
+ "learning_rate": 4.890772742093251e-05,
253
+ "loss": 3.5096,
254
+ "step": 3000
255
+ },
256
+ {
257
+ "epoch": 0.29349899721175954,
258
+ "eval_runtime": 181.6612,
259
+ "eval_samples_per_second": 112.528,
260
+ "eval_steps_per_second": 14.07,
261
+ "step": 3000
262
+ },
263
+ {
264
+ "epoch": 0.30328229711881816,
265
+ "grad_norm": 24.290380477905273,
266
+ "learning_rate": 4.9940208725902305e-05,
267
+ "loss": 3.3867,
268
+ "step": 3100
269
+ },
270
+ {
271
+ "epoch": 0.31306559702587683,
272
+ "grad_norm": 22.924575805664062,
273
+ "learning_rate": 4.975902304681838e-05,
274
+ "loss": 3.398,
275
+ "step": 3200
276
+ },
277
+ {
278
+ "epoch": 0.3228488969329355,
279
+ "grad_norm": 19.540430068969727,
280
+ "learning_rate": 4.957783736773446e-05,
281
+ "loss": 3.3727,
282
+ "step": 3300
283
+ },
284
+ {
285
+ "epoch": 0.3326321968399941,
286
+ "grad_norm": 22.529376983642578,
287
+ "learning_rate": 4.939665168865053e-05,
288
+ "loss": 3.3364,
289
+ "step": 3400
290
+ },
291
+ {
292
+ "epoch": 0.3424154967470528,
293
+ "grad_norm": 20.821264266967773,
294
+ "learning_rate": 4.921546600956661e-05,
295
+ "loss": 3.3126,
296
+ "step": 3500
297
+ },
298
+ {
299
+ "epoch": 0.3424154967470528,
300
+ "eval_runtime": 181.7582,
301
+ "eval_samples_per_second": 112.468,
302
+ "eval_steps_per_second": 14.063,
303
+ "step": 3500
304
+ },
305
+ {
306
+ "epoch": 0.3521987966541114,
307
+ "grad_norm": 24.346153259277344,
308
+ "learning_rate": 4.903428033048268e-05,
309
+ "loss": 3.2678,
310
+ "step": 3600
311
+ },
312
+ {
313
+ "epoch": 0.3619820965611701,
314
+ "grad_norm": 19.89035415649414,
315
+ "learning_rate": 4.8853094651398754e-05,
316
+ "loss": 3.3233,
317
+ "step": 3700
318
+ },
319
+ {
320
+ "epoch": 0.37176539646822876,
321
+ "grad_norm": 17.938880920410156,
322
+ "learning_rate": 4.8671908972314825e-05,
323
+ "loss": 3.2822,
324
+ "step": 3800
325
+ },
326
+ {
327
+ "epoch": 0.3815486963752874,
328
+ "grad_norm": 16.92071533203125,
329
+ "learning_rate": 4.84907232932309e-05,
330
+ "loss": 3.2254,
331
+ "step": 3900
332
+ },
333
+ {
334
+ "epoch": 0.39133199628234605,
335
+ "grad_norm": 18.241249084472656,
336
+ "learning_rate": 4.830953761414698e-05,
337
+ "loss": 3.2116,
338
+ "step": 4000
339
+ },
340
+ {
341
+ "epoch": 0.39133199628234605,
342
+ "eval_runtime": 182.8906,
343
+ "eval_samples_per_second": 111.772,
344
+ "eval_steps_per_second": 13.976,
345
+ "step": 4000
346
+ },
347
+ {
348
+ "epoch": 0.40111529618940467,
349
+ "grad_norm": 17.56020736694336,
350
+ "learning_rate": 4.812835193506305e-05,
351
+ "loss": 3.2232,
352
+ "step": 4100
353
+ },
354
+ {
355
+ "epoch": 0.41089859609646334,
356
+ "grad_norm": 17.81117057800293,
357
+ "learning_rate": 4.794716625597913e-05,
358
+ "loss": 3.1936,
359
+ "step": 4200
360
+ },
361
+ {
362
+ "epoch": 0.420681896003522,
363
+ "grad_norm": 19.89581871032715,
364
+ "learning_rate": 4.77659805768952e-05,
365
+ "loss": 3.1443,
366
+ "step": 4300
367
+ },
368
+ {
369
+ "epoch": 0.43046519591058063,
370
+ "grad_norm": 22.968582153320312,
371
+ "learning_rate": 4.758479489781128e-05,
372
+ "loss": 3.2084,
373
+ "step": 4400
374
+ },
375
+ {
376
+ "epoch": 0.4402484958176393,
377
+ "grad_norm": 17.119598388671875,
378
+ "learning_rate": 4.740360921872735e-05,
379
+ "loss": 3.1263,
380
+ "step": 4500
381
+ },
382
+ {
383
+ "epoch": 0.4402484958176393,
384
+ "eval_runtime": 182.3246,
385
+ "eval_samples_per_second": 112.119,
386
+ "eval_steps_per_second": 14.019,
387
+ "step": 4500
388
+ },
389
+ {
390
+ "epoch": 0.4500317957246979,
391
+ "grad_norm": 19.294527053833008,
392
+ "learning_rate": 4.722242353964343e-05,
393
+ "loss": 3.1327,
394
+ "step": 4600
395
+ },
396
+ {
397
+ "epoch": 0.4598150956317566,
398
+ "grad_norm": 16.941057205200195,
399
+ "learning_rate": 4.704123786055951e-05,
400
+ "loss": 3.0944,
401
+ "step": 4700
402
+ },
403
+ {
404
+ "epoch": 0.4695983955388152,
405
+ "grad_norm": 22.43411636352539,
406
+ "learning_rate": 4.686005218147558e-05,
407
+ "loss": 3.1093,
408
+ "step": 4800
409
+ },
410
+ {
411
+ "epoch": 0.4793816954458739,
412
+ "grad_norm": 19.64097023010254,
413
+ "learning_rate": 4.667886650239166e-05,
414
+ "loss": 3.0597,
415
+ "step": 4900
416
+ },
417
+ {
418
+ "epoch": 0.48916499535293256,
419
+ "grad_norm": 19.343788146972656,
420
+ "learning_rate": 4.649768082330773e-05,
421
+ "loss": 3.1659,
422
+ "step": 5000
423
+ },
424
+ {
425
+ "epoch": 0.48916499535293256,
426
+ "eval_runtime": 181.8771,
427
+ "eval_samples_per_second": 112.395,
428
+ "eval_steps_per_second": 14.053,
429
+ "step": 5000
430
+ },
431
+ {
432
+ "epoch": 0.4989482952599912,
433
+ "grad_norm": 19.657760620117188,
434
+ "learning_rate": 4.63164951442238e-05,
435
+ "loss": 3.0506,
436
+ "step": 5100
437
+ },
438
+ {
439
+ "epoch": 0.5087315951670498,
440
+ "grad_norm": 16.2425537109375,
441
+ "learning_rate": 4.613530946513987e-05,
442
+ "loss": 3.0524,
443
+ "step": 5200
444
+ },
445
+ {
446
+ "epoch": 0.5185148950741085,
447
+ "grad_norm": 19.64779281616211,
448
+ "learning_rate": 4.595412378605595e-05,
449
+ "loss": 2.9995,
450
+ "step": 5300
451
+ },
452
+ {
453
+ "epoch": 0.5282981949811671,
454
+ "grad_norm": 17.29520606994629,
455
+ "learning_rate": 4.577293810697203e-05,
456
+ "loss": 3.0932,
457
+ "step": 5400
458
+ },
459
+ {
460
+ "epoch": 0.5380814948882258,
461
+ "grad_norm": 17.694602966308594,
462
+ "learning_rate": 4.55917524278881e-05,
463
+ "loss": 3.0309,
464
+ "step": 5500
465
+ },
466
+ {
467
+ "epoch": 0.5380814948882258,
468
+ "eval_runtime": 181.7231,
469
+ "eval_samples_per_second": 112.49,
470
+ "eval_steps_per_second": 14.065,
471
+ "step": 5500
472
+ },
473
+ {
474
+ "epoch": 0.5478647947952845,
475
+ "grad_norm": 21.030174255371094,
476
+ "learning_rate": 4.541056674880418e-05,
477
+ "loss": 3.0313,
478
+ "step": 5600
479
+ },
480
+ {
481
+ "epoch": 0.5576480947023431,
482
+ "grad_norm": 12.339129447937012,
483
+ "learning_rate": 4.522938106972025e-05,
484
+ "loss": 3.047,
485
+ "step": 5700
486
+ },
487
+ {
488
+ "epoch": 0.5674313946094017,
489
+ "grad_norm": 16.496389389038086,
490
+ "learning_rate": 4.504819539063633e-05,
491
+ "loss": 2.9961,
492
+ "step": 5800
493
+ },
494
+ {
495
+ "epoch": 0.5772146945164603,
496
+ "grad_norm": 15.456297874450684,
497
+ "learning_rate": 4.48670097115524e-05,
498
+ "loss": 2.9821,
499
+ "step": 5900
500
+ },
501
+ {
502
+ "epoch": 0.5869979944235191,
503
+ "grad_norm": 17.8603572845459,
504
+ "learning_rate": 4.468582403246848e-05,
505
+ "loss": 2.9294,
506
+ "step": 6000
507
+ },
508
+ {
509
+ "epoch": 0.5869979944235191,
510
+ "eval_runtime": 181.8258,
511
+ "eval_samples_per_second": 112.426,
512
+ "eval_steps_per_second": 14.057,
513
+ "step": 6000
514
+ },
515
+ {
516
+ "epoch": 0.5967812943305777,
517
+ "grad_norm": 18.85349464416504,
518
+ "learning_rate": 4.450463835338455e-05,
519
+ "loss": 2.9929,
520
+ "step": 6100
521
+ },
522
+ {
523
+ "epoch": 0.6065645942376363,
524
+ "grad_norm": 22.971813201904297,
525
+ "learning_rate": 4.432345267430063e-05,
526
+ "loss": 2.9684,
527
+ "step": 6200
528
+ },
529
+ {
530
+ "epoch": 0.616347894144695,
531
+ "grad_norm": 15.877230644226074,
532
+ "learning_rate": 4.4142266995216706e-05,
533
+ "loss": 2.9399,
534
+ "step": 6300
535
+ },
536
+ {
537
+ "epoch": 0.6261311940517537,
538
+ "grad_norm": 19.847482681274414,
539
+ "learning_rate": 4.396108131613278e-05,
540
+ "loss": 2.88,
541
+ "step": 6400
542
+ },
543
+ {
544
+ "epoch": 0.6359144939588123,
545
+ "grad_norm": 15.004170417785645,
546
+ "learning_rate": 4.377989563704885e-05,
547
+ "loss": 2.9719,
548
+ "step": 6500
549
+ },
550
+ {
551
+ "epoch": 0.6359144939588123,
552
+ "eval_runtime": 182.6045,
553
+ "eval_samples_per_second": 111.947,
554
+ "eval_steps_per_second": 13.997,
555
+ "step": 6500
556
+ },
557
+ {
558
+ "epoch": 0.645697793865871,
559
+ "grad_norm": 19.473665237426758,
560
+ "learning_rate": 4.359870995796492e-05,
561
+ "loss": 2.9246,
562
+ "step": 6600
563
+ },
564
+ {
565
+ "epoch": 0.6554810937729296,
566
+ "grad_norm": 18.071683883666992,
567
+ "learning_rate": 4.3417524278881e-05,
568
+ "loss": 2.9031,
569
+ "step": 6700
570
+ },
571
+ {
572
+ "epoch": 0.6652643936799882,
573
+ "grad_norm": 17.544504165649414,
574
+ "learning_rate": 4.323633859979707e-05,
575
+ "loss": 2.8313,
576
+ "step": 6800
577
+ },
578
+ {
579
+ "epoch": 0.6750476935870469,
580
+ "grad_norm": 18.936140060424805,
581
+ "learning_rate": 4.305515292071315e-05,
582
+ "loss": 2.8536,
583
+ "step": 6900
584
+ },
585
+ {
586
+ "epoch": 0.6848309934941056,
587
+ "grad_norm": 14.77696418762207,
588
+ "learning_rate": 4.2873967241629226e-05,
589
+ "loss": 2.9104,
590
+ "step": 7000
591
+ },
592
+ {
593
+ "epoch": 0.6848309934941056,
594
+ "eval_runtime": 181.938,
595
+ "eval_samples_per_second": 112.357,
596
+ "eval_steps_per_second": 14.049,
597
+ "step": 7000
598
+ },
599
+ {
600
+ "epoch": 0.6946142934011642,
601
+ "grad_norm": 14.303226470947266,
602
+ "learning_rate": 4.26927815625453e-05,
603
+ "loss": 2.8386,
604
+ "step": 7100
605
+ },
606
+ {
607
+ "epoch": 0.7043975933082228,
608
+ "grad_norm": 17.11782455444336,
609
+ "learning_rate": 4.2511595883461376e-05,
610
+ "loss": 2.9013,
611
+ "step": 7200
612
+ },
613
+ {
614
+ "epoch": 0.7141808932152816,
615
+ "grad_norm": 18.661100387573242,
616
+ "learning_rate": 4.233041020437745e-05,
617
+ "loss": 2.9428,
618
+ "step": 7300
619
+ },
620
+ {
621
+ "epoch": 0.7239641931223402,
622
+ "grad_norm": 15.535719871520996,
623
+ "learning_rate": 4.2149224525293525e-05,
624
+ "loss": 2.8582,
625
+ "step": 7400
626
+ },
627
+ {
628
+ "epoch": 0.7337474930293988,
629
+ "grad_norm": 15.3306303024292,
630
+ "learning_rate": 4.19680388462096e-05,
631
+ "loss": 2.8896,
632
+ "step": 7500
633
+ },
634
+ {
635
+ "epoch": 0.7337474930293988,
636
+ "eval_runtime": 181.8938,
637
+ "eval_samples_per_second": 112.384,
638
+ "eval_steps_per_second": 14.052,
639
+ "step": 7500
640
+ },
641
+ {
642
+ "epoch": 0.7435307929364575,
643
+ "grad_norm": 16.730344772338867,
644
+ "learning_rate": 4.1786853167125675e-05,
645
+ "loss": 2.9097,
646
+ "step": 7600
647
+ },
648
+ {
649
+ "epoch": 0.7533140928435161,
650
+ "grad_norm": 18.755483627319336,
651
+ "learning_rate": 4.1605667488041746e-05,
652
+ "loss": 2.8815,
653
+ "step": 7700
654
+ },
655
+ {
656
+ "epoch": 0.7630973927505748,
657
+ "grad_norm": 18.737581253051758,
658
+ "learning_rate": 4.1424481808957824e-05,
659
+ "loss": 2.9202,
660
+ "step": 7800
661
+ },
662
+ {
663
+ "epoch": 0.7728806926576334,
664
+ "grad_norm": 14.711681365966797,
665
+ "learning_rate": 4.1243296129873896e-05,
666
+ "loss": 2.806,
667
+ "step": 7900
668
+ },
669
+ {
670
+ "epoch": 0.7826639925646921,
671
+ "grad_norm": 17.5069580078125,
672
+ "learning_rate": 4.106211045078997e-05,
673
+ "loss": 2.8576,
674
+ "step": 8000
675
+ },
676
+ {
677
+ "epoch": 0.7826639925646921,
678
+ "eval_runtime": 181.9442,
679
+ "eval_samples_per_second": 112.353,
680
+ "eval_steps_per_second": 14.048,
681
+ "step": 8000
682
+ },
683
+ {
684
+ "epoch": 0.7924472924717507,
685
+ "grad_norm": 17.678852081298828,
686
+ "learning_rate": 4.0880924771706046e-05,
687
+ "loss": 2.8035,
688
+ "step": 8100
689
+ },
690
+ {
691
+ "epoch": 0.8022305923788093,
692
+ "grad_norm": 17.644638061523438,
693
+ "learning_rate": 4.069973909262212e-05,
694
+ "loss": 2.7958,
695
+ "step": 8200
696
+ },
697
+ {
698
+ "epoch": 0.8120138922858681,
699
+ "grad_norm": 18.377134323120117,
700
+ "learning_rate": 4.0518553413538195e-05,
701
+ "loss": 2.8055,
702
+ "step": 8300
703
+ },
704
+ {
705
+ "epoch": 0.8217971921929267,
706
+ "grad_norm": 18.026033401489258,
707
+ "learning_rate": 4.0337367734454273e-05,
708
+ "loss": 2.7334,
709
+ "step": 8400
710
+ },
711
+ {
712
+ "epoch": 0.8315804920999853,
713
+ "grad_norm": 14.77315616607666,
714
+ "learning_rate": 4.0156182055370345e-05,
715
+ "loss": 2.8082,
716
+ "step": 8500
717
+ },
718
+ {
719
+ "epoch": 0.8315804920999853,
720
+ "eval_runtime": 182.4176,
721
+ "eval_samples_per_second": 112.062,
722
+ "eval_steps_per_second": 14.012,
723
+ "step": 8500
724
+ },
725
+ {
726
+ "epoch": 0.841363792007044,
727
+ "grad_norm": 13.729479789733887,
728
+ "learning_rate": 3.997499637628642e-05,
729
+ "loss": 2.7939,
730
+ "step": 8600
731
+ },
732
+ {
733
+ "epoch": 0.8511470919141026,
734
+ "grad_norm": 16.34333610534668,
735
+ "learning_rate": 3.9793810697202494e-05,
736
+ "loss": 2.8517,
737
+ "step": 8700
738
+ },
739
+ {
740
+ "epoch": 0.8609303918211613,
741
+ "grad_norm": 22.484411239624023,
742
+ "learning_rate": 3.961262501811857e-05,
743
+ "loss": 2.776,
744
+ "step": 8800
745
+ },
746
+ {
747
+ "epoch": 0.8707136917282199,
748
+ "grad_norm": 15.922870635986328,
749
+ "learning_rate": 3.9431439339034644e-05,
750
+ "loss": 2.7909,
751
+ "step": 8900
752
+ },
753
+ {
754
+ "epoch": 0.8804969916352786,
755
+ "grad_norm": 15.06955623626709,
756
+ "learning_rate": 3.925025365995072e-05,
757
+ "loss": 2.8416,
758
+ "step": 9000
759
+ },
760
+ {
761
+ "epoch": 0.8804969916352786,
762
+ "eval_runtime": 181.9314,
763
+ "eval_samples_per_second": 112.361,
764
+ "eval_steps_per_second": 14.049,
765
+ "step": 9000
766
+ },
767
+ {
768
+ "epoch": 0.8902802915423372,
769
+ "grad_norm": 16.060428619384766,
770
+ "learning_rate": 3.9069067980866794e-05,
771
+ "loss": 2.7803,
772
+ "step": 9100
773
+ },
774
+ {
775
+ "epoch": 0.9000635914493959,
776
+ "grad_norm": 16.80124855041504,
777
+ "learning_rate": 3.888788230178287e-05,
778
+ "loss": 2.7548,
779
+ "step": 9200
780
+ },
781
+ {
782
+ "epoch": 0.9098468913564546,
783
+ "grad_norm": 16.608434677124023,
784
+ "learning_rate": 3.870669662269894e-05,
785
+ "loss": 2.8606,
786
+ "step": 9300
787
+ },
788
+ {
789
+ "epoch": 0.9196301912635132,
790
+ "grad_norm": 14.83870792388916,
791
+ "learning_rate": 3.8525510943615015e-05,
792
+ "loss": 2.7833,
793
+ "step": 9400
794
+ },
795
+ {
796
+ "epoch": 0.9294134911705718,
797
+ "grad_norm": 25.778181076049805,
798
+ "learning_rate": 3.834432526453109e-05,
799
+ "loss": 2.7434,
800
+ "step": 9500
801
+ },
802
+ {
803
+ "epoch": 0.9294134911705718,
804
+ "eval_runtime": 181.99,
805
+ "eval_samples_per_second": 112.325,
806
+ "eval_steps_per_second": 14.045,
807
+ "step": 9500
808
+ },
809
+ {
810
+ "epoch": 0.9391967910776304,
811
+ "grad_norm": 17.374011993408203,
812
+ "learning_rate": 3.8163139585447164e-05,
813
+ "loss": 2.7258,
814
+ "step": 9600
815
+ },
816
+ {
817
+ "epoch": 0.9489800909846892,
818
+ "grad_norm": 17.551128387451172,
819
+ "learning_rate": 3.798195390636324e-05,
820
+ "loss": 2.824,
821
+ "step": 9700
822
+ },
823
+ {
824
+ "epoch": 0.9587633908917478,
825
+ "grad_norm": 14.35797119140625,
826
+ "learning_rate": 3.7800768227279314e-05,
827
+ "loss": 2.745,
828
+ "step": 9800
829
+ },
830
+ {
831
+ "epoch": 0.9685466907988064,
832
+ "grad_norm": 20.098552703857422,
833
+ "learning_rate": 3.761958254819539e-05,
834
+ "loss": 2.7025,
835
+ "step": 9900
836
+ },
837
+ {
838
+ "epoch": 0.9783299907058651,
839
+ "grad_norm": 16.218109130859375,
840
+ "learning_rate": 3.743839686911147e-05,
841
+ "loss": 2.8093,
842
+ "step": 10000
843
+ },
844
+ {
845
+ "epoch": 0.9783299907058651,
846
+ "eval_runtime": 181.8987,
847
+ "eval_samples_per_second": 112.381,
848
+ "eval_steps_per_second": 14.052,
849
+ "step": 10000
850
+ },
851
+ {
852
+ "epoch": 0.9881132906129237,
853
+ "grad_norm": 17.198423385620117,
854
+ "learning_rate": 3.725721119002754e-05,
855
+ "loss": 2.7124,
856
+ "step": 10100
857
+ },
858
+ {
859
+ "epoch": 0.9978965905199824,
860
+ "grad_norm": 18.021198272705078,
861
+ "learning_rate": 3.707602551094362e-05,
862
+ "loss": 2.6922,
863
+ "step": 10200
864
+ },
865
+ {
866
+ "epoch": 1.007679890427041,
867
+ "grad_norm": 15.27678108215332,
868
+ "learning_rate": 3.689483983185969e-05,
869
+ "loss": 2.6743,
870
+ "step": 10300
871
+ },
872
+ {
873
+ "epoch": 1.0174631903340996,
874
+ "grad_norm": 16.770511627197266,
875
+ "learning_rate": 3.671365415277577e-05,
876
+ "loss": 2.857,
877
+ "step": 10400
878
+ },
879
+ {
880
+ "epoch": 1.0272464902411584,
881
+ "grad_norm": 18.810932159423828,
882
+ "learning_rate": 3.653246847369184e-05,
883
+ "loss": 2.7269,
884
+ "step": 10500
885
+ },
886
+ {
887
+ "epoch": 1.0272464902411584,
888
+ "eval_runtime": 181.8537,
889
+ "eval_samples_per_second": 112.409,
890
+ "eval_steps_per_second": 14.055,
891
+ "step": 10500
892
+ },
893
+ {
894
+ "epoch": 1.037029790148217,
895
+ "grad_norm": 18.56201171875,
896
+ "learning_rate": 3.635128279460791e-05,
897
+ "loss": 2.7325,
898
+ "step": 10600
899
+ },
900
+ {
901
+ "epoch": 1.0468130900552757,
902
+ "grad_norm": 15.063011169433594,
903
+ "learning_rate": 3.617009711552399e-05,
904
+ "loss": 2.7827,
905
+ "step": 10700
906
+ },
907
+ {
908
+ "epoch": 1.0565963899623343,
909
+ "grad_norm": 15.339439392089844,
910
+ "learning_rate": 3.598891143644006e-05,
911
+ "loss": 2.7472,
912
+ "step": 10800
913
+ },
914
+ {
915
+ "epoch": 1.066379689869393,
916
+ "grad_norm": 17.466033935546875,
917
+ "learning_rate": 3.580772575735614e-05,
918
+ "loss": 2.7859,
919
+ "step": 10900
920
+ },
921
+ {
922
+ "epoch": 1.0761629897764515,
923
+ "grad_norm": 20.727872848510742,
924
+ "learning_rate": 3.562654007827221e-05,
925
+ "loss": 2.7278,
926
+ "step": 11000
927
+ },
928
+ {
929
+ "epoch": 1.0761629897764515,
930
+ "eval_runtime": 181.8566,
931
+ "eval_samples_per_second": 112.407,
932
+ "eval_steps_per_second": 14.055,
933
+ "step": 11000
934
+ },
935
+ {
936
+ "epoch": 1.0859462896835101,
937
+ "grad_norm": 16.02055549621582,
938
+ "learning_rate": 3.544535439918829e-05,
939
+ "loss": 2.6307,
940
+ "step": 11100
941
+ },
942
+ {
943
+ "epoch": 1.095729589590569,
944
+ "grad_norm": 20.069686889648438,
945
+ "learning_rate": 3.526416872010436e-05,
946
+ "loss": 2.711,
947
+ "step": 11200
948
+ },
949
+ {
950
+ "epoch": 1.1055128894976276,
951
+ "grad_norm": 14.833261489868164,
952
+ "learning_rate": 3.508298304102044e-05,
953
+ "loss": 2.6141,
954
+ "step": 11300
955
+ },
956
+ {
957
+ "epoch": 1.1152961894046862,
958
+ "grad_norm": 14.86436653137207,
959
+ "learning_rate": 3.490179736193652e-05,
960
+ "loss": 2.6816,
961
+ "step": 11400
962
+ },
963
+ {
964
+ "epoch": 1.1250794893117448,
965
+ "grad_norm": 17.955862045288086,
966
+ "learning_rate": 3.472061168285259e-05,
967
+ "loss": 2.6924,
968
+ "step": 11500
969
+ },
970
+ {
971
+ "epoch": 1.1250794893117448,
972
+ "eval_runtime": 181.8085,
973
+ "eval_samples_per_second": 112.437,
974
+ "eval_steps_per_second": 14.059,
975
+ "step": 11500
976
+ },
977
+ {
978
+ "epoch": 1.1348627892188035,
979
+ "grad_norm": 18.360109329223633,
980
+ "learning_rate": 3.453942600376867e-05,
981
+ "loss": 2.6181,
982
+ "step": 11600
983
+ },
984
+ {
985
+ "epoch": 1.144646089125862,
986
+ "grad_norm": 17.547542572021484,
987
+ "learning_rate": 3.435824032468474e-05,
988
+ "loss": 2.6394,
989
+ "step": 11700
990
+ },
991
+ {
992
+ "epoch": 1.154429389032921,
993
+ "grad_norm": 12.194833755493164,
994
+ "learning_rate": 3.417705464560082e-05,
995
+ "loss": 2.6684,
996
+ "step": 11800
997
+ },
998
+ {
999
+ "epoch": 1.1642126889399795,
1000
+ "grad_norm": 17.095104217529297,
1001
+ "learning_rate": 3.399586896651689e-05,
1002
+ "loss": 2.6129,
1003
+ "step": 11900
1004
+ },
1005
+ {
1006
+ "epoch": 1.1739959888470382,
1007
+ "grad_norm": 20.788406372070312,
1008
+ "learning_rate": 3.381468328743296e-05,
1009
+ "loss": 2.5663,
1010
+ "step": 12000
1011
+ },
1012
+ {
1013
+ "epoch": 1.1739959888470382,
1014
+ "eval_runtime": 181.8035,
1015
+ "eval_samples_per_second": 112.44,
1016
+ "eval_steps_per_second": 14.059,
1017
+ "step": 12000
1018
+ },
1019
+ {
1020
+ "epoch": 1.1837792887540968,
1021
+ "grad_norm": 14.261167526245117,
1022
+ "learning_rate": 3.363349760834904e-05,
1023
+ "loss": 2.6544,
1024
+ "step": 12100
1025
+ },
1026
+ {
1027
+ "epoch": 1.1935625886611554,
1028
+ "grad_norm": 24.68012046813965,
1029
+ "learning_rate": 3.345231192926511e-05,
1030
+ "loss": 2.6632,
1031
+ "step": 12200
1032
+ },
1033
+ {
1034
+ "epoch": 1.203345888568214,
1035
+ "grad_norm": 16.10886573791504,
1036
+ "learning_rate": 3.327112625018119e-05,
1037
+ "loss": 2.6366,
1038
+ "step": 12300
1039
+ },
1040
+ {
1041
+ "epoch": 1.2131291884752726,
1042
+ "grad_norm": 18.038848876953125,
1043
+ "learning_rate": 3.308994057109726e-05,
1044
+ "loss": 2.6563,
1045
+ "step": 12400
1046
+ },
1047
+ {
1048
+ "epoch": 1.2229124883823315,
1049
+ "grad_norm": 17.40920639038086,
1050
+ "learning_rate": 3.290875489201334e-05,
1051
+ "loss": 2.718,
1052
+ "step": 12500
1053
+ },
1054
+ {
1055
+ "epoch": 1.2229124883823315,
1056
+ "eval_runtime": 181.9491,
1057
+ "eval_samples_per_second": 112.35,
1058
+ "eval_steps_per_second": 14.048,
1059
+ "step": 12500
1060
+ },
1061
+ {
1062
+ "epoch": 1.23269578828939,
1063
+ "grad_norm": 15.097307205200195,
1064
+ "learning_rate": 3.272756921292941e-05,
1065
+ "loss": 2.7282,
1066
+ "step": 12600
1067
+ },
1068
+ {
1069
+ "epoch": 1.2424790881964487,
1070
+ "grad_norm": 17.63008689880371,
1071
+ "learning_rate": 3.254638353384549e-05,
1072
+ "loss": 2.7104,
1073
+ "step": 12700
1074
+ },
1075
+ {
1076
+ "epoch": 1.2522623881035073,
1077
+ "grad_norm": 16.161130905151367,
1078
+ "learning_rate": 3.236519785476156e-05,
1079
+ "loss": 2.6427,
1080
+ "step": 12800
1081
+ },
1082
+ {
1083
+ "epoch": 1.262045688010566,
1084
+ "grad_norm": 18.786882400512695,
1085
+ "learning_rate": 3.218401217567764e-05,
1086
+ "loss": 2.6105,
1087
+ "step": 12900
1088
+ },
1089
+ {
1090
+ "epoch": 1.2718289879176246,
1091
+ "grad_norm": 24.145421981811523,
1092
+ "learning_rate": 3.2002826496593715e-05,
1093
+ "loss": 2.6322,
1094
+ "step": 13000
1095
+ },
1096
+ {
1097
+ "epoch": 1.2718289879176246,
1098
+ "eval_runtime": 182.5613,
1099
+ "eval_samples_per_second": 111.973,
1100
+ "eval_steps_per_second": 14.001,
1101
+ "step": 13000
1102
+ },
1103
+ {
1104
+ "epoch": 1.2816122878246832,
1105
+ "grad_norm": 15.286133766174316,
1106
+ "learning_rate": 3.1821640817509786e-05,
1107
+ "loss": 2.6465,
1108
+ "step": 13100
1109
+ },
1110
+ {
1111
+ "epoch": 1.291395587731742,
1112
+ "grad_norm": 21.22935676574707,
1113
+ "learning_rate": 3.1640455138425865e-05,
1114
+ "loss": 2.6691,
1115
+ "step": 13200
1116
+ },
1117
+ {
1118
+ "epoch": 1.3011788876388006,
1119
+ "grad_norm": 18.064428329467773,
1120
+ "learning_rate": 3.1459269459341936e-05,
1121
+ "loss": 2.5904,
1122
+ "step": 13300
1123
+ },
1124
+ {
1125
+ "epoch": 1.3109621875458592,
1126
+ "grad_norm": 14.45976448059082,
1127
+ "learning_rate": 3.127808378025801e-05,
1128
+ "loss": 2.6602,
1129
+ "step": 13400
1130
+ },
1131
+ {
1132
+ "epoch": 1.3207454874529179,
1133
+ "grad_norm": 19.72386360168457,
1134
+ "learning_rate": 3.109689810117408e-05,
1135
+ "loss": 2.6337,
1136
+ "step": 13500
1137
+ },
1138
+ {
1139
+ "epoch": 1.3207454874529179,
1140
+ "eval_runtime": 182.4053,
1141
+ "eval_samples_per_second": 112.069,
1142
+ "eval_steps_per_second": 14.013,
1143
+ "step": 13500
1144
+ },
1145
+ {
1146
+ "epoch": 1.3305287873599765,
1147
+ "grad_norm": 17.639583587646484,
1148
+ "learning_rate": 3.091571242209016e-05,
1149
+ "loss": 2.6135,
1150
+ "step": 13600
1151
+ },
1152
+ {
1153
+ "epoch": 1.340312087267035,
1154
+ "grad_norm": 19.71700096130371,
1155
+ "learning_rate": 3.0734526743006235e-05,
1156
+ "loss": 2.6252,
1157
+ "step": 13700
1158
+ },
1159
+ {
1160
+ "epoch": 1.3500953871740937,
1161
+ "grad_norm": 16.715856552124023,
1162
+ "learning_rate": 3.055334106392231e-05,
1163
+ "loss": 2.6475,
1164
+ "step": 13800
1165
+ },
1166
+ {
1167
+ "epoch": 1.3598786870811526,
1168
+ "grad_norm": 12.645075798034668,
1169
+ "learning_rate": 3.0372155384838385e-05,
1170
+ "loss": 2.6199,
1171
+ "step": 13900
1172
+ },
1173
+ {
1174
+ "epoch": 1.3696619869882112,
1175
+ "grad_norm": 20.150625228881836,
1176
+ "learning_rate": 3.0190969705754456e-05,
1177
+ "loss": 2.5567,
1178
+ "step": 14000
1179
+ },
1180
+ {
1181
+ "epoch": 1.3696619869882112,
1182
+ "eval_runtime": 181.9086,
1183
+ "eval_samples_per_second": 112.375,
1184
+ "eval_steps_per_second": 14.051,
1185
+ "step": 14000
1186
+ },
1187
+ {
1188
+ "epoch": 1.3794452868952698,
1189
+ "grad_norm": 19.111286163330078,
1190
+ "learning_rate": 3.0009784026670535e-05,
1191
+ "loss": 2.59,
1192
+ "step": 14100
1193
+ },
1194
+ {
1195
+ "epoch": 1.3892285868023284,
1196
+ "grad_norm": 17.12226104736328,
1197
+ "learning_rate": 2.9828598347586606e-05,
1198
+ "loss": 2.5913,
1199
+ "step": 14200
1200
+ },
1201
+ {
1202
+ "epoch": 1.399011886709387,
1203
+ "grad_norm": 19.741445541381836,
1204
+ "learning_rate": 2.9647412668502684e-05,
1205
+ "loss": 2.5617,
1206
+ "step": 14300
1207
+ },
1208
+ {
1209
+ "epoch": 1.4087951866164456,
1210
+ "grad_norm": 17.605525970458984,
1211
+ "learning_rate": 2.946622698941876e-05,
1212
+ "loss": 2.6077,
1213
+ "step": 14400
1214
+ },
1215
+ {
1216
+ "epoch": 1.4185784865235043,
1217
+ "grad_norm": 17.433218002319336,
1218
+ "learning_rate": 2.928504131033483e-05,
1219
+ "loss": 2.5713,
1220
+ "step": 14500
1221
+ },
1222
+ {
1223
+ "epoch": 1.4185784865235043,
1224
+ "eval_runtime": 181.9305,
1225
+ "eval_samples_per_second": 112.362,
1226
+ "eval_steps_per_second": 14.049,
1227
+ "step": 14500
1228
+ },
1229
+ {
1230
+ "epoch": 1.428361786430563,
1231
+ "grad_norm": 15.442538261413574,
1232
+ "learning_rate": 2.910385563125091e-05,
1233
+ "loss": 2.6499,
1234
+ "step": 14600
1235
+ },
1236
+ {
1237
+ "epoch": 1.4381450863376217,
1238
+ "grad_norm": 15.078730583190918,
1239
+ "learning_rate": 2.892266995216698e-05,
1240
+ "loss": 2.6517,
1241
+ "step": 14700
1242
+ },
1243
+ {
1244
+ "epoch": 1.4479283862446803,
1245
+ "grad_norm": 23.07891273498535,
1246
+ "learning_rate": 2.874148427308306e-05,
1247
+ "loss": 2.594,
1248
+ "step": 14800
1249
+ },
1250
+ {
1251
+ "epoch": 1.457711686151739,
1252
+ "grad_norm": 16.707923889160156,
1253
+ "learning_rate": 2.856029859399913e-05,
1254
+ "loss": 2.6613,
1255
+ "step": 14900
1256
+ },
1257
+ {
1258
+ "epoch": 1.4674949860587976,
1259
+ "grad_norm": 16.731164932250977,
1260
+ "learning_rate": 2.8379112914915208e-05,
1261
+ "loss": 2.5927,
1262
+ "step": 15000
1263
+ },
1264
+ {
1265
+ "epoch": 1.4674949860587976,
1266
+ "eval_runtime": 181.9649,
1267
+ "eval_samples_per_second": 112.34,
1268
+ "eval_steps_per_second": 14.047,
1269
+ "step": 15000
1270
+ },
1271
+ {
1272
+ "epoch": 1.4772782859658564,
1273
+ "grad_norm": 16.020864486694336,
1274
+ "learning_rate": 2.819792723583128e-05,
1275
+ "loss": 2.6464,
1276
+ "step": 15100
1277
+ },
1278
+ {
1279
+ "epoch": 1.4870615858729148,
1280
+ "grad_norm": 16.674760818481445,
1281
+ "learning_rate": 2.8016741556747354e-05,
1282
+ "loss": 2.5853,
1283
+ "step": 15200
1284
+ },
1285
+ {
1286
+ "epoch": 1.4968448857799737,
1287
+ "grad_norm": 16.890748977661133,
1288
+ "learning_rate": 2.7835555877663432e-05,
1289
+ "loss": 2.5748,
1290
+ "step": 15300
1291
+ },
1292
+ {
1293
+ "epoch": 1.5066281856870323,
1294
+ "grad_norm": 20.217845916748047,
1295
+ "learning_rate": 2.7654370198579504e-05,
1296
+ "loss": 2.6204,
1297
+ "step": 15400
1298
+ },
1299
+ {
1300
+ "epoch": 1.516411485594091,
1301
+ "grad_norm": 20.459087371826172,
1302
+ "learning_rate": 2.7473184519495582e-05,
1303
+ "loss": 2.6103,
1304
+ "step": 15500
1305
+ },
1306
+ {
1307
+ "epoch": 1.516411485594091,
1308
+ "eval_runtime": 181.9454,
1309
+ "eval_samples_per_second": 112.352,
1310
+ "eval_steps_per_second": 14.048,
1311
+ "step": 15500
1312
+ },
1313
+ {
1314
+ "epoch": 1.5261947855011495,
1315
+ "grad_norm": 18.207612991333008,
1316
+ "learning_rate": 2.7291998840411654e-05,
1317
+ "loss": 2.5786,
1318
+ "step": 15600
1319
+ },
1320
+ {
1321
+ "epoch": 1.5359780854082081,
1322
+ "grad_norm": 18.084758758544922,
1323
+ "learning_rate": 2.7110813161327732e-05,
1324
+ "loss": 2.6535,
1325
+ "step": 15700
1326
+ },
1327
+ {
1328
+ "epoch": 1.545761385315267,
1329
+ "grad_norm": 15.03881549835205,
1330
+ "learning_rate": 2.6929627482243803e-05,
1331
+ "loss": 2.6061,
1332
+ "step": 15800
1333
+ },
1334
+ {
1335
+ "epoch": 1.5555446852223254,
1336
+ "grad_norm": 16.99995231628418,
1337
+ "learning_rate": 2.6748441803159878e-05,
1338
+ "loss": 2.6151,
1339
+ "step": 15900
1340
+ },
1341
+ {
1342
+ "epoch": 1.5653279851293842,
1343
+ "grad_norm": 15.581089973449707,
1344
+ "learning_rate": 2.6567256124075956e-05,
1345
+ "loss": 2.6163,
1346
+ "step": 16000
1347
+ },
1348
+ {
1349
+ "epoch": 1.5653279851293842,
1350
+ "eval_runtime": 181.8152,
1351
+ "eval_samples_per_second": 112.433,
1352
+ "eval_steps_per_second": 14.058,
1353
+ "step": 16000
1354
+ },
1355
+ {
1356
+ "epoch": 1.5751112850364428,
1357
+ "grad_norm": 21.4382266998291,
1358
+ "learning_rate": 2.6386070444992028e-05,
1359
+ "loss": 2.5975,
1360
+ "step": 16100
1361
+ },
1362
+ {
1363
+ "epoch": 1.5848945849435014,
1364
+ "grad_norm": 15.874536514282227,
1365
+ "learning_rate": 2.6204884765908106e-05,
1366
+ "loss": 2.5851,
1367
+ "step": 16200
1368
+ },
1369
+ {
1370
+ "epoch": 1.59467788485056,
1371
+ "grad_norm": 17.902137756347656,
1372
+ "learning_rate": 2.6023699086824177e-05,
1373
+ "loss": 2.6027,
1374
+ "step": 16300
1375
+ },
1376
+ {
1377
+ "epoch": 1.6044611847576187,
1378
+ "grad_norm": 17.04872703552246,
1379
+ "learning_rate": 2.5842513407740255e-05,
1380
+ "loss": 2.5854,
1381
+ "step": 16400
1382
+ },
1383
+ {
1384
+ "epoch": 1.6142444846646775,
1385
+ "grad_norm": 15.406013488769531,
1386
+ "learning_rate": 2.5661327728656327e-05,
1387
+ "loss": 2.5158,
1388
+ "step": 16500
1389
+ },
1390
+ {
1391
+ "epoch": 1.6142444846646775,
1392
+ "eval_runtime": 181.8647,
1393
+ "eval_samples_per_second": 112.402,
1394
+ "eval_steps_per_second": 14.054,
1395
+ "step": 16500
1396
+ },
1397
+ {
1398
+ "epoch": 1.624027784571736,
1399
+ "grad_norm": 19.62627601623535,
1400
+ "learning_rate": 2.5480142049572402e-05,
1401
+ "loss": 2.5378,
1402
+ "step": 16600
1403
+ },
1404
+ {
1405
+ "epoch": 1.6338110844787948,
1406
+ "grad_norm": 17.825178146362305,
1407
+ "learning_rate": 2.529895637048848e-05,
1408
+ "loss": 2.6162,
1409
+ "step": 16700
1410
+ },
1411
+ {
1412
+ "epoch": 1.6435943843858534,
1413
+ "grad_norm": 15.442023277282715,
1414
+ "learning_rate": 2.511777069140455e-05,
1415
+ "loss": 2.5802,
1416
+ "step": 16800
1417
+ },
1418
+ {
1419
+ "epoch": 1.653377684292912,
1420
+ "grad_norm": 18.695241928100586,
1421
+ "learning_rate": 2.4936585012320626e-05,
1422
+ "loss": 2.585,
1423
+ "step": 16900
1424
+ },
1425
+ {
1426
+ "epoch": 1.6631609841999706,
1427
+ "grad_norm": 18.992969512939453,
1428
+ "learning_rate": 2.4755399333236704e-05,
1429
+ "loss": 2.5448,
1430
+ "step": 17000
1431
+ },
1432
+ {
1433
+ "epoch": 1.6631609841999706,
1434
+ "eval_runtime": 181.91,
1435
+ "eval_samples_per_second": 112.374,
1436
+ "eval_steps_per_second": 14.051,
1437
+ "step": 17000
1438
+ },
1439
+ {
1440
+ "epoch": 1.6729442841070292,
1441
+ "grad_norm": 19.065349578857422,
1442
+ "learning_rate": 2.457421365415278e-05,
1443
+ "loss": 2.6565,
1444
+ "step": 17100
1445
+ },
1446
+ {
1447
+ "epoch": 1.682727584014088,
1448
+ "grad_norm": 20.110734939575195,
1449
+ "learning_rate": 2.439302797506885e-05,
1450
+ "loss": 2.5519,
1451
+ "step": 17200
1452
+ },
1453
+ {
1454
+ "epoch": 1.6925108839211465,
1455
+ "grad_norm": 15.886931419372559,
1456
+ "learning_rate": 2.4211842295984925e-05,
1457
+ "loss": 2.5589,
1458
+ "step": 17300
1459
+ },
1460
+ {
1461
+ "epoch": 1.7022941838282053,
1462
+ "grad_norm": 19.213207244873047,
1463
+ "learning_rate": 2.4030656616901e-05,
1464
+ "loss": 2.5714,
1465
+ "step": 17400
1466
+ },
1467
+ {
1468
+ "epoch": 1.712077483735264,
1469
+ "grad_norm": 17.117481231689453,
1470
+ "learning_rate": 2.3849470937817075e-05,
1471
+ "loss": 2.6682,
1472
+ "step": 17500
1473
+ },
1474
+ {
1475
+ "epoch": 1.712077483735264,
1476
+ "eval_runtime": 181.766,
1477
+ "eval_samples_per_second": 112.463,
1478
+ "eval_steps_per_second": 14.062,
1479
+ "step": 17500
1480
+ },
1481
+ {
1482
+ "epoch": 1.7218607836423225,
1483
+ "grad_norm": 17.19162940979004,
1484
+ "learning_rate": 2.366828525873315e-05,
1485
+ "loss": 2.5591,
1486
+ "step": 17600
1487
+ },
1488
+ {
1489
+ "epoch": 1.7316440835493812,
1490
+ "grad_norm": 15.454411506652832,
1491
+ "learning_rate": 2.3487099579649225e-05,
1492
+ "loss": 2.469,
1493
+ "step": 17700
1494
+ },
1495
+ {
1496
+ "epoch": 1.7414273834564398,
1497
+ "grad_norm": 15.227791786193848,
1498
+ "learning_rate": 2.3305913900565303e-05,
1499
+ "loss": 2.664,
1500
+ "step": 17800
1501
+ },
1502
+ {
1503
+ "epoch": 1.7512106833634986,
1504
+ "grad_norm": 18.5739688873291,
1505
+ "learning_rate": 2.3124728221481374e-05,
1506
+ "loss": 2.5991,
1507
+ "step": 17900
1508
+ },
1509
+ {
1510
+ "epoch": 1.760993983270557,
1511
+ "grad_norm": 12.589066505432129,
1512
+ "learning_rate": 2.294354254239745e-05,
1513
+ "loss": 2.6593,
1514
+ "step": 18000
1515
+ },
1516
+ {
1517
+ "epoch": 1.760993983270557,
1518
+ "eval_runtime": 181.9699,
1519
+ "eval_samples_per_second": 112.337,
1520
+ "eval_steps_per_second": 14.046,
1521
+ "step": 18000
1522
+ },
1523
+ {
1524
+ "epoch": 1.7707772831776158,
1525
+ "grad_norm": 20.695772171020508,
1526
+ "learning_rate": 2.2762356863313524e-05,
1527
+ "loss": 2.5555,
1528
+ "step": 18100
1529
+ },
1530
+ {
1531
+ "epoch": 1.7805605830846745,
1532
+ "grad_norm": 12.731703758239746,
1533
+ "learning_rate": 2.25811711842296e-05,
1534
+ "loss": 2.4617,
1535
+ "step": 18200
1536
+ },
1537
+ {
1538
+ "epoch": 1.790343882991733,
1539
+ "grad_norm": 18.506074905395508,
1540
+ "learning_rate": 2.2399985505145674e-05,
1541
+ "loss": 2.6061,
1542
+ "step": 18300
1543
+ },
1544
+ {
1545
+ "epoch": 1.800127182898792,
1546
+ "grad_norm": 14.8694486618042,
1547
+ "learning_rate": 2.221879982606175e-05,
1548
+ "loss": 2.5779,
1549
+ "step": 18400
1550
+ },
1551
+ {
1552
+ "epoch": 1.8099104828058503,
1553
+ "grad_norm": 22.47985076904297,
1554
+ "learning_rate": 2.2037614146977827e-05,
1555
+ "loss": 2.5012,
1556
+ "step": 18500
1557
+ },
1558
+ {
1559
+ "epoch": 1.8099104828058503,
1560
+ "eval_runtime": 182.3919,
1561
+ "eval_samples_per_second": 112.077,
1562
+ "eval_steps_per_second": 14.014,
1563
+ "step": 18500
1564
+ },
1565
+ {
1566
+ "epoch": 1.8196937827129092,
1567
+ "grad_norm": 25.74334144592285,
1568
+ "learning_rate": 2.1856428467893898e-05,
1569
+ "loss": 2.5265,
1570
+ "step": 18600
1571
+ },
1572
+ {
1573
+ "epoch": 1.8294770826199676,
1574
+ "grad_norm": 18.477630615234375,
1575
+ "learning_rate": 2.1675242788809973e-05,
1576
+ "loss": 2.5555,
1577
+ "step": 18700
1578
+ },
1579
+ {
1580
+ "epoch": 1.8392603825270264,
1581
+ "grad_norm": 14.832316398620605,
1582
+ "learning_rate": 2.1494057109726048e-05,
1583
+ "loss": 2.4609,
1584
+ "step": 18800
1585
+ },
1586
+ {
1587
+ "epoch": 1.849043682434085,
1588
+ "grad_norm": 17.025096893310547,
1589
+ "learning_rate": 2.1312871430642123e-05,
1590
+ "loss": 2.5119,
1591
+ "step": 18900
1592
+ },
1593
+ {
1594
+ "epoch": 1.8588269823411436,
1595
+ "grad_norm": 16.852436065673828,
1596
+ "learning_rate": 2.1131685751558197e-05,
1597
+ "loss": 2.5369,
1598
+ "step": 19000
1599
+ },
1600
+ {
1601
+ "epoch": 1.8588269823411436,
1602
+ "eval_runtime": 181.7443,
1603
+ "eval_samples_per_second": 112.477,
1604
+ "eval_steps_per_second": 14.064,
1605
+ "step": 19000
1606
+ },
1607
+ {
1608
+ "epoch": 1.8686102822482025,
1609
+ "grad_norm": 15.160259246826172,
1610
+ "learning_rate": 2.0950500072474272e-05,
1611
+ "loss": 2.6297,
1612
+ "step": 19100
1613
+ },
1614
+ {
1615
+ "epoch": 1.8783935821552609,
1616
+ "grad_norm": 15.909671783447266,
1617
+ "learning_rate": 2.0769314393390347e-05,
1618
+ "loss": 2.4696,
1619
+ "step": 19200
1620
+ },
1621
+ {
1622
+ "epoch": 1.8881768820623197,
1623
+ "grad_norm": 14.201844215393066,
1624
+ "learning_rate": 2.0588128714306422e-05,
1625
+ "loss": 2.5653,
1626
+ "step": 19300
1627
+ },
1628
+ {
1629
+ "epoch": 1.8979601819693783,
1630
+ "grad_norm": 16.351415634155273,
1631
+ "learning_rate": 2.0406943035222497e-05,
1632
+ "loss": 2.4962,
1633
+ "step": 19400
1634
+ },
1635
+ {
1636
+ "epoch": 1.907743481876437,
1637
+ "grad_norm": 16.943771362304688,
1638
+ "learning_rate": 2.022575735613857e-05,
1639
+ "loss": 2.5091,
1640
+ "step": 19500
1641
+ },
1642
+ {
1643
+ "epoch": 1.907743481876437,
1644
+ "eval_runtime": 181.6486,
1645
+ "eval_samples_per_second": 112.536,
1646
+ "eval_steps_per_second": 14.071,
1647
+ "step": 19500
1648
+ },
1649
+ {
1650
+ "epoch": 1.9175267817834956,
1651
+ "grad_norm": 15.006349563598633,
1652
+ "learning_rate": 2.0044571677054646e-05,
1653
+ "loss": 2.5214,
1654
+ "step": 19600
1655
+ },
1656
+ {
1657
+ "epoch": 1.9273100816905542,
1658
+ "grad_norm": 17.305580139160156,
1659
+ "learning_rate": 1.986338599797072e-05,
1660
+ "loss": 2.4989,
1661
+ "step": 19700
1662
+ },
1663
+ {
1664
+ "epoch": 1.937093381597613,
1665
+ "grad_norm": 17.28044891357422,
1666
+ "learning_rate": 1.9682200318886796e-05,
1667
+ "loss": 2.4008,
1668
+ "step": 19800
1669
+ },
1670
+ {
1671
+ "epoch": 1.9468766815046714,
1672
+ "grad_norm": 18.25079917907715,
1673
+ "learning_rate": 1.950101463980287e-05,
1674
+ "loss": 2.6015,
1675
+ "step": 19900
1676
+ },
1677
+ {
1678
+ "epoch": 1.9566599814117303,
1679
+ "grad_norm": 20.741668701171875,
1680
+ "learning_rate": 1.9319828960718946e-05,
1681
+ "loss": 2.4081,
1682
+ "step": 20000
1683
+ },
1684
+ {
1685
+ "epoch": 1.9566599814117303,
1686
+ "eval_runtime": 181.7745,
1687
+ "eval_samples_per_second": 112.458,
1688
+ "eval_steps_per_second": 14.061,
1689
+ "step": 20000
1690
+ },
1691
+ {
1692
+ "epoch": 1.9664432813187889,
1693
+ "grad_norm": 16.1226863861084,
1694
+ "learning_rate": 1.913864328163502e-05,
1695
+ "loss": 2.5418,
1696
+ "step": 20100
1697
+ },
1698
+ {
1699
+ "epoch": 1.9762265812258475,
1700
+ "grad_norm": 13.914982795715332,
1701
+ "learning_rate": 1.8957457602551095e-05,
1702
+ "loss": 2.5248,
1703
+ "step": 20200
1704
+ },
1705
+ {
1706
+ "epoch": 1.986009881132906,
1707
+ "grad_norm": 15.072690963745117,
1708
+ "learning_rate": 1.877627192346717e-05,
1709
+ "loss": 2.5488,
1710
+ "step": 20300
1711
+ },
1712
+ {
1713
+ "epoch": 1.9957931810399647,
1714
+ "grad_norm": 15.510763168334961,
1715
+ "learning_rate": 1.8595086244383245e-05,
1716
+ "loss": 2.4605,
1717
+ "step": 20400
1718
+ },
1719
+ {
1720
+ "epoch": 2.0055764809470236,
1721
+ "grad_norm": 18.463842391967773,
1722
+ "learning_rate": 1.841390056529932e-05,
1723
+ "loss": 2.522,
1724
+ "step": 20500
1725
+ },
1726
+ {
1727
+ "epoch": 2.0055764809470236,
1728
+ "eval_runtime": 182.07,
1729
+ "eval_samples_per_second": 112.276,
1730
+ "eval_steps_per_second": 14.039,
1731
+ "step": 20500
1732
+ },
1733
+ {
1734
+ "epoch": 2.015359780854082,
1735
+ "grad_norm": 16.670269012451172,
1736
+ "learning_rate": 1.8232714886215394e-05,
1737
+ "loss": 2.5585,
1738
+ "step": 20600
1739
+ },
1740
+ {
1741
+ "epoch": 2.025143080761141,
1742
+ "grad_norm": 20.60368537902832,
1743
+ "learning_rate": 1.805152920713147e-05,
1744
+ "loss": 2.5381,
1745
+ "step": 20700
1746
+ },
1747
+ {
1748
+ "epoch": 2.034926380668199,
1749
+ "grad_norm": 15.686981201171875,
1750
+ "learning_rate": 1.7870343528047544e-05,
1751
+ "loss": 2.5721,
1752
+ "step": 20800
1753
+ },
1754
+ {
1755
+ "epoch": 2.044709680575258,
1756
+ "grad_norm": 14.691718101501465,
1757
+ "learning_rate": 1.768915784896362e-05,
1758
+ "loss": 2.5187,
1759
+ "step": 20900
1760
+ },
1761
+ {
1762
+ "epoch": 2.054492980482317,
1763
+ "grad_norm": 16.31734848022461,
1764
+ "learning_rate": 1.7507972169879694e-05,
1765
+ "loss": 2.5202,
1766
+ "step": 21000
1767
+ },
1768
+ {
1769
+ "epoch": 2.054492980482317,
1770
+ "eval_runtime": 181.9896,
1771
+ "eval_samples_per_second": 112.325,
1772
+ "eval_steps_per_second": 14.045,
1773
+ "step": 21000
1774
+ },
1775
+ {
1776
+ "epoch": 2.0642762803893753,
1777
+ "grad_norm": 12.698554992675781,
1778
+ "learning_rate": 1.732678649079577e-05,
1779
+ "loss": 2.4228,
1780
+ "step": 21100
1781
+ },
1782
+ {
1783
+ "epoch": 2.074059580296434,
1784
+ "grad_norm": 16.34201431274414,
1785
+ "learning_rate": 1.7145600811711843e-05,
1786
+ "loss": 2.3963,
1787
+ "step": 21200
1788
+ },
1789
+ {
1790
+ "epoch": 2.0838428802034925,
1791
+ "grad_norm": 16.52840232849121,
1792
+ "learning_rate": 1.6964415132627918e-05,
1793
+ "loss": 2.4759,
1794
+ "step": 21300
1795
+ },
1796
+ {
1797
+ "epoch": 2.0936261801105513,
1798
+ "grad_norm": 14.856452941894531,
1799
+ "learning_rate": 1.6783229453543993e-05,
1800
+ "loss": 2.4675,
1801
+ "step": 21400
1802
+ },
1803
+ {
1804
+ "epoch": 2.1034094800176097,
1805
+ "grad_norm": 19.68895721435547,
1806
+ "learning_rate": 1.6602043774460068e-05,
1807
+ "loss": 2.5324,
1808
+ "step": 21500
1809
+ },
1810
+ {
1811
+ "epoch": 2.1034094800176097,
1812
+ "eval_runtime": 182.1877,
1813
+ "eval_samples_per_second": 112.203,
1814
+ "eval_steps_per_second": 14.029,
1815
+ "step": 21500
1816
+ },
1817
+ {
1818
+ "epoch": 2.1131927799246686,
1819
+ "grad_norm": 23.248056411743164,
1820
+ "learning_rate": 1.6420858095376143e-05,
1821
+ "loss": 2.5231,
1822
+ "step": 21600
1823
+ },
1824
+ {
1825
+ "epoch": 2.1229760798317274,
1826
+ "grad_norm": 25.471004486083984,
1827
+ "learning_rate": 1.6239672416292217e-05,
1828
+ "loss": 2.5871,
1829
+ "step": 21700
1830
+ },
1831
+ {
1832
+ "epoch": 2.132759379738786,
1833
+ "grad_norm": 17.794851303100586,
1834
+ "learning_rate": 1.6058486737208292e-05,
1835
+ "loss": 2.5008,
1836
+ "step": 21800
1837
+ },
1838
+ {
1839
+ "epoch": 2.1425426796458447,
1840
+ "grad_norm": 15.450346946716309,
1841
+ "learning_rate": 1.5877301058124367e-05,
1842
+ "loss": 2.4194,
1843
+ "step": 21900
1844
+ },
1845
+ {
1846
+ "epoch": 2.152325979552903,
1847
+ "grad_norm": 13.243645668029785,
1848
+ "learning_rate": 1.5696115379040442e-05,
1849
+ "loss": 2.5018,
1850
+ "step": 22000
1851
+ },
1852
+ {
1853
+ "epoch": 2.152325979552903,
1854
+ "eval_runtime": 181.9841,
1855
+ "eval_samples_per_second": 112.328,
1856
+ "eval_steps_per_second": 14.045,
1857
+ "step": 22000
1858
+ },
1859
+ {
1860
+ "epoch": 2.162109279459962,
1861
+ "grad_norm": 16.996198654174805,
1862
+ "learning_rate": 1.5514929699956517e-05,
1863
+ "loss": 2.4492,
1864
+ "step": 22100
1865
+ },
1866
+ {
1867
+ "epoch": 2.1718925793670203,
1868
+ "grad_norm": 20.05558967590332,
1869
+ "learning_rate": 1.5333744020872588e-05,
1870
+ "loss": 2.489,
1871
+ "step": 22200
1872
+ },
1873
+ {
1874
+ "epoch": 2.181675879274079,
1875
+ "grad_norm": 15.66326904296875,
1876
+ "learning_rate": 1.5152558341788666e-05,
1877
+ "loss": 2.5089,
1878
+ "step": 22300
1879
+ },
1880
+ {
1881
+ "epoch": 2.191459179181138,
1882
+ "grad_norm": 17.83564567565918,
1883
+ "learning_rate": 1.4971372662704741e-05,
1884
+ "loss": 2.4945,
1885
+ "step": 22400
1886
+ },
1887
+ {
1888
+ "epoch": 2.2012424790881964,
1889
+ "grad_norm": 21.466899871826172,
1890
+ "learning_rate": 1.4790186983620816e-05,
1891
+ "loss": 2.5467,
1892
+ "step": 22500
1893
+ },
1894
+ {
1895
+ "epoch": 2.2012424790881964,
1896
+ "eval_runtime": 182.8328,
1897
+ "eval_samples_per_second": 111.807,
1898
+ "eval_steps_per_second": 13.98,
1899
+ "step": 22500
1900
+ },
1901
+ {
1902
+ "epoch": 2.211025778995255,
1903
+ "grad_norm": 17.91064453125,
1904
+ "learning_rate": 1.4609001304536891e-05,
1905
+ "loss": 2.5144,
1906
+ "step": 22600
1907
+ },
1908
+ {
1909
+ "epoch": 2.2208090789023136,
1910
+ "grad_norm": 17.678396224975586,
1911
+ "learning_rate": 1.4427815625452964e-05,
1912
+ "loss": 2.5018,
1913
+ "step": 22700
1914
+ },
1915
+ {
1916
+ "epoch": 2.2305923788093724,
1917
+ "grad_norm": 17.510461807250977,
1918
+ "learning_rate": 1.4246629946369039e-05,
1919
+ "loss": 2.4228,
1920
+ "step": 22800
1921
+ },
1922
+ {
1923
+ "epoch": 2.240375678716431,
1924
+ "grad_norm": 24.923967361450195,
1925
+ "learning_rate": 1.4065444267285114e-05,
1926
+ "loss": 2.5249,
1927
+ "step": 22900
1928
+ },
1929
+ {
1930
+ "epoch": 2.2501589786234897,
1931
+ "grad_norm": 17.82384490966797,
1932
+ "learning_rate": 1.388425858820119e-05,
1933
+ "loss": 2.4282,
1934
+ "step": 23000
1935
+ },
1936
+ {
1937
+ "epoch": 2.2501589786234897,
1938
+ "eval_runtime": 182.0459,
1939
+ "eval_samples_per_second": 112.29,
1940
+ "eval_steps_per_second": 14.04,
1941
+ "step": 23000
1942
+ },
1943
+ {
1944
+ "epoch": 2.2599422785305485,
1945
+ "grad_norm": 16.13028335571289,
1946
+ "learning_rate": 1.3703072909117265e-05,
1947
+ "loss": 2.4472,
1948
+ "step": 23100
1949
+ },
1950
+ {
1951
+ "epoch": 2.269725578437607,
1952
+ "grad_norm": 15.137242317199707,
1953
+ "learning_rate": 1.352188723003334e-05,
1954
+ "loss": 2.5985,
1955
+ "step": 23200
1956
+ },
1957
+ {
1958
+ "epoch": 2.2795088783446658,
1959
+ "grad_norm": 16.187530517578125,
1960
+ "learning_rate": 1.3340701550949415e-05,
1961
+ "loss": 2.4862,
1962
+ "step": 23300
1963
+ },
1964
+ {
1965
+ "epoch": 2.289292178251724,
1966
+ "grad_norm": 18.84433937072754,
1967
+ "learning_rate": 1.3159515871865488e-05,
1968
+ "loss": 2.516,
1969
+ "step": 23400
1970
+ },
1971
+ {
1972
+ "epoch": 2.299075478158783,
1973
+ "grad_norm": 20.209121704101562,
1974
+ "learning_rate": 1.2978330192781563e-05,
1975
+ "loss": 2.5031,
1976
+ "step": 23500
1977
+ },
1978
+ {
1979
+ "epoch": 2.299075478158783,
1980
+ "eval_runtime": 181.9806,
1981
+ "eval_samples_per_second": 112.331,
1982
+ "eval_steps_per_second": 14.045,
1983
+ "step": 23500
1984
+ },
1985
+ {
1986
+ "epoch": 2.308858778065842,
1987
+ "grad_norm": 67.4502182006836,
1988
+ "learning_rate": 1.2797144513697637e-05,
1989
+ "loss": 2.4491,
1990
+ "step": 23600
1991
+ },
1992
+ {
1993
+ "epoch": 2.3186420779729002,
1994
+ "grad_norm": 14.940401077270508,
1995
+ "learning_rate": 1.2615958834613712e-05,
1996
+ "loss": 2.5669,
1997
+ "step": 23700
1998
+ },
1999
+ {
2000
+ "epoch": 2.328425377879959,
2001
+ "grad_norm": 16.591793060302734,
2002
+ "learning_rate": 1.2434773155529787e-05,
2003
+ "loss": 2.4565,
2004
+ "step": 23800
2005
+ },
2006
+ {
2007
+ "epoch": 2.3382086777870175,
2008
+ "grad_norm": 16.798791885375977,
2009
+ "learning_rate": 1.2253587476445862e-05,
2010
+ "loss": 2.4046,
2011
+ "step": 23900
2012
+ },
2013
+ {
2014
+ "epoch": 2.3479919776940763,
2015
+ "grad_norm": 17.712255477905273,
2016
+ "learning_rate": 1.2072401797361937e-05,
2017
+ "loss": 2.4453,
2018
+ "step": 24000
2019
+ },
2020
+ {
2021
+ "epoch": 2.3479919776940763,
2022
+ "eval_runtime": 182.0401,
2023
+ "eval_samples_per_second": 112.294,
2024
+ "eval_steps_per_second": 14.041,
2025
+ "step": 24000
2026
+ },
2027
+ {
2028
+ "epoch": 2.3577752776011347,
2029
+ "grad_norm": 18.64284324645996,
2030
+ "learning_rate": 1.1891216118278011e-05,
2031
+ "loss": 2.3973,
2032
+ "step": 24100
2033
+ },
2034
+ {
2035
+ "epoch": 2.3675585775081935,
2036
+ "grad_norm": 18.185895919799805,
2037
+ "learning_rate": 1.1710030439194086e-05,
2038
+ "loss": 2.5045,
2039
+ "step": 24200
2040
+ },
2041
+ {
2042
+ "epoch": 2.377341877415252,
2043
+ "grad_norm": 23.201522827148438,
2044
+ "learning_rate": 1.1528844760110163e-05,
2045
+ "loss": 2.5402,
2046
+ "step": 24300
2047
+ },
2048
+ {
2049
+ "epoch": 2.3871251773223108,
2050
+ "grad_norm": 21.606412887573242,
2051
+ "learning_rate": 1.1347659081026236e-05,
2052
+ "loss": 2.4285,
2053
+ "step": 24400
2054
+ },
2055
+ {
2056
+ "epoch": 2.3969084772293696,
2057
+ "grad_norm": 16.318761825561523,
2058
+ "learning_rate": 1.116647340194231e-05,
2059
+ "loss": 2.5509,
2060
+ "step": 24500
2061
+ },
2062
+ {
2063
+ "epoch": 2.3969084772293696,
2064
+ "eval_runtime": 182.0431,
2065
+ "eval_samples_per_second": 112.292,
2066
+ "eval_steps_per_second": 14.041,
2067
+ "step": 24500
2068
+ },
2069
+ {
2070
+ "epoch": 2.406691777136428,
2071
+ "grad_norm": 17.779014587402344,
2072
+ "learning_rate": 1.0985287722858386e-05,
2073
+ "loss": 2.4245,
2074
+ "step": 24600
2075
+ },
2076
+ {
2077
+ "epoch": 2.416475077043487,
2078
+ "grad_norm": 18.44321060180664,
2079
+ "learning_rate": 1.080410204377446e-05,
2080
+ "loss": 2.5223,
2081
+ "step": 24700
2082
+ },
2083
+ {
2084
+ "epoch": 2.4262583769505452,
2085
+ "grad_norm": 24.017047882080078,
2086
+ "learning_rate": 1.0622916364690535e-05,
2087
+ "loss": 2.4846,
2088
+ "step": 24800
2089
+ },
2090
+ {
2091
+ "epoch": 2.436041676857604,
2092
+ "grad_norm": 14.89560604095459,
2093
+ "learning_rate": 1.044173068560661e-05,
2094
+ "loss": 2.5922,
2095
+ "step": 24900
2096
+ },
2097
+ {
2098
+ "epoch": 2.445824976764663,
2099
+ "grad_norm": 15.532561302185059,
2100
+ "learning_rate": 1.0260545006522685e-05,
2101
+ "loss": 2.3976,
2102
+ "step": 25000
2103
+ },
2104
+ {
2105
+ "epoch": 2.445824976764663,
2106
+ "eval_runtime": 182.1033,
2107
+ "eval_samples_per_second": 112.255,
2108
+ "eval_steps_per_second": 14.036,
2109
+ "step": 25000
2110
+ },
2111
+ {
2112
+ "epoch": 2.4556082766717213,
2113
+ "grad_norm": 18.041282653808594,
2114
+ "learning_rate": 1.007935932743876e-05,
2115
+ "loss": 2.4731,
2116
+ "step": 25100
2117
+ },
2118
+ {
2119
+ "epoch": 2.46539157657878,
2120
+ "grad_norm": 13.40858268737793,
2121
+ "learning_rate": 9.898173648354834e-06,
2122
+ "loss": 2.4838,
2123
+ "step": 25200
2124
+ },
2125
+ {
2126
+ "epoch": 2.4751748764858386,
2127
+ "grad_norm": 17.450841903686523,
2128
+ "learning_rate": 9.71698796927091e-06,
2129
+ "loss": 2.3999,
2130
+ "step": 25300
2131
+ },
2132
+ {
2133
+ "epoch": 2.4849581763928974,
2134
+ "grad_norm": 17.556467056274414,
2135
+ "learning_rate": 9.535802290186984e-06,
2136
+ "loss": 2.3867,
2137
+ "step": 25400
2138
+ },
2139
+ {
2140
+ "epoch": 2.494741476299956,
2141
+ "grad_norm": 18.578310012817383,
2142
+ "learning_rate": 9.354616611103059e-06,
2143
+ "loss": 2.4546,
2144
+ "step": 25500
2145
+ },
2146
+ {
2147
+ "epoch": 2.494741476299956,
2148
+ "eval_runtime": 182.0338,
2149
+ "eval_samples_per_second": 112.298,
2150
+ "eval_steps_per_second": 14.041,
2151
+ "step": 25500
2152
+ },
2153
+ {
2154
+ "epoch": 2.5045247762070146,
2155
+ "grad_norm": 14.936469078063965,
2156
+ "learning_rate": 9.173430932019134e-06,
2157
+ "loss": 2.5562,
2158
+ "step": 25600
2159
+ },
2160
+ {
2161
+ "epoch": 2.514308076114073,
2162
+ "grad_norm": 17.527040481567383,
2163
+ "learning_rate": 8.992245252935209e-06,
2164
+ "loss": 2.4008,
2165
+ "step": 25700
2166
+ },
2167
+ {
2168
+ "epoch": 2.524091376021132,
2169
+ "grad_norm": 12.91336727142334,
2170
+ "learning_rate": 8.811059573851283e-06,
2171
+ "loss": 2.4655,
2172
+ "step": 25800
2173
+ },
2174
+ {
2175
+ "epoch": 2.5338746759281907,
2176
+ "grad_norm": 15.168461799621582,
2177
+ "learning_rate": 8.629873894767358e-06,
2178
+ "loss": 2.4468,
2179
+ "step": 25900
2180
+ },
2181
+ {
2182
+ "epoch": 2.543657975835249,
2183
+ "grad_norm": 17.5390682220459,
2184
+ "learning_rate": 8.448688215683433e-06,
2185
+ "loss": 2.4836,
2186
+ "step": 26000
2187
+ },
2188
+ {
2189
+ "epoch": 2.543657975835249,
2190
+ "eval_runtime": 182.1148,
2191
+ "eval_samples_per_second": 112.248,
2192
+ "eval_steps_per_second": 14.035,
2193
+ "step": 26000
2194
+ },
2195
+ {
2196
+ "epoch": 2.553441275742308,
2197
+ "grad_norm": 15.126510620117188,
2198
+ "learning_rate": 8.267502536599508e-06,
2199
+ "loss": 2.387,
2200
+ "step": 26100
2201
+ },
2202
+ {
2203
+ "epoch": 2.5632245756493663,
2204
+ "grad_norm": 15.374293327331543,
2205
+ "learning_rate": 8.086316857515583e-06,
2206
+ "loss": 2.3652,
2207
+ "step": 26200
2208
+ },
2209
+ {
2210
+ "epoch": 2.573007875556425,
2211
+ "grad_norm": 15.498108863830566,
2212
+ "learning_rate": 7.905131178431657e-06,
2213
+ "loss": 2.4749,
2214
+ "step": 26300
2215
+ },
2216
+ {
2217
+ "epoch": 2.582791175463484,
2218
+ "grad_norm": 16.221315383911133,
2219
+ "learning_rate": 7.723945499347732e-06,
2220
+ "loss": 2.4567,
2221
+ "step": 26400
2222
+ },
2223
+ {
2224
+ "epoch": 2.5925744753705424,
2225
+ "grad_norm": 18.839122772216797,
2226
+ "learning_rate": 7.542759820263806e-06,
2227
+ "loss": 2.3554,
2228
+ "step": 26500
2229
+ },
2230
+ {
2231
+ "epoch": 2.5925744753705424,
2232
+ "eval_runtime": 181.9597,
2233
+ "eval_samples_per_second": 112.344,
2234
+ "eval_steps_per_second": 14.047,
2235
+ "step": 26500
2236
+ },
2237
+ {
2238
+ "epoch": 2.6023577752776013,
2239
+ "grad_norm": 22.626708984375,
2240
+ "learning_rate": 7.361574141179882e-06,
2241
+ "loss": 2.502,
2242
+ "step": 26600
2243
+ },
2244
+ {
2245
+ "epoch": 2.6121410751846597,
2246
+ "grad_norm": 16.519880294799805,
2247
+ "learning_rate": 7.180388462095957e-06,
2248
+ "loss": 2.5034,
2249
+ "step": 26700
2250
+ },
2251
+ {
2252
+ "epoch": 2.6219243750917185,
2253
+ "grad_norm": 27.421489715576172,
2254
+ "learning_rate": 6.999202783012031e-06,
2255
+ "loss": 2.5276,
2256
+ "step": 26800
2257
+ },
2258
+ {
2259
+ "epoch": 2.6317076749987773,
2260
+ "grad_norm": 15.274630546569824,
2261
+ "learning_rate": 6.8180171039281055e-06,
2262
+ "loss": 2.4121,
2263
+ "step": 26900
2264
+ },
2265
+ {
2266
+ "epoch": 2.6414909749058357,
2267
+ "grad_norm": 15.751582145690918,
2268
+ "learning_rate": 6.636831424844181e-06,
2269
+ "loss": 2.5799,
2270
+ "step": 27000
2271
+ },
2272
+ {
2273
+ "epoch": 2.6414909749058357,
2274
+ "eval_runtime": 182.0873,
2275
+ "eval_samples_per_second": 112.265,
2276
+ "eval_steps_per_second": 14.037,
2277
+ "step": 27000
2278
+ },
2279
+ {
2280
+ "epoch": 2.651274274812894,
2281
+ "grad_norm": 16.674850463867188,
2282
+ "learning_rate": 6.455645745760255e-06,
2283
+ "loss": 2.3872,
2284
+ "step": 27100
2285
+ },
2286
+ {
2287
+ "epoch": 2.661057574719953,
2288
+ "grad_norm": 12.62803840637207,
2289
+ "learning_rate": 6.27446006667633e-06,
2290
+ "loss": 2.4,
2291
+ "step": 27200
2292
+ },
2293
+ {
2294
+ "epoch": 2.670840874627012,
2295
+ "grad_norm": 18.055158615112305,
2296
+ "learning_rate": 6.093274387592405e-06,
2297
+ "loss": 2.4681,
2298
+ "step": 27300
2299
+ },
2300
+ {
2301
+ "epoch": 2.68062417453407,
2302
+ "grad_norm": 17.21278190612793,
2303
+ "learning_rate": 5.91208870850848e-06,
2304
+ "loss": 2.5441,
2305
+ "step": 27400
2306
+ },
2307
+ {
2308
+ "epoch": 2.690407474441129,
2309
+ "grad_norm": 20.945236206054688,
2310
+ "learning_rate": 5.7309030294245544e-06,
2311
+ "loss": 2.4388,
2312
+ "step": 27500
2313
+ },
2314
+ {
2315
+ "epoch": 2.690407474441129,
2316
+ "eval_runtime": 182.1279,
2317
+ "eval_samples_per_second": 112.24,
2318
+ "eval_steps_per_second": 14.034,
2319
+ "step": 27500
2320
+ },
2321
+ {
2322
+ "epoch": 2.7001907743481874,
2323
+ "grad_norm": 23.483661651611328,
2324
+ "learning_rate": 5.549717350340629e-06,
2325
+ "loss": 2.4589,
2326
+ "step": 27600
2327
+ },
2328
+ {
2329
+ "epoch": 2.7099740742552463,
2330
+ "grad_norm": 17.954036712646484,
2331
+ "learning_rate": 5.368531671256704e-06,
2332
+ "loss": 2.4477,
2333
+ "step": 27700
2334
+ },
2335
+ {
2336
+ "epoch": 2.719757374162305,
2337
+ "grad_norm": 16.187314987182617,
2338
+ "learning_rate": 5.187345992172779e-06,
2339
+ "loss": 2.4967,
2340
+ "step": 27800
2341
+ },
2342
+ {
2343
+ "epoch": 2.7295406740693635,
2344
+ "grad_norm": 14.324910163879395,
2345
+ "learning_rate": 5.006160313088854e-06,
2346
+ "loss": 2.3921,
2347
+ "step": 27900
2348
+ },
2349
+ {
2350
+ "epoch": 2.7393239739764224,
2351
+ "grad_norm": 20.81557846069336,
2352
+ "learning_rate": 4.8249746340049285e-06,
2353
+ "loss": 2.5201,
2354
+ "step": 28000
2355
+ },
2356
+ {
2357
+ "epoch": 2.7393239739764224,
2358
+ "eval_runtime": 182.146,
2359
+ "eval_samples_per_second": 112.229,
2360
+ "eval_steps_per_second": 14.033,
2361
+ "step": 28000
2362
+ },
2363
+ {
2364
+ "epoch": 2.7491072738834808,
2365
+ "grad_norm": 18.682844161987305,
2366
+ "learning_rate": 4.643788954921003e-06,
2367
+ "loss": 2.4325,
2368
+ "step": 28100
2369
+ },
2370
+ {
2371
+ "epoch": 2.7588905737905396,
2372
+ "grad_norm": 16.227272033691406,
2373
+ "learning_rate": 4.462603275837078e-06,
2374
+ "loss": 2.3864,
2375
+ "step": 28200
2376
+ },
2377
+ {
2378
+ "epoch": 2.7686738736975984,
2379
+ "grad_norm": 16.20302963256836,
2380
+ "learning_rate": 4.281417596753152e-06,
2381
+ "loss": 2.5296,
2382
+ "step": 28300
2383
+ },
2384
+ {
2385
+ "epoch": 2.778457173604657,
2386
+ "grad_norm": 18.634096145629883,
2387
+ "learning_rate": 4.100231917669228e-06,
2388
+ "loss": 2.4514,
2389
+ "step": 28400
2390
+ },
2391
+ {
2392
+ "epoch": 2.7882404735117157,
2393
+ "grad_norm": 13.040008544921875,
2394
+ "learning_rate": 3.919046238585303e-06,
2395
+ "loss": 2.3661,
2396
+ "step": 28500
2397
+ },
2398
+ {
2399
+ "epoch": 2.7882404735117157,
2400
+ "eval_runtime": 181.9164,
2401
+ "eval_samples_per_second": 112.37,
2402
+ "eval_steps_per_second": 14.05,
2403
+ "step": 28500
2404
+ },
2405
+ {
2406
+ "epoch": 2.798023773418774,
2407
+ "grad_norm": 14.142943382263184,
2408
+ "learning_rate": 3.737860559501377e-06,
2409
+ "loss": 2.5074,
2410
+ "step": 28600
2411
+ },
2412
+ {
2413
+ "epoch": 2.807807073325833,
2414
+ "grad_norm": 17.934324264526367,
2415
+ "learning_rate": 3.5566748804174523e-06,
2416
+ "loss": 2.4224,
2417
+ "step": 28700
2418
+ },
2419
+ {
2420
+ "epoch": 2.8175903732328913,
2421
+ "grad_norm": 14.450194358825684,
2422
+ "learning_rate": 3.3754892013335267e-06,
2423
+ "loss": 2.4949,
2424
+ "step": 28800
2425
+ },
2426
+ {
2427
+ "epoch": 2.82737367313995,
2428
+ "grad_norm": 17.746837615966797,
2429
+ "learning_rate": 3.194303522249602e-06,
2430
+ "loss": 2.4153,
2431
+ "step": 28900
2432
+ },
2433
+ {
2434
+ "epoch": 2.8371569730470085,
2435
+ "grad_norm": 13.962541580200195,
2436
+ "learning_rate": 3.0131178431656763e-06,
2437
+ "loss": 2.4804,
2438
+ "step": 29000
2439
+ },
2440
+ {
2441
+ "epoch": 2.8371569730470085,
2442
+ "eval_runtime": 182.0262,
2443
+ "eval_samples_per_second": 112.303,
2444
+ "eval_steps_per_second": 14.042,
2445
+ "step": 29000
2446
+ },
2447
+ {
2448
+ "epoch": 2.8469402729540674,
2449
+ "grad_norm": 16.669286727905273,
2450
+ "learning_rate": 2.831932164081751e-06,
2451
+ "loss": 2.5397,
2452
+ "step": 29100
2453
+ },
2454
+ {
2455
+ "epoch": 2.856723572861126,
2456
+ "grad_norm": 15.421733856201172,
2457
+ "learning_rate": 2.650746484997826e-06,
2458
+ "loss": 2.4175,
2459
+ "step": 29200
2460
+ },
2461
+ {
2462
+ "epoch": 2.8665068727681846,
2463
+ "grad_norm": 14.135702133178711,
2464
+ "learning_rate": 2.4695608059139007e-06,
2465
+ "loss": 2.5069,
2466
+ "step": 29300
2467
+ },
2468
+ {
2469
+ "epoch": 2.8762901726752435,
2470
+ "grad_norm": 17.41412925720215,
2471
+ "learning_rate": 2.2883751268299756e-06,
2472
+ "loss": 2.3997,
2473
+ "step": 29400
2474
+ },
2475
+ {
2476
+ "epoch": 2.886073472582302,
2477
+ "grad_norm": 14.824533462524414,
2478
+ "learning_rate": 2.1071894477460504e-06,
2479
+ "loss": 2.3945,
2480
+ "step": 29500
2481
+ },
2482
+ {
2483
+ "epoch": 2.886073472582302,
2484
+ "eval_runtime": 181.9299,
2485
+ "eval_samples_per_second": 112.362,
2486
+ "eval_steps_per_second": 14.049,
2487
+ "step": 29500
2488
+ },
2489
+ {
2490
+ "epoch": 2.8958567724893607,
2491
+ "grad_norm": 27.31865119934082,
2492
+ "learning_rate": 1.926003768662125e-06,
2493
+ "loss": 2.45,
2494
+ "step": 29600
2495
+ },
2496
+ {
2497
+ "epoch": 2.9056400723964195,
2498
+ "grad_norm": 18.966655731201172,
2499
+ "learning_rate": 1.7448180895781998e-06,
2500
+ "loss": 2.3916,
2501
+ "step": 29700
2502
+ },
2503
+ {
2504
+ "epoch": 2.915423372303478,
2505
+ "grad_norm": 18.538440704345703,
2506
+ "learning_rate": 1.5636324104942746e-06,
2507
+ "loss": 2.4625,
2508
+ "step": 29800
2509
+ },
2510
+ {
2511
+ "epoch": 2.9252066722105368,
2512
+ "grad_norm": 21.757272720336914,
2513
+ "learning_rate": 1.3824467314103494e-06,
2514
+ "loss": 2.3722,
2515
+ "step": 29900
2516
+ },
2517
+ {
2518
+ "epoch": 2.934989972117595,
2519
+ "grad_norm": 16.907358169555664,
2520
+ "learning_rate": 1.201261052326424e-06,
2521
+ "loss": 2.464,
2522
+ "step": 30000
2523
+ },
2524
+ {
2525
+ "epoch": 2.934989972117595,
2526
+ "eval_runtime": 181.9148,
2527
+ "eval_samples_per_second": 112.371,
2528
+ "eval_steps_per_second": 14.051,
2529
+ "step": 30000
2530
+ },
2531
+ {
2532
+ "epoch": 2.944773272024654,
2533
+ "grad_norm": 13.88399600982666,
2534
+ "learning_rate": 1.0200753732424989e-06,
2535
+ "loss": 2.5005,
2536
+ "step": 30100
2537
+ },
2538
+ {
2539
+ "epoch": 2.954556571931713,
2540
+ "grad_norm": 19.77507781982422,
2541
+ "learning_rate": 8.388896941585737e-07,
2542
+ "loss": 2.3829,
2543
+ "step": 30200
2544
+ },
2545
+ {
2546
+ "epoch": 2.9643398718387712,
2547
+ "grad_norm": 16.535932540893555,
2548
+ "learning_rate": 6.577040150746485e-07,
2549
+ "loss": 2.4788,
2550
+ "step": 30300
2551
+ },
2552
+ {
2553
+ "epoch": 2.9741231717458296,
2554
+ "grad_norm": 15.027000427246094,
2555
+ "learning_rate": 4.765183359907233e-07,
2556
+ "loss": 2.5007,
2557
+ "step": 30400
2558
+ },
2559
+ {
2560
+ "epoch": 2.9839064716528885,
2561
+ "grad_norm": 14.9392671585083,
2562
+ "learning_rate": 2.953326569067981e-07,
2563
+ "loss": 2.4847,
2564
+ "step": 30500
2565
+ },
2566
+ {
2567
+ "epoch": 2.9839064716528885,
2568
+ "eval_runtime": 181.9853,
2569
+ "eval_samples_per_second": 112.328,
2570
+ "eval_steps_per_second": 14.045,
2571
+ "step": 30500
2572
+ }
2573
+ ],
2574
+ "logging_steps": 100,
2575
+ "max_steps": 30663,
2576
+ "num_input_tokens_seen": 0,
2577
+ "num_train_epochs": 3,
2578
+ "save_steps": 500,
2579
+ "stateful_callbacks": {
2580
+ "TrainerControl": {
2581
+ "args": {
2582
+ "should_epoch_stop": false,
2583
+ "should_evaluate": false,
2584
+ "should_log": false,
2585
+ "should_save": true,
2586
+ "should_training_stop": false
2587
+ },
2588
+ "attributes": {}
2589
+ }
2590
+ },
2591
+ "total_flos": 1.0644527086729788e+16,
2592
+ "train_batch_size": 8,
2593
+ "trial_name": null,
2594
+ "trial_params": null
2595
+ }
train_data/muril_ch_domain/checkpoint-30500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c39b3134e7e5628432a425a5873f74f59d631bf591a12adab52f3ba906ae6906
3
+ size 5304
train_data/muril_ch_domain/checkpoint-30663/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/muril-base-cased
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.13.2
train_data/muril_ch_domain/checkpoint-30663/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": {
4
+ "base_model_class": "BertForMaskedLM",
5
+ "parent_library": "transformers.models.bert.modeling_bert"
6
+ },
7
+ "base_model_name_or_path": "google/muril-base-cased",
8
+ "bias": "none",
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 128,
17
+ "lora_dropout": 0.1,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 64,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "key",
27
+ "query",
28
+ "value",
29
+ "dense"
30
+ ],
31
+ "task_type": null,
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
train_data/muril_ch_domain/checkpoint-30663/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d84ccb91fb20f58aa9778952c060eea2273c69c0e40d164972562ff8c0ead9d
3
+ size 42881168
train_data/muril_ch_domain/checkpoint-30663/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ed5ec095ef64893bcddcfeca72b34c0cfd61506702bf2fcf85933f888ec0e1d
3
+ size 85843898
train_data/muril_ch_domain/checkpoint-30663/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c845563310a221ad0236804a9577e46a5928023e42425fd9e7226668ec75f0f
3
+ size 14244
train_data/muril_ch_domain/checkpoint-30663/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cbeb707e99493c97d2cffe0b096a573843cb8f847ed6fbc608d5da54abe9076
3
+ size 1064
train_data/muril_ch_domain/checkpoint-30663/trainer_state.json ADDED
@@ -0,0 +1,2602 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.999853250501394,
5
+ "eval_steps": 500,
6
+ "global_step": 30663,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009783299907058651,
13
+ "grad_norm": 33.780609130859375,
14
+ "learning_rate": 1.6302575806977503e-06,
15
+ "loss": 6.574,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.019566599814117302,
20
+ "grad_norm": 33.439022064208984,
21
+ "learning_rate": 3.2605151613955006e-06,
22
+ "loss": 6.1653,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.02934989972117595,
27
+ "grad_norm": 27.25751304626465,
28
+ "learning_rate": 4.890772742093251e-06,
29
+ "loss": 5.5515,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.039133199628234604,
34
+ "grad_norm": 38.36979675292969,
35
+ "learning_rate": 6.521030322791001e-06,
36
+ "loss": 5.0531,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.048916499535293256,
41
+ "grad_norm": 29.350488662719727,
42
+ "learning_rate": 8.15128790348875e-06,
43
+ "loss": 4.9225,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.048916499535293256,
48
+ "eval_runtime": 181.5812,
49
+ "eval_samples_per_second": 112.578,
50
+ "eval_steps_per_second": 14.076,
51
+ "step": 500
52
+ },
53
+ {
54
+ "epoch": 0.0586997994423519,
55
+ "grad_norm": 33.02122497558594,
56
+ "learning_rate": 9.781545484186502e-06,
57
+ "loss": 4.8186,
58
+ "step": 600
59
+ },
60
+ {
61
+ "epoch": 0.06848309934941056,
62
+ "grad_norm": 42.41593933105469,
63
+ "learning_rate": 1.1411803064884251e-05,
64
+ "loss": 4.5769,
65
+ "step": 700
66
+ },
67
+ {
68
+ "epoch": 0.07826639925646921,
69
+ "grad_norm": 40.29044723510742,
70
+ "learning_rate": 1.3042060645582003e-05,
71
+ "loss": 4.3963,
72
+ "step": 800
73
+ },
74
+ {
75
+ "epoch": 0.08804969916352785,
76
+ "grad_norm": 38.0811653137207,
77
+ "learning_rate": 1.4672318226279752e-05,
78
+ "loss": 4.3393,
79
+ "step": 900
80
+ },
81
+ {
82
+ "epoch": 0.09783299907058651,
83
+ "grad_norm": 36.08370590209961,
84
+ "learning_rate": 1.63025758069775e-05,
85
+ "loss": 4.2421,
86
+ "step": 1000
87
+ },
88
+ {
89
+ "epoch": 0.09783299907058651,
90
+ "eval_runtime": 181.886,
91
+ "eval_samples_per_second": 112.389,
92
+ "eval_steps_per_second": 14.053,
93
+ "step": 1000
94
+ },
95
+ {
96
+ "epoch": 0.10761629897764516,
97
+ "grad_norm": 37.253684997558594,
98
+ "learning_rate": 1.7932833387675256e-05,
99
+ "loss": 4.1156,
100
+ "step": 1100
101
+ },
102
+ {
103
+ "epoch": 0.1173995988847038,
104
+ "grad_norm": 33.003475189208984,
105
+ "learning_rate": 1.9563090968373004e-05,
106
+ "loss": 4.0112,
107
+ "step": 1200
108
+ },
109
+ {
110
+ "epoch": 0.12718289879176245,
111
+ "grad_norm": 30.727867126464844,
112
+ "learning_rate": 2.1193348549070755e-05,
113
+ "loss": 3.9969,
114
+ "step": 1300
115
+ },
116
+ {
117
+ "epoch": 0.13696619869882112,
118
+ "grad_norm": 37.471092224121094,
119
+ "learning_rate": 2.2823606129768503e-05,
120
+ "loss": 3.874,
121
+ "step": 1400
122
+ },
123
+ {
124
+ "epoch": 0.14674949860587977,
125
+ "grad_norm": 42.32167434692383,
126
+ "learning_rate": 2.4453863710466254e-05,
127
+ "loss": 3.8518,
128
+ "step": 1500
129
+ },
130
+ {
131
+ "epoch": 0.14674949860587977,
132
+ "eval_runtime": 181.9332,
133
+ "eval_samples_per_second": 112.36,
134
+ "eval_steps_per_second": 14.049,
135
+ "step": 1500
136
+ },
137
+ {
138
+ "epoch": 0.15653279851293841,
139
+ "grad_norm": 38.00124740600586,
140
+ "learning_rate": 2.6084121291164005e-05,
141
+ "loss": 3.918,
142
+ "step": 1600
143
+ },
144
+ {
145
+ "epoch": 0.16631609841999706,
146
+ "grad_norm": 44.637386322021484,
147
+ "learning_rate": 2.7714378871861756e-05,
148
+ "loss": 3.9134,
149
+ "step": 1700
150
+ },
151
+ {
152
+ "epoch": 0.1760993983270557,
153
+ "grad_norm": 49.578609466552734,
154
+ "learning_rate": 2.9344636452559504e-05,
155
+ "loss": 3.7507,
156
+ "step": 1800
157
+ },
158
+ {
159
+ "epoch": 0.18588269823411438,
160
+ "grad_norm": 36.65715789794922,
161
+ "learning_rate": 3.0974894033257255e-05,
162
+ "loss": 3.7551,
163
+ "step": 1900
164
+ },
165
+ {
166
+ "epoch": 0.19566599814117303,
167
+ "grad_norm": 36.873443603515625,
168
+ "learning_rate": 3.2605151613955e-05,
169
+ "loss": 3.6951,
170
+ "step": 2000
171
+ },
172
+ {
173
+ "epoch": 0.19566599814117303,
174
+ "eval_runtime": 181.8273,
175
+ "eval_samples_per_second": 112.425,
176
+ "eval_steps_per_second": 14.057,
177
+ "step": 2000
178
+ },
179
+ {
180
+ "epoch": 0.20544929804823167,
181
+ "grad_norm": 33.025413513183594,
182
+ "learning_rate": 3.423540919465276e-05,
183
+ "loss": 3.6603,
184
+ "step": 2100
185
+ },
186
+ {
187
+ "epoch": 0.21523259795529032,
188
+ "grad_norm": 30.105051040649414,
189
+ "learning_rate": 3.586566677535051e-05,
190
+ "loss": 3.525,
191
+ "step": 2200
192
+ },
193
+ {
194
+ "epoch": 0.22501589786234896,
195
+ "grad_norm": 34.5129280090332,
196
+ "learning_rate": 3.749592435604825e-05,
197
+ "loss": 3.6454,
198
+ "step": 2300
199
+ },
200
+ {
201
+ "epoch": 0.2347991977694076,
202
+ "grad_norm": 33.16934585571289,
203
+ "learning_rate": 3.912618193674601e-05,
204
+ "loss": 3.6356,
205
+ "step": 2400
206
+ },
207
+ {
208
+ "epoch": 0.24458249767646628,
209
+ "grad_norm": 33.5789794921875,
210
+ "learning_rate": 4.0756439517443756e-05,
211
+ "loss": 3.5605,
212
+ "step": 2500
213
+ },
214
+ {
215
+ "epoch": 0.24458249767646628,
216
+ "eval_runtime": 181.7254,
217
+ "eval_samples_per_second": 112.488,
218
+ "eval_steps_per_second": 14.065,
219
+ "step": 2500
220
+ },
221
+ {
222
+ "epoch": 0.2543657975835249,
223
+ "grad_norm": 34.30876159667969,
224
+ "learning_rate": 4.238669709814151e-05,
225
+ "loss": 3.5447,
226
+ "step": 2600
227
+ },
228
+ {
229
+ "epoch": 0.2641490974905836,
230
+ "grad_norm": 29.907989501953125,
231
+ "learning_rate": 4.401695467883926e-05,
232
+ "loss": 3.5116,
233
+ "step": 2700
234
+ },
235
+ {
236
+ "epoch": 0.27393239739764225,
237
+ "grad_norm": 34.08231735229492,
238
+ "learning_rate": 4.5647212259537006e-05,
239
+ "loss": 3.4941,
240
+ "step": 2800
241
+ },
242
+ {
243
+ "epoch": 0.28371569730470086,
244
+ "grad_norm": 25.034149169921875,
245
+ "learning_rate": 4.727746984023476e-05,
246
+ "loss": 3.4863,
247
+ "step": 2900
248
+ },
249
+ {
250
+ "epoch": 0.29349899721175954,
251
+ "grad_norm": 32.21685028076172,
252
+ "learning_rate": 4.890772742093251e-05,
253
+ "loss": 3.5096,
254
+ "step": 3000
255
+ },
256
+ {
257
+ "epoch": 0.29349899721175954,
258
+ "eval_runtime": 181.6612,
259
+ "eval_samples_per_second": 112.528,
260
+ "eval_steps_per_second": 14.07,
261
+ "step": 3000
262
+ },
263
+ {
264
+ "epoch": 0.30328229711881816,
265
+ "grad_norm": 24.290380477905273,
266
+ "learning_rate": 4.9940208725902305e-05,
267
+ "loss": 3.3867,
268
+ "step": 3100
269
+ },
270
+ {
271
+ "epoch": 0.31306559702587683,
272
+ "grad_norm": 22.924575805664062,
273
+ "learning_rate": 4.975902304681838e-05,
274
+ "loss": 3.398,
275
+ "step": 3200
276
+ },
277
+ {
278
+ "epoch": 0.3228488969329355,
279
+ "grad_norm": 19.540430068969727,
280
+ "learning_rate": 4.957783736773446e-05,
281
+ "loss": 3.3727,
282
+ "step": 3300
283
+ },
284
+ {
285
+ "epoch": 0.3326321968399941,
286
+ "grad_norm": 22.529376983642578,
287
+ "learning_rate": 4.939665168865053e-05,
288
+ "loss": 3.3364,
289
+ "step": 3400
290
+ },
291
+ {
292
+ "epoch": 0.3424154967470528,
293
+ "grad_norm": 20.821264266967773,
294
+ "learning_rate": 4.921546600956661e-05,
295
+ "loss": 3.3126,
296
+ "step": 3500
297
+ },
298
+ {
299
+ "epoch": 0.3424154967470528,
300
+ "eval_runtime": 181.7582,
301
+ "eval_samples_per_second": 112.468,
302
+ "eval_steps_per_second": 14.063,
303
+ "step": 3500
304
+ },
305
+ {
306
+ "epoch": 0.3521987966541114,
307
+ "grad_norm": 24.346153259277344,
308
+ "learning_rate": 4.903428033048268e-05,
309
+ "loss": 3.2678,
310
+ "step": 3600
311
+ },
312
+ {
313
+ "epoch": 0.3619820965611701,
314
+ "grad_norm": 19.89035415649414,
315
+ "learning_rate": 4.8853094651398754e-05,
316
+ "loss": 3.3233,
317
+ "step": 3700
318
+ },
319
+ {
320
+ "epoch": 0.37176539646822876,
321
+ "grad_norm": 17.938880920410156,
322
+ "learning_rate": 4.8671908972314825e-05,
323
+ "loss": 3.2822,
324
+ "step": 3800
325
+ },
326
+ {
327
+ "epoch": 0.3815486963752874,
328
+ "grad_norm": 16.92071533203125,
329
+ "learning_rate": 4.84907232932309e-05,
330
+ "loss": 3.2254,
331
+ "step": 3900
332
+ },
333
+ {
334
+ "epoch": 0.39133199628234605,
335
+ "grad_norm": 18.241249084472656,
336
+ "learning_rate": 4.830953761414698e-05,
337
+ "loss": 3.2116,
338
+ "step": 4000
339
+ },
340
+ {
341
+ "epoch": 0.39133199628234605,
342
+ "eval_runtime": 182.8906,
343
+ "eval_samples_per_second": 111.772,
344
+ "eval_steps_per_second": 13.976,
345
+ "step": 4000
346
+ },
347
+ {
348
+ "epoch": 0.40111529618940467,
349
+ "grad_norm": 17.56020736694336,
350
+ "learning_rate": 4.812835193506305e-05,
351
+ "loss": 3.2232,
352
+ "step": 4100
353
+ },
354
+ {
355
+ "epoch": 0.41089859609646334,
356
+ "grad_norm": 17.81117057800293,
357
+ "learning_rate": 4.794716625597913e-05,
358
+ "loss": 3.1936,
359
+ "step": 4200
360
+ },
361
+ {
362
+ "epoch": 0.420681896003522,
363
+ "grad_norm": 19.89581871032715,
364
+ "learning_rate": 4.77659805768952e-05,
365
+ "loss": 3.1443,
366
+ "step": 4300
367
+ },
368
+ {
369
+ "epoch": 0.43046519591058063,
370
+ "grad_norm": 22.968582153320312,
371
+ "learning_rate": 4.758479489781128e-05,
372
+ "loss": 3.2084,
373
+ "step": 4400
374
+ },
375
+ {
376
+ "epoch": 0.4402484958176393,
377
+ "grad_norm": 17.119598388671875,
378
+ "learning_rate": 4.740360921872735e-05,
379
+ "loss": 3.1263,
380
+ "step": 4500
381
+ },
382
+ {
383
+ "epoch": 0.4402484958176393,
384
+ "eval_runtime": 182.3246,
385
+ "eval_samples_per_second": 112.119,
386
+ "eval_steps_per_second": 14.019,
387
+ "step": 4500
388
+ },
389
+ {
390
+ "epoch": 0.4500317957246979,
391
+ "grad_norm": 19.294527053833008,
392
+ "learning_rate": 4.722242353964343e-05,
393
+ "loss": 3.1327,
394
+ "step": 4600
395
+ },
396
+ {
397
+ "epoch": 0.4598150956317566,
398
+ "grad_norm": 16.941057205200195,
399
+ "learning_rate": 4.704123786055951e-05,
400
+ "loss": 3.0944,
401
+ "step": 4700
402
+ },
403
+ {
404
+ "epoch": 0.4695983955388152,
405
+ "grad_norm": 22.43411636352539,
406
+ "learning_rate": 4.686005218147558e-05,
407
+ "loss": 3.1093,
408
+ "step": 4800
409
+ },
410
+ {
411
+ "epoch": 0.4793816954458739,
412
+ "grad_norm": 19.64097023010254,
413
+ "learning_rate": 4.667886650239166e-05,
414
+ "loss": 3.0597,
415
+ "step": 4900
416
+ },
417
+ {
418
+ "epoch": 0.48916499535293256,
419
+ "grad_norm": 19.343788146972656,
420
+ "learning_rate": 4.649768082330773e-05,
421
+ "loss": 3.1659,
422
+ "step": 5000
423
+ },
424
+ {
425
+ "epoch": 0.48916499535293256,
426
+ "eval_runtime": 181.8771,
427
+ "eval_samples_per_second": 112.395,
428
+ "eval_steps_per_second": 14.053,
429
+ "step": 5000
430
+ },
431
+ {
432
+ "epoch": 0.4989482952599912,
433
+ "grad_norm": 19.657760620117188,
434
+ "learning_rate": 4.63164951442238e-05,
435
+ "loss": 3.0506,
436
+ "step": 5100
437
+ },
438
+ {
439
+ "epoch": 0.5087315951670498,
440
+ "grad_norm": 16.2425537109375,
441
+ "learning_rate": 4.613530946513987e-05,
442
+ "loss": 3.0524,
443
+ "step": 5200
444
+ },
445
+ {
446
+ "epoch": 0.5185148950741085,
447
+ "grad_norm": 19.64779281616211,
448
+ "learning_rate": 4.595412378605595e-05,
449
+ "loss": 2.9995,
450
+ "step": 5300
451
+ },
452
+ {
453
+ "epoch": 0.5282981949811671,
454
+ "grad_norm": 17.29520606994629,
455
+ "learning_rate": 4.577293810697203e-05,
456
+ "loss": 3.0932,
457
+ "step": 5400
458
+ },
459
+ {
460
+ "epoch": 0.5380814948882258,
461
+ "grad_norm": 17.694602966308594,
462
+ "learning_rate": 4.55917524278881e-05,
463
+ "loss": 3.0309,
464
+ "step": 5500
465
+ },
466
+ {
467
+ "epoch": 0.5380814948882258,
468
+ "eval_runtime": 181.7231,
469
+ "eval_samples_per_second": 112.49,
470
+ "eval_steps_per_second": 14.065,
471
+ "step": 5500
472
+ },
473
+ {
474
+ "epoch": 0.5478647947952845,
475
+ "grad_norm": 21.030174255371094,
476
+ "learning_rate": 4.541056674880418e-05,
477
+ "loss": 3.0313,
478
+ "step": 5600
479
+ },
480
+ {
481
+ "epoch": 0.5576480947023431,
482
+ "grad_norm": 12.339129447937012,
483
+ "learning_rate": 4.522938106972025e-05,
484
+ "loss": 3.047,
485
+ "step": 5700
486
+ },
487
+ {
488
+ "epoch": 0.5674313946094017,
489
+ "grad_norm": 16.496389389038086,
490
+ "learning_rate": 4.504819539063633e-05,
491
+ "loss": 2.9961,
492
+ "step": 5800
493
+ },
494
+ {
495
+ "epoch": 0.5772146945164603,
496
+ "grad_norm": 15.456297874450684,
497
+ "learning_rate": 4.48670097115524e-05,
498
+ "loss": 2.9821,
499
+ "step": 5900
500
+ },
501
+ {
502
+ "epoch": 0.5869979944235191,
503
+ "grad_norm": 17.8603572845459,
504
+ "learning_rate": 4.468582403246848e-05,
505
+ "loss": 2.9294,
506
+ "step": 6000
507
+ },
508
+ {
509
+ "epoch": 0.5869979944235191,
510
+ "eval_runtime": 181.8258,
511
+ "eval_samples_per_second": 112.426,
512
+ "eval_steps_per_second": 14.057,
513
+ "step": 6000
514
+ },
515
+ {
516
+ "epoch": 0.5967812943305777,
517
+ "grad_norm": 18.85349464416504,
518
+ "learning_rate": 4.450463835338455e-05,
519
+ "loss": 2.9929,
520
+ "step": 6100
521
+ },
522
+ {
523
+ "epoch": 0.6065645942376363,
524
+ "grad_norm": 22.971813201904297,
525
+ "learning_rate": 4.432345267430063e-05,
526
+ "loss": 2.9684,
527
+ "step": 6200
528
+ },
529
+ {
530
+ "epoch": 0.616347894144695,
531
+ "grad_norm": 15.877230644226074,
532
+ "learning_rate": 4.4142266995216706e-05,
533
+ "loss": 2.9399,
534
+ "step": 6300
535
+ },
536
+ {
537
+ "epoch": 0.6261311940517537,
538
+ "grad_norm": 19.847482681274414,
539
+ "learning_rate": 4.396108131613278e-05,
540
+ "loss": 2.88,
541
+ "step": 6400
542
+ },
543
+ {
544
+ "epoch": 0.6359144939588123,
545
+ "grad_norm": 15.004170417785645,
546
+ "learning_rate": 4.377989563704885e-05,
547
+ "loss": 2.9719,
548
+ "step": 6500
549
+ },
550
+ {
551
+ "epoch": 0.6359144939588123,
552
+ "eval_runtime": 182.6045,
553
+ "eval_samples_per_second": 111.947,
554
+ "eval_steps_per_second": 13.997,
555
+ "step": 6500
556
+ },
557
+ {
558
+ "epoch": 0.645697793865871,
559
+ "grad_norm": 19.473665237426758,
560
+ "learning_rate": 4.359870995796492e-05,
561
+ "loss": 2.9246,
562
+ "step": 6600
563
+ },
564
+ {
565
+ "epoch": 0.6554810937729296,
566
+ "grad_norm": 18.071683883666992,
567
+ "learning_rate": 4.3417524278881e-05,
568
+ "loss": 2.9031,
569
+ "step": 6700
570
+ },
571
+ {
572
+ "epoch": 0.6652643936799882,
573
+ "grad_norm": 17.544504165649414,
574
+ "learning_rate": 4.323633859979707e-05,
575
+ "loss": 2.8313,
576
+ "step": 6800
577
+ },
578
+ {
579
+ "epoch": 0.6750476935870469,
580
+ "grad_norm": 18.936140060424805,
581
+ "learning_rate": 4.305515292071315e-05,
582
+ "loss": 2.8536,
583
+ "step": 6900
584
+ },
585
+ {
586
+ "epoch": 0.6848309934941056,
587
+ "grad_norm": 14.77696418762207,
588
+ "learning_rate": 4.2873967241629226e-05,
589
+ "loss": 2.9104,
590
+ "step": 7000
591
+ },
592
+ {
593
+ "epoch": 0.6848309934941056,
594
+ "eval_runtime": 181.938,
595
+ "eval_samples_per_second": 112.357,
596
+ "eval_steps_per_second": 14.049,
597
+ "step": 7000
598
+ },
599
+ {
600
+ "epoch": 0.6946142934011642,
601
+ "grad_norm": 14.303226470947266,
602
+ "learning_rate": 4.26927815625453e-05,
603
+ "loss": 2.8386,
604
+ "step": 7100
605
+ },
606
+ {
607
+ "epoch": 0.7043975933082228,
608
+ "grad_norm": 17.11782455444336,
609
+ "learning_rate": 4.2511595883461376e-05,
610
+ "loss": 2.9013,
611
+ "step": 7200
612
+ },
613
+ {
614
+ "epoch": 0.7141808932152816,
615
+ "grad_norm": 18.661100387573242,
616
+ "learning_rate": 4.233041020437745e-05,
617
+ "loss": 2.9428,
618
+ "step": 7300
619
+ },
620
+ {
621
+ "epoch": 0.7239641931223402,
622
+ "grad_norm": 15.535719871520996,
623
+ "learning_rate": 4.2149224525293525e-05,
624
+ "loss": 2.8582,
625
+ "step": 7400
626
+ },
627
+ {
628
+ "epoch": 0.7337474930293988,
629
+ "grad_norm": 15.3306303024292,
630
+ "learning_rate": 4.19680388462096e-05,
631
+ "loss": 2.8896,
632
+ "step": 7500
633
+ },
634
+ {
635
+ "epoch": 0.7337474930293988,
636
+ "eval_runtime": 181.8938,
637
+ "eval_samples_per_second": 112.384,
638
+ "eval_steps_per_second": 14.052,
639
+ "step": 7500
640
+ },
641
+ {
642
+ "epoch": 0.7435307929364575,
643
+ "grad_norm": 16.730344772338867,
644
+ "learning_rate": 4.1786853167125675e-05,
645
+ "loss": 2.9097,
646
+ "step": 7600
647
+ },
648
+ {
649
+ "epoch": 0.7533140928435161,
650
+ "grad_norm": 18.755483627319336,
651
+ "learning_rate": 4.1605667488041746e-05,
652
+ "loss": 2.8815,
653
+ "step": 7700
654
+ },
655
+ {
656
+ "epoch": 0.7630973927505748,
657
+ "grad_norm": 18.737581253051758,
658
+ "learning_rate": 4.1424481808957824e-05,
659
+ "loss": 2.9202,
660
+ "step": 7800
661
+ },
662
+ {
663
+ "epoch": 0.7728806926576334,
664
+ "grad_norm": 14.711681365966797,
665
+ "learning_rate": 4.1243296129873896e-05,
666
+ "loss": 2.806,
667
+ "step": 7900
668
+ },
669
+ {
670
+ "epoch": 0.7826639925646921,
671
+ "grad_norm": 17.5069580078125,
672
+ "learning_rate": 4.106211045078997e-05,
673
+ "loss": 2.8576,
674
+ "step": 8000
675
+ },
676
+ {
677
+ "epoch": 0.7826639925646921,
678
+ "eval_runtime": 181.9442,
679
+ "eval_samples_per_second": 112.353,
680
+ "eval_steps_per_second": 14.048,
681
+ "step": 8000
682
+ },
683
+ {
684
+ "epoch": 0.7924472924717507,
685
+ "grad_norm": 17.678852081298828,
686
+ "learning_rate": 4.0880924771706046e-05,
687
+ "loss": 2.8035,
688
+ "step": 8100
689
+ },
690
+ {
691
+ "epoch": 0.8022305923788093,
692
+ "grad_norm": 17.644638061523438,
693
+ "learning_rate": 4.069973909262212e-05,
694
+ "loss": 2.7958,
695
+ "step": 8200
696
+ },
697
+ {
698
+ "epoch": 0.8120138922858681,
699
+ "grad_norm": 18.377134323120117,
700
+ "learning_rate": 4.0518553413538195e-05,
701
+ "loss": 2.8055,
702
+ "step": 8300
703
+ },
704
+ {
705
+ "epoch": 0.8217971921929267,
706
+ "grad_norm": 18.026033401489258,
707
+ "learning_rate": 4.0337367734454273e-05,
708
+ "loss": 2.7334,
709
+ "step": 8400
710
+ },
711
+ {
712
+ "epoch": 0.8315804920999853,
713
+ "grad_norm": 14.77315616607666,
714
+ "learning_rate": 4.0156182055370345e-05,
715
+ "loss": 2.8082,
716
+ "step": 8500
717
+ },
718
+ {
719
+ "epoch": 0.8315804920999853,
720
+ "eval_runtime": 182.4176,
721
+ "eval_samples_per_second": 112.062,
722
+ "eval_steps_per_second": 14.012,
723
+ "step": 8500
724
+ },
725
+ {
726
+ "epoch": 0.841363792007044,
727
+ "grad_norm": 13.729479789733887,
728
+ "learning_rate": 3.997499637628642e-05,
729
+ "loss": 2.7939,
730
+ "step": 8600
731
+ },
732
+ {
733
+ "epoch": 0.8511470919141026,
734
+ "grad_norm": 16.34333610534668,
735
+ "learning_rate": 3.9793810697202494e-05,
736
+ "loss": 2.8517,
737
+ "step": 8700
738
+ },
739
+ {
740
+ "epoch": 0.8609303918211613,
741
+ "grad_norm": 22.484411239624023,
742
+ "learning_rate": 3.961262501811857e-05,
743
+ "loss": 2.776,
744
+ "step": 8800
745
+ },
746
+ {
747
+ "epoch": 0.8707136917282199,
748
+ "grad_norm": 15.922870635986328,
749
+ "learning_rate": 3.9431439339034644e-05,
750
+ "loss": 2.7909,
751
+ "step": 8900
752
+ },
753
+ {
754
+ "epoch": 0.8804969916352786,
755
+ "grad_norm": 15.06955623626709,
756
+ "learning_rate": 3.925025365995072e-05,
757
+ "loss": 2.8416,
758
+ "step": 9000
759
+ },
760
+ {
761
+ "epoch": 0.8804969916352786,
762
+ "eval_runtime": 181.9314,
763
+ "eval_samples_per_second": 112.361,
764
+ "eval_steps_per_second": 14.049,
765
+ "step": 9000
766
+ },
767
+ {
768
+ "epoch": 0.8902802915423372,
769
+ "grad_norm": 16.060428619384766,
770
+ "learning_rate": 3.9069067980866794e-05,
771
+ "loss": 2.7803,
772
+ "step": 9100
773
+ },
774
+ {
775
+ "epoch": 0.9000635914493959,
776
+ "grad_norm": 16.80124855041504,
777
+ "learning_rate": 3.888788230178287e-05,
778
+ "loss": 2.7548,
779
+ "step": 9200
780
+ },
781
+ {
782
+ "epoch": 0.9098468913564546,
783
+ "grad_norm": 16.608434677124023,
784
+ "learning_rate": 3.870669662269894e-05,
785
+ "loss": 2.8606,
786
+ "step": 9300
787
+ },
788
+ {
789
+ "epoch": 0.9196301912635132,
790
+ "grad_norm": 14.83870792388916,
791
+ "learning_rate": 3.8525510943615015e-05,
792
+ "loss": 2.7833,
793
+ "step": 9400
794
+ },
795
+ {
796
+ "epoch": 0.9294134911705718,
797
+ "grad_norm": 25.778181076049805,
798
+ "learning_rate": 3.834432526453109e-05,
799
+ "loss": 2.7434,
800
+ "step": 9500
801
+ },
802
+ {
803
+ "epoch": 0.9294134911705718,
804
+ "eval_runtime": 181.99,
805
+ "eval_samples_per_second": 112.325,
806
+ "eval_steps_per_second": 14.045,
807
+ "step": 9500
808
+ },
809
+ {
810
+ "epoch": 0.9391967910776304,
811
+ "grad_norm": 17.374011993408203,
812
+ "learning_rate": 3.8163139585447164e-05,
813
+ "loss": 2.7258,
814
+ "step": 9600
815
+ },
816
+ {
817
+ "epoch": 0.9489800909846892,
818
+ "grad_norm": 17.551128387451172,
819
+ "learning_rate": 3.798195390636324e-05,
820
+ "loss": 2.824,
821
+ "step": 9700
822
+ },
823
+ {
824
+ "epoch": 0.9587633908917478,
825
+ "grad_norm": 14.35797119140625,
826
+ "learning_rate": 3.7800768227279314e-05,
827
+ "loss": 2.745,
828
+ "step": 9800
829
+ },
830
+ {
831
+ "epoch": 0.9685466907988064,
832
+ "grad_norm": 20.098552703857422,
833
+ "learning_rate": 3.761958254819539e-05,
834
+ "loss": 2.7025,
835
+ "step": 9900
836
+ },
837
+ {
838
+ "epoch": 0.9783299907058651,
839
+ "grad_norm": 16.218109130859375,
840
+ "learning_rate": 3.743839686911147e-05,
841
+ "loss": 2.8093,
842
+ "step": 10000
843
+ },
844
+ {
845
+ "epoch": 0.9783299907058651,
846
+ "eval_runtime": 181.8987,
847
+ "eval_samples_per_second": 112.381,
848
+ "eval_steps_per_second": 14.052,
849
+ "step": 10000
850
+ },
851
+ {
852
+ "epoch": 0.9881132906129237,
853
+ "grad_norm": 17.198423385620117,
854
+ "learning_rate": 3.725721119002754e-05,
855
+ "loss": 2.7124,
856
+ "step": 10100
857
+ },
858
+ {
859
+ "epoch": 0.9978965905199824,
860
+ "grad_norm": 18.021198272705078,
861
+ "learning_rate": 3.707602551094362e-05,
862
+ "loss": 2.6922,
863
+ "step": 10200
864
+ },
865
+ {
866
+ "epoch": 1.007679890427041,
867
+ "grad_norm": 15.27678108215332,
868
+ "learning_rate": 3.689483983185969e-05,
869
+ "loss": 2.6743,
870
+ "step": 10300
871
+ },
872
+ {
873
+ "epoch": 1.0174631903340996,
874
+ "grad_norm": 16.770511627197266,
875
+ "learning_rate": 3.671365415277577e-05,
876
+ "loss": 2.857,
877
+ "step": 10400
878
+ },
879
+ {
880
+ "epoch": 1.0272464902411584,
881
+ "grad_norm": 18.810932159423828,
882
+ "learning_rate": 3.653246847369184e-05,
883
+ "loss": 2.7269,
884
+ "step": 10500
885
+ },
886
+ {
887
+ "epoch": 1.0272464902411584,
888
+ "eval_runtime": 181.8537,
889
+ "eval_samples_per_second": 112.409,
890
+ "eval_steps_per_second": 14.055,
891
+ "step": 10500
892
+ },
893
+ {
894
+ "epoch": 1.037029790148217,
895
+ "grad_norm": 18.56201171875,
896
+ "learning_rate": 3.635128279460791e-05,
897
+ "loss": 2.7325,
898
+ "step": 10600
899
+ },
900
+ {
901
+ "epoch": 1.0468130900552757,
902
+ "grad_norm": 15.063011169433594,
903
+ "learning_rate": 3.617009711552399e-05,
904
+ "loss": 2.7827,
905
+ "step": 10700
906
+ },
907
+ {
908
+ "epoch": 1.0565963899623343,
909
+ "grad_norm": 15.339439392089844,
910
+ "learning_rate": 3.598891143644006e-05,
911
+ "loss": 2.7472,
912
+ "step": 10800
913
+ },
914
+ {
915
+ "epoch": 1.066379689869393,
916
+ "grad_norm": 17.466033935546875,
917
+ "learning_rate": 3.580772575735614e-05,
918
+ "loss": 2.7859,
919
+ "step": 10900
920
+ },
921
+ {
922
+ "epoch": 1.0761629897764515,
923
+ "grad_norm": 20.727872848510742,
924
+ "learning_rate": 3.562654007827221e-05,
925
+ "loss": 2.7278,
926
+ "step": 11000
927
+ },
928
+ {
929
+ "epoch": 1.0761629897764515,
930
+ "eval_runtime": 181.8566,
931
+ "eval_samples_per_second": 112.407,
932
+ "eval_steps_per_second": 14.055,
933
+ "step": 11000
934
+ },
935
+ {
936
+ "epoch": 1.0859462896835101,
937
+ "grad_norm": 16.02055549621582,
938
+ "learning_rate": 3.544535439918829e-05,
939
+ "loss": 2.6307,
940
+ "step": 11100
941
+ },
942
+ {
943
+ "epoch": 1.095729589590569,
944
+ "grad_norm": 20.069686889648438,
945
+ "learning_rate": 3.526416872010436e-05,
946
+ "loss": 2.711,
947
+ "step": 11200
948
+ },
949
+ {
950
+ "epoch": 1.1055128894976276,
951
+ "grad_norm": 14.833261489868164,
952
+ "learning_rate": 3.508298304102044e-05,
953
+ "loss": 2.6141,
954
+ "step": 11300
955
+ },
956
+ {
957
+ "epoch": 1.1152961894046862,
958
+ "grad_norm": 14.86436653137207,
959
+ "learning_rate": 3.490179736193652e-05,
960
+ "loss": 2.6816,
961
+ "step": 11400
962
+ },
963
+ {
964
+ "epoch": 1.1250794893117448,
965
+ "grad_norm": 17.955862045288086,
966
+ "learning_rate": 3.472061168285259e-05,
967
+ "loss": 2.6924,
968
+ "step": 11500
969
+ },
970
+ {
971
+ "epoch": 1.1250794893117448,
972
+ "eval_runtime": 181.8085,
973
+ "eval_samples_per_second": 112.437,
974
+ "eval_steps_per_second": 14.059,
975
+ "step": 11500
976
+ },
977
+ {
978
+ "epoch": 1.1348627892188035,
979
+ "grad_norm": 18.360109329223633,
980
+ "learning_rate": 3.453942600376867e-05,
981
+ "loss": 2.6181,
982
+ "step": 11600
983
+ },
984
+ {
985
+ "epoch": 1.144646089125862,
986
+ "grad_norm": 17.547542572021484,
987
+ "learning_rate": 3.435824032468474e-05,
988
+ "loss": 2.6394,
989
+ "step": 11700
990
+ },
991
+ {
992
+ "epoch": 1.154429389032921,
993
+ "grad_norm": 12.194833755493164,
994
+ "learning_rate": 3.417705464560082e-05,
995
+ "loss": 2.6684,
996
+ "step": 11800
997
+ },
998
+ {
999
+ "epoch": 1.1642126889399795,
1000
+ "grad_norm": 17.095104217529297,
1001
+ "learning_rate": 3.399586896651689e-05,
1002
+ "loss": 2.6129,
1003
+ "step": 11900
1004
+ },
1005
+ {
1006
+ "epoch": 1.1739959888470382,
1007
+ "grad_norm": 20.788406372070312,
1008
+ "learning_rate": 3.381468328743296e-05,
1009
+ "loss": 2.5663,
1010
+ "step": 12000
1011
+ },
1012
+ {
1013
+ "epoch": 1.1739959888470382,
1014
+ "eval_runtime": 181.8035,
1015
+ "eval_samples_per_second": 112.44,
1016
+ "eval_steps_per_second": 14.059,
1017
+ "step": 12000
1018
+ },
1019
+ {
1020
+ "epoch": 1.1837792887540968,
1021
+ "grad_norm": 14.261167526245117,
1022
+ "learning_rate": 3.363349760834904e-05,
1023
+ "loss": 2.6544,
1024
+ "step": 12100
1025
+ },
1026
+ {
1027
+ "epoch": 1.1935625886611554,
1028
+ "grad_norm": 24.68012046813965,
1029
+ "learning_rate": 3.345231192926511e-05,
1030
+ "loss": 2.6632,
1031
+ "step": 12200
1032
+ },
1033
+ {
1034
+ "epoch": 1.203345888568214,
1035
+ "grad_norm": 16.10886573791504,
1036
+ "learning_rate": 3.327112625018119e-05,
1037
+ "loss": 2.6366,
1038
+ "step": 12300
1039
+ },
1040
+ {
1041
+ "epoch": 1.2131291884752726,
1042
+ "grad_norm": 18.038848876953125,
1043
+ "learning_rate": 3.308994057109726e-05,
1044
+ "loss": 2.6563,
1045
+ "step": 12400
1046
+ },
1047
+ {
1048
+ "epoch": 1.2229124883823315,
1049
+ "grad_norm": 17.40920639038086,
1050
+ "learning_rate": 3.290875489201334e-05,
1051
+ "loss": 2.718,
1052
+ "step": 12500
1053
+ },
1054
+ {
1055
+ "epoch": 1.2229124883823315,
1056
+ "eval_runtime": 181.9491,
1057
+ "eval_samples_per_second": 112.35,
1058
+ "eval_steps_per_second": 14.048,
1059
+ "step": 12500
1060
+ },
1061
+ {
1062
+ "epoch": 1.23269578828939,
1063
+ "grad_norm": 15.097307205200195,
1064
+ "learning_rate": 3.272756921292941e-05,
1065
+ "loss": 2.7282,
1066
+ "step": 12600
1067
+ },
1068
+ {
1069
+ "epoch": 1.2424790881964487,
1070
+ "grad_norm": 17.63008689880371,
1071
+ "learning_rate": 3.254638353384549e-05,
1072
+ "loss": 2.7104,
1073
+ "step": 12700
1074
+ },
1075
+ {
1076
+ "epoch": 1.2522623881035073,
1077
+ "grad_norm": 16.161130905151367,
1078
+ "learning_rate": 3.236519785476156e-05,
1079
+ "loss": 2.6427,
1080
+ "step": 12800
1081
+ },
1082
+ {
1083
+ "epoch": 1.262045688010566,
1084
+ "grad_norm": 18.786882400512695,
1085
+ "learning_rate": 3.218401217567764e-05,
1086
+ "loss": 2.6105,
1087
+ "step": 12900
1088
+ },
1089
+ {
1090
+ "epoch": 1.2718289879176246,
1091
+ "grad_norm": 24.145421981811523,
1092
+ "learning_rate": 3.2002826496593715e-05,
1093
+ "loss": 2.6322,
1094
+ "step": 13000
1095
+ },
1096
+ {
1097
+ "epoch": 1.2718289879176246,
1098
+ "eval_runtime": 182.5613,
1099
+ "eval_samples_per_second": 111.973,
1100
+ "eval_steps_per_second": 14.001,
1101
+ "step": 13000
1102
+ },
1103
+ {
1104
+ "epoch": 1.2816122878246832,
1105
+ "grad_norm": 15.286133766174316,
1106
+ "learning_rate": 3.1821640817509786e-05,
1107
+ "loss": 2.6465,
1108
+ "step": 13100
1109
+ },
1110
+ {
1111
+ "epoch": 1.291395587731742,
1112
+ "grad_norm": 21.22935676574707,
1113
+ "learning_rate": 3.1640455138425865e-05,
1114
+ "loss": 2.6691,
1115
+ "step": 13200
1116
+ },
1117
+ {
1118
+ "epoch": 1.3011788876388006,
1119
+ "grad_norm": 18.064428329467773,
1120
+ "learning_rate": 3.1459269459341936e-05,
1121
+ "loss": 2.5904,
1122
+ "step": 13300
1123
+ },
1124
+ {
1125
+ "epoch": 1.3109621875458592,
1126
+ "grad_norm": 14.45976448059082,
1127
+ "learning_rate": 3.127808378025801e-05,
1128
+ "loss": 2.6602,
1129
+ "step": 13400
1130
+ },
1131
+ {
1132
+ "epoch": 1.3207454874529179,
1133
+ "grad_norm": 19.72386360168457,
1134
+ "learning_rate": 3.109689810117408e-05,
1135
+ "loss": 2.6337,
1136
+ "step": 13500
1137
+ },
1138
+ {
1139
+ "epoch": 1.3207454874529179,
1140
+ "eval_runtime": 182.4053,
1141
+ "eval_samples_per_second": 112.069,
1142
+ "eval_steps_per_second": 14.013,
1143
+ "step": 13500
1144
+ },
1145
+ {
1146
+ "epoch": 1.3305287873599765,
1147
+ "grad_norm": 17.639583587646484,
1148
+ "learning_rate": 3.091571242209016e-05,
1149
+ "loss": 2.6135,
1150
+ "step": 13600
1151
+ },
1152
+ {
1153
+ "epoch": 1.340312087267035,
1154
+ "grad_norm": 19.71700096130371,
1155
+ "learning_rate": 3.0734526743006235e-05,
1156
+ "loss": 2.6252,
1157
+ "step": 13700
1158
+ },
1159
+ {
1160
+ "epoch": 1.3500953871740937,
1161
+ "grad_norm": 16.715856552124023,
1162
+ "learning_rate": 3.055334106392231e-05,
1163
+ "loss": 2.6475,
1164
+ "step": 13800
1165
+ },
1166
+ {
1167
+ "epoch": 1.3598786870811526,
1168
+ "grad_norm": 12.645075798034668,
1169
+ "learning_rate": 3.0372155384838385e-05,
1170
+ "loss": 2.6199,
1171
+ "step": 13900
1172
+ },
1173
+ {
1174
+ "epoch": 1.3696619869882112,
1175
+ "grad_norm": 20.150625228881836,
1176
+ "learning_rate": 3.0190969705754456e-05,
1177
+ "loss": 2.5567,
1178
+ "step": 14000
1179
+ },
1180
+ {
1181
+ "epoch": 1.3696619869882112,
1182
+ "eval_runtime": 181.9086,
1183
+ "eval_samples_per_second": 112.375,
1184
+ "eval_steps_per_second": 14.051,
1185
+ "step": 14000
1186
+ },
1187
+ {
1188
+ "epoch": 1.3794452868952698,
1189
+ "grad_norm": 19.111286163330078,
1190
+ "learning_rate": 3.0009784026670535e-05,
1191
+ "loss": 2.59,
1192
+ "step": 14100
1193
+ },
1194
+ {
1195
+ "epoch": 1.3892285868023284,
1196
+ "grad_norm": 17.12226104736328,
1197
+ "learning_rate": 2.9828598347586606e-05,
1198
+ "loss": 2.5913,
1199
+ "step": 14200
1200
+ },
1201
+ {
1202
+ "epoch": 1.399011886709387,
1203
+ "grad_norm": 19.741445541381836,
1204
+ "learning_rate": 2.9647412668502684e-05,
1205
+ "loss": 2.5617,
1206
+ "step": 14300
1207
+ },
1208
+ {
1209
+ "epoch": 1.4087951866164456,
1210
+ "grad_norm": 17.605525970458984,
1211
+ "learning_rate": 2.946622698941876e-05,
1212
+ "loss": 2.6077,
1213
+ "step": 14400
1214
+ },
1215
+ {
1216
+ "epoch": 1.4185784865235043,
1217
+ "grad_norm": 17.433218002319336,
1218
+ "learning_rate": 2.928504131033483e-05,
1219
+ "loss": 2.5713,
1220
+ "step": 14500
1221
+ },
1222
+ {
1223
+ "epoch": 1.4185784865235043,
1224
+ "eval_runtime": 181.9305,
1225
+ "eval_samples_per_second": 112.362,
1226
+ "eval_steps_per_second": 14.049,
1227
+ "step": 14500
1228
+ },
1229
+ {
1230
+ "epoch": 1.428361786430563,
1231
+ "grad_norm": 15.442538261413574,
1232
+ "learning_rate": 2.910385563125091e-05,
1233
+ "loss": 2.6499,
1234
+ "step": 14600
1235
+ },
1236
+ {
1237
+ "epoch": 1.4381450863376217,
1238
+ "grad_norm": 15.078730583190918,
1239
+ "learning_rate": 2.892266995216698e-05,
1240
+ "loss": 2.6517,
1241
+ "step": 14700
1242
+ },
1243
+ {
1244
+ "epoch": 1.4479283862446803,
1245
+ "grad_norm": 23.07891273498535,
1246
+ "learning_rate": 2.874148427308306e-05,
1247
+ "loss": 2.594,
1248
+ "step": 14800
1249
+ },
1250
+ {
1251
+ "epoch": 1.457711686151739,
1252
+ "grad_norm": 16.707923889160156,
1253
+ "learning_rate": 2.856029859399913e-05,
1254
+ "loss": 2.6613,
1255
+ "step": 14900
1256
+ },
1257
+ {
1258
+ "epoch": 1.4674949860587976,
1259
+ "grad_norm": 16.731164932250977,
1260
+ "learning_rate": 2.8379112914915208e-05,
1261
+ "loss": 2.5927,
1262
+ "step": 15000
1263
+ },
1264
+ {
1265
+ "epoch": 1.4674949860587976,
1266
+ "eval_runtime": 181.9649,
1267
+ "eval_samples_per_second": 112.34,
1268
+ "eval_steps_per_second": 14.047,
1269
+ "step": 15000
1270
+ },
1271
+ {
1272
+ "epoch": 1.4772782859658564,
1273
+ "grad_norm": 16.020864486694336,
1274
+ "learning_rate": 2.819792723583128e-05,
1275
+ "loss": 2.6464,
1276
+ "step": 15100
1277
+ },
1278
+ {
1279
+ "epoch": 1.4870615858729148,
1280
+ "grad_norm": 16.674760818481445,
1281
+ "learning_rate": 2.8016741556747354e-05,
1282
+ "loss": 2.5853,
1283
+ "step": 15200
1284
+ },
1285
+ {
1286
+ "epoch": 1.4968448857799737,
1287
+ "grad_norm": 16.890748977661133,
1288
+ "learning_rate": 2.7835555877663432e-05,
1289
+ "loss": 2.5748,
1290
+ "step": 15300
1291
+ },
1292
+ {
1293
+ "epoch": 1.5066281856870323,
1294
+ "grad_norm": 20.217845916748047,
1295
+ "learning_rate": 2.7654370198579504e-05,
1296
+ "loss": 2.6204,
1297
+ "step": 15400
1298
+ },
1299
+ {
1300
+ "epoch": 1.516411485594091,
1301
+ "grad_norm": 20.459087371826172,
1302
+ "learning_rate": 2.7473184519495582e-05,
1303
+ "loss": 2.6103,
1304
+ "step": 15500
1305
+ },
1306
+ {
1307
+ "epoch": 1.516411485594091,
1308
+ "eval_runtime": 181.9454,
1309
+ "eval_samples_per_second": 112.352,
1310
+ "eval_steps_per_second": 14.048,
1311
+ "step": 15500
1312
+ },
1313
+ {
1314
+ "epoch": 1.5261947855011495,
1315
+ "grad_norm": 18.207612991333008,
1316
+ "learning_rate": 2.7291998840411654e-05,
1317
+ "loss": 2.5786,
1318
+ "step": 15600
1319
+ },
1320
+ {
1321
+ "epoch": 1.5359780854082081,
1322
+ "grad_norm": 18.084758758544922,
1323
+ "learning_rate": 2.7110813161327732e-05,
1324
+ "loss": 2.6535,
1325
+ "step": 15700
1326
+ },
1327
+ {
1328
+ "epoch": 1.545761385315267,
1329
+ "grad_norm": 15.03881549835205,
1330
+ "learning_rate": 2.6929627482243803e-05,
1331
+ "loss": 2.6061,
1332
+ "step": 15800
1333
+ },
1334
+ {
1335
+ "epoch": 1.5555446852223254,
1336
+ "grad_norm": 16.99995231628418,
1337
+ "learning_rate": 2.6748441803159878e-05,
1338
+ "loss": 2.6151,
1339
+ "step": 15900
1340
+ },
1341
+ {
1342
+ "epoch": 1.5653279851293842,
1343
+ "grad_norm": 15.581089973449707,
1344
+ "learning_rate": 2.6567256124075956e-05,
1345
+ "loss": 2.6163,
1346
+ "step": 16000
1347
+ },
1348
+ {
1349
+ "epoch": 1.5653279851293842,
1350
+ "eval_runtime": 181.8152,
1351
+ "eval_samples_per_second": 112.433,
1352
+ "eval_steps_per_second": 14.058,
1353
+ "step": 16000
1354
+ },
1355
+ {
1356
+ "epoch": 1.5751112850364428,
1357
+ "grad_norm": 21.4382266998291,
1358
+ "learning_rate": 2.6386070444992028e-05,
1359
+ "loss": 2.5975,
1360
+ "step": 16100
1361
+ },
1362
+ {
1363
+ "epoch": 1.5848945849435014,
1364
+ "grad_norm": 15.874536514282227,
1365
+ "learning_rate": 2.6204884765908106e-05,
1366
+ "loss": 2.5851,
1367
+ "step": 16200
1368
+ },
1369
+ {
1370
+ "epoch": 1.59467788485056,
1371
+ "grad_norm": 17.902137756347656,
1372
+ "learning_rate": 2.6023699086824177e-05,
1373
+ "loss": 2.6027,
1374
+ "step": 16300
1375
+ },
1376
+ {
1377
+ "epoch": 1.6044611847576187,
1378
+ "grad_norm": 17.04872703552246,
1379
+ "learning_rate": 2.5842513407740255e-05,
1380
+ "loss": 2.5854,
1381
+ "step": 16400
1382
+ },
1383
+ {
1384
+ "epoch": 1.6142444846646775,
1385
+ "grad_norm": 15.406013488769531,
1386
+ "learning_rate": 2.5661327728656327e-05,
1387
+ "loss": 2.5158,
1388
+ "step": 16500
1389
+ },
1390
+ {
1391
+ "epoch": 1.6142444846646775,
1392
+ "eval_runtime": 181.8647,
1393
+ "eval_samples_per_second": 112.402,
1394
+ "eval_steps_per_second": 14.054,
1395
+ "step": 16500
1396
+ },
1397
+ {
1398
+ "epoch": 1.624027784571736,
1399
+ "grad_norm": 19.62627601623535,
1400
+ "learning_rate": 2.5480142049572402e-05,
1401
+ "loss": 2.5378,
1402
+ "step": 16600
1403
+ },
1404
+ {
1405
+ "epoch": 1.6338110844787948,
1406
+ "grad_norm": 17.825178146362305,
1407
+ "learning_rate": 2.529895637048848e-05,
1408
+ "loss": 2.6162,
1409
+ "step": 16700
1410
+ },
1411
+ {
1412
+ "epoch": 1.6435943843858534,
1413
+ "grad_norm": 15.442023277282715,
1414
+ "learning_rate": 2.511777069140455e-05,
1415
+ "loss": 2.5802,
1416
+ "step": 16800
1417
+ },
1418
+ {
1419
+ "epoch": 1.653377684292912,
1420
+ "grad_norm": 18.695241928100586,
1421
+ "learning_rate": 2.4936585012320626e-05,
1422
+ "loss": 2.585,
1423
+ "step": 16900
1424
+ },
1425
+ {
1426
+ "epoch": 1.6631609841999706,
1427
+ "grad_norm": 18.992969512939453,
1428
+ "learning_rate": 2.4755399333236704e-05,
1429
+ "loss": 2.5448,
1430
+ "step": 17000
1431
+ },
1432
+ {
1433
+ "epoch": 1.6631609841999706,
1434
+ "eval_runtime": 181.91,
1435
+ "eval_samples_per_second": 112.374,
1436
+ "eval_steps_per_second": 14.051,
1437
+ "step": 17000
1438
+ },
1439
+ {
1440
+ "epoch": 1.6729442841070292,
1441
+ "grad_norm": 19.065349578857422,
1442
+ "learning_rate": 2.457421365415278e-05,
1443
+ "loss": 2.6565,
1444
+ "step": 17100
1445
+ },
1446
+ {
1447
+ "epoch": 1.682727584014088,
1448
+ "grad_norm": 20.110734939575195,
1449
+ "learning_rate": 2.439302797506885e-05,
1450
+ "loss": 2.5519,
1451
+ "step": 17200
1452
+ },
1453
+ {
1454
+ "epoch": 1.6925108839211465,
1455
+ "grad_norm": 15.886931419372559,
1456
+ "learning_rate": 2.4211842295984925e-05,
1457
+ "loss": 2.5589,
1458
+ "step": 17300
1459
+ },
1460
+ {
1461
+ "epoch": 1.7022941838282053,
1462
+ "grad_norm": 19.213207244873047,
1463
+ "learning_rate": 2.4030656616901e-05,
1464
+ "loss": 2.5714,
1465
+ "step": 17400
1466
+ },
1467
+ {
1468
+ "epoch": 1.712077483735264,
1469
+ "grad_norm": 17.117481231689453,
1470
+ "learning_rate": 2.3849470937817075e-05,
1471
+ "loss": 2.6682,
1472
+ "step": 17500
1473
+ },
1474
+ {
1475
+ "epoch": 1.712077483735264,
1476
+ "eval_runtime": 181.766,
1477
+ "eval_samples_per_second": 112.463,
1478
+ "eval_steps_per_second": 14.062,
1479
+ "step": 17500
1480
+ },
1481
+ {
1482
+ "epoch": 1.7218607836423225,
1483
+ "grad_norm": 17.19162940979004,
1484
+ "learning_rate": 2.366828525873315e-05,
1485
+ "loss": 2.5591,
1486
+ "step": 17600
1487
+ },
1488
+ {
1489
+ "epoch": 1.7316440835493812,
1490
+ "grad_norm": 15.454411506652832,
1491
+ "learning_rate": 2.3487099579649225e-05,
1492
+ "loss": 2.469,
1493
+ "step": 17700
1494
+ },
1495
+ {
1496
+ "epoch": 1.7414273834564398,
1497
+ "grad_norm": 15.227791786193848,
1498
+ "learning_rate": 2.3305913900565303e-05,
1499
+ "loss": 2.664,
1500
+ "step": 17800
1501
+ },
1502
+ {
1503
+ "epoch": 1.7512106833634986,
1504
+ "grad_norm": 18.5739688873291,
1505
+ "learning_rate": 2.3124728221481374e-05,
1506
+ "loss": 2.5991,
1507
+ "step": 17900
1508
+ },
1509
+ {
1510
+ "epoch": 1.760993983270557,
1511
+ "grad_norm": 12.589066505432129,
1512
+ "learning_rate": 2.294354254239745e-05,
1513
+ "loss": 2.6593,
1514
+ "step": 18000
1515
+ },
1516
+ {
1517
+ "epoch": 1.760993983270557,
1518
+ "eval_runtime": 181.9699,
1519
+ "eval_samples_per_second": 112.337,
1520
+ "eval_steps_per_second": 14.046,
1521
+ "step": 18000
1522
+ },
1523
+ {
1524
+ "epoch": 1.7707772831776158,
1525
+ "grad_norm": 20.695772171020508,
1526
+ "learning_rate": 2.2762356863313524e-05,
1527
+ "loss": 2.5555,
1528
+ "step": 18100
1529
+ },
1530
+ {
1531
+ "epoch": 1.7805605830846745,
1532
+ "grad_norm": 12.731703758239746,
1533
+ "learning_rate": 2.25811711842296e-05,
1534
+ "loss": 2.4617,
1535
+ "step": 18200
1536
+ },
1537
+ {
1538
+ "epoch": 1.790343882991733,
1539
+ "grad_norm": 18.506074905395508,
1540
+ "learning_rate": 2.2399985505145674e-05,
1541
+ "loss": 2.6061,
1542
+ "step": 18300
1543
+ },
1544
+ {
1545
+ "epoch": 1.800127182898792,
1546
+ "grad_norm": 14.8694486618042,
1547
+ "learning_rate": 2.221879982606175e-05,
1548
+ "loss": 2.5779,
1549
+ "step": 18400
1550
+ },
1551
+ {
1552
+ "epoch": 1.8099104828058503,
1553
+ "grad_norm": 22.47985076904297,
1554
+ "learning_rate": 2.2037614146977827e-05,
1555
+ "loss": 2.5012,
1556
+ "step": 18500
1557
+ },
1558
+ {
1559
+ "epoch": 1.8099104828058503,
1560
+ "eval_runtime": 182.3919,
1561
+ "eval_samples_per_second": 112.077,
1562
+ "eval_steps_per_second": 14.014,
1563
+ "step": 18500
1564
+ },
1565
+ {
1566
+ "epoch": 1.8196937827129092,
1567
+ "grad_norm": 25.74334144592285,
1568
+ "learning_rate": 2.1856428467893898e-05,
1569
+ "loss": 2.5265,
1570
+ "step": 18600
1571
+ },
1572
+ {
1573
+ "epoch": 1.8294770826199676,
1574
+ "grad_norm": 18.477630615234375,
1575
+ "learning_rate": 2.1675242788809973e-05,
1576
+ "loss": 2.5555,
1577
+ "step": 18700
1578
+ },
1579
+ {
1580
+ "epoch": 1.8392603825270264,
1581
+ "grad_norm": 14.832316398620605,
1582
+ "learning_rate": 2.1494057109726048e-05,
1583
+ "loss": 2.4609,
1584
+ "step": 18800
1585
+ },
1586
+ {
1587
+ "epoch": 1.849043682434085,
1588
+ "grad_norm": 17.025096893310547,
1589
+ "learning_rate": 2.1312871430642123e-05,
1590
+ "loss": 2.5119,
1591
+ "step": 18900
1592
+ },
1593
+ {
1594
+ "epoch": 1.8588269823411436,
1595
+ "grad_norm": 16.852436065673828,
1596
+ "learning_rate": 2.1131685751558197e-05,
1597
+ "loss": 2.5369,
1598
+ "step": 19000
1599
+ },
1600
+ {
1601
+ "epoch": 1.8588269823411436,
1602
+ "eval_runtime": 181.7443,
1603
+ "eval_samples_per_second": 112.477,
1604
+ "eval_steps_per_second": 14.064,
1605
+ "step": 19000
1606
+ },
1607
+ {
1608
+ "epoch": 1.8686102822482025,
1609
+ "grad_norm": 15.160259246826172,
1610
+ "learning_rate": 2.0950500072474272e-05,
1611
+ "loss": 2.6297,
1612
+ "step": 19100
1613
+ },
1614
+ {
1615
+ "epoch": 1.8783935821552609,
1616
+ "grad_norm": 15.909671783447266,
1617
+ "learning_rate": 2.0769314393390347e-05,
1618
+ "loss": 2.4696,
1619
+ "step": 19200
1620
+ },
1621
+ {
1622
+ "epoch": 1.8881768820623197,
1623
+ "grad_norm": 14.201844215393066,
1624
+ "learning_rate": 2.0588128714306422e-05,
1625
+ "loss": 2.5653,
1626
+ "step": 19300
1627
+ },
1628
+ {
1629
+ "epoch": 1.8979601819693783,
1630
+ "grad_norm": 16.351415634155273,
1631
+ "learning_rate": 2.0406943035222497e-05,
1632
+ "loss": 2.4962,
1633
+ "step": 19400
1634
+ },
1635
+ {
1636
+ "epoch": 1.907743481876437,
1637
+ "grad_norm": 16.943771362304688,
1638
+ "learning_rate": 2.022575735613857e-05,
1639
+ "loss": 2.5091,
1640
+ "step": 19500
1641
+ },
1642
+ {
1643
+ "epoch": 1.907743481876437,
1644
+ "eval_runtime": 181.6486,
1645
+ "eval_samples_per_second": 112.536,
1646
+ "eval_steps_per_second": 14.071,
1647
+ "step": 19500
1648
+ },
1649
+ {
1650
+ "epoch": 1.9175267817834956,
1651
+ "grad_norm": 15.006349563598633,
1652
+ "learning_rate": 2.0044571677054646e-05,
1653
+ "loss": 2.5214,
1654
+ "step": 19600
1655
+ },
1656
+ {
1657
+ "epoch": 1.9273100816905542,
1658
+ "grad_norm": 17.305580139160156,
1659
+ "learning_rate": 1.986338599797072e-05,
1660
+ "loss": 2.4989,
1661
+ "step": 19700
1662
+ },
1663
+ {
1664
+ "epoch": 1.937093381597613,
1665
+ "grad_norm": 17.28044891357422,
1666
+ "learning_rate": 1.9682200318886796e-05,
1667
+ "loss": 2.4008,
1668
+ "step": 19800
1669
+ },
1670
+ {
1671
+ "epoch": 1.9468766815046714,
1672
+ "grad_norm": 18.25079917907715,
1673
+ "learning_rate": 1.950101463980287e-05,
1674
+ "loss": 2.6015,
1675
+ "step": 19900
1676
+ },
1677
+ {
1678
+ "epoch": 1.9566599814117303,
1679
+ "grad_norm": 20.741668701171875,
1680
+ "learning_rate": 1.9319828960718946e-05,
1681
+ "loss": 2.4081,
1682
+ "step": 20000
1683
+ },
1684
+ {
1685
+ "epoch": 1.9566599814117303,
1686
+ "eval_runtime": 181.7745,
1687
+ "eval_samples_per_second": 112.458,
1688
+ "eval_steps_per_second": 14.061,
1689
+ "step": 20000
1690
+ },
1691
+ {
1692
+ "epoch": 1.9664432813187889,
1693
+ "grad_norm": 16.1226863861084,
1694
+ "learning_rate": 1.913864328163502e-05,
1695
+ "loss": 2.5418,
1696
+ "step": 20100
1697
+ },
1698
+ {
1699
+ "epoch": 1.9762265812258475,
1700
+ "grad_norm": 13.914982795715332,
1701
+ "learning_rate": 1.8957457602551095e-05,
1702
+ "loss": 2.5248,
1703
+ "step": 20200
1704
+ },
1705
+ {
1706
+ "epoch": 1.986009881132906,
1707
+ "grad_norm": 15.072690963745117,
1708
+ "learning_rate": 1.877627192346717e-05,
1709
+ "loss": 2.5488,
1710
+ "step": 20300
1711
+ },
1712
+ {
1713
+ "epoch": 1.9957931810399647,
1714
+ "grad_norm": 15.510763168334961,
1715
+ "learning_rate": 1.8595086244383245e-05,
1716
+ "loss": 2.4605,
1717
+ "step": 20400
1718
+ },
1719
+ {
1720
+ "epoch": 2.0055764809470236,
1721
+ "grad_norm": 18.463842391967773,
1722
+ "learning_rate": 1.841390056529932e-05,
1723
+ "loss": 2.522,
1724
+ "step": 20500
1725
+ },
1726
+ {
1727
+ "epoch": 2.0055764809470236,
1728
+ "eval_runtime": 182.07,
1729
+ "eval_samples_per_second": 112.276,
1730
+ "eval_steps_per_second": 14.039,
1731
+ "step": 20500
1732
+ },
1733
+ {
1734
+ "epoch": 2.015359780854082,
1735
+ "grad_norm": 16.670269012451172,
1736
+ "learning_rate": 1.8232714886215394e-05,
1737
+ "loss": 2.5585,
1738
+ "step": 20600
1739
+ },
1740
+ {
1741
+ "epoch": 2.025143080761141,
1742
+ "grad_norm": 20.60368537902832,
1743
+ "learning_rate": 1.805152920713147e-05,
1744
+ "loss": 2.5381,
1745
+ "step": 20700
1746
+ },
1747
+ {
1748
+ "epoch": 2.034926380668199,
1749
+ "grad_norm": 15.686981201171875,
1750
+ "learning_rate": 1.7870343528047544e-05,
1751
+ "loss": 2.5721,
1752
+ "step": 20800
1753
+ },
1754
+ {
1755
+ "epoch": 2.044709680575258,
1756
+ "grad_norm": 14.691718101501465,
1757
+ "learning_rate": 1.768915784896362e-05,
1758
+ "loss": 2.5187,
1759
+ "step": 20900
1760
+ },
1761
+ {
1762
+ "epoch": 2.054492980482317,
1763
+ "grad_norm": 16.31734848022461,
1764
+ "learning_rate": 1.7507972169879694e-05,
1765
+ "loss": 2.5202,
1766
+ "step": 21000
1767
+ },
1768
+ {
1769
+ "epoch": 2.054492980482317,
1770
+ "eval_runtime": 181.9896,
1771
+ "eval_samples_per_second": 112.325,
1772
+ "eval_steps_per_second": 14.045,
1773
+ "step": 21000
1774
+ },
1775
+ {
1776
+ "epoch": 2.0642762803893753,
1777
+ "grad_norm": 12.698554992675781,
1778
+ "learning_rate": 1.732678649079577e-05,
1779
+ "loss": 2.4228,
1780
+ "step": 21100
1781
+ },
1782
+ {
1783
+ "epoch": 2.074059580296434,
1784
+ "grad_norm": 16.34201431274414,
1785
+ "learning_rate": 1.7145600811711843e-05,
1786
+ "loss": 2.3963,
1787
+ "step": 21200
1788
+ },
1789
+ {
1790
+ "epoch": 2.0838428802034925,
1791
+ "grad_norm": 16.52840232849121,
1792
+ "learning_rate": 1.6964415132627918e-05,
1793
+ "loss": 2.4759,
1794
+ "step": 21300
1795
+ },
1796
+ {
1797
+ "epoch": 2.0936261801105513,
1798
+ "grad_norm": 14.856452941894531,
1799
+ "learning_rate": 1.6783229453543993e-05,
1800
+ "loss": 2.4675,
1801
+ "step": 21400
1802
+ },
1803
+ {
1804
+ "epoch": 2.1034094800176097,
1805
+ "grad_norm": 19.68895721435547,
1806
+ "learning_rate": 1.6602043774460068e-05,
1807
+ "loss": 2.5324,
1808
+ "step": 21500
1809
+ },
1810
+ {
1811
+ "epoch": 2.1034094800176097,
1812
+ "eval_runtime": 182.1877,
1813
+ "eval_samples_per_second": 112.203,
1814
+ "eval_steps_per_second": 14.029,
1815
+ "step": 21500
1816
+ },
1817
+ {
1818
+ "epoch": 2.1131927799246686,
1819
+ "grad_norm": 23.248056411743164,
1820
+ "learning_rate": 1.6420858095376143e-05,
1821
+ "loss": 2.5231,
1822
+ "step": 21600
1823
+ },
1824
+ {
1825
+ "epoch": 2.1229760798317274,
1826
+ "grad_norm": 25.471004486083984,
1827
+ "learning_rate": 1.6239672416292217e-05,
1828
+ "loss": 2.5871,
1829
+ "step": 21700
1830
+ },
1831
+ {
1832
+ "epoch": 2.132759379738786,
1833
+ "grad_norm": 17.794851303100586,
1834
+ "learning_rate": 1.6058486737208292e-05,
1835
+ "loss": 2.5008,
1836
+ "step": 21800
1837
+ },
1838
+ {
1839
+ "epoch": 2.1425426796458447,
1840
+ "grad_norm": 15.450346946716309,
1841
+ "learning_rate": 1.5877301058124367e-05,
1842
+ "loss": 2.4194,
1843
+ "step": 21900
1844
+ },
1845
+ {
1846
+ "epoch": 2.152325979552903,
1847
+ "grad_norm": 13.243645668029785,
1848
+ "learning_rate": 1.5696115379040442e-05,
1849
+ "loss": 2.5018,
1850
+ "step": 22000
1851
+ },
1852
+ {
1853
+ "epoch": 2.152325979552903,
1854
+ "eval_runtime": 181.9841,
1855
+ "eval_samples_per_second": 112.328,
1856
+ "eval_steps_per_second": 14.045,
1857
+ "step": 22000
1858
+ },
1859
+ {
1860
+ "epoch": 2.162109279459962,
1861
+ "grad_norm": 16.996198654174805,
1862
+ "learning_rate": 1.5514929699956517e-05,
1863
+ "loss": 2.4492,
1864
+ "step": 22100
1865
+ },
1866
+ {
1867
+ "epoch": 2.1718925793670203,
1868
+ "grad_norm": 20.05558967590332,
1869
+ "learning_rate": 1.5333744020872588e-05,
1870
+ "loss": 2.489,
1871
+ "step": 22200
1872
+ },
1873
+ {
1874
+ "epoch": 2.181675879274079,
1875
+ "grad_norm": 15.66326904296875,
1876
+ "learning_rate": 1.5152558341788666e-05,
1877
+ "loss": 2.5089,
1878
+ "step": 22300
1879
+ },
1880
+ {
1881
+ "epoch": 2.191459179181138,
1882
+ "grad_norm": 17.83564567565918,
1883
+ "learning_rate": 1.4971372662704741e-05,
1884
+ "loss": 2.4945,
1885
+ "step": 22400
1886
+ },
1887
+ {
1888
+ "epoch": 2.2012424790881964,
1889
+ "grad_norm": 21.466899871826172,
1890
+ "learning_rate": 1.4790186983620816e-05,
1891
+ "loss": 2.5467,
1892
+ "step": 22500
1893
+ },
1894
+ {
1895
+ "epoch": 2.2012424790881964,
1896
+ "eval_runtime": 182.8328,
1897
+ "eval_samples_per_second": 111.807,
1898
+ "eval_steps_per_second": 13.98,
1899
+ "step": 22500
1900
+ },
1901
+ {
1902
+ "epoch": 2.211025778995255,
1903
+ "grad_norm": 17.91064453125,
1904
+ "learning_rate": 1.4609001304536891e-05,
1905
+ "loss": 2.5144,
1906
+ "step": 22600
1907
+ },
1908
+ {
1909
+ "epoch": 2.2208090789023136,
1910
+ "grad_norm": 17.678396224975586,
1911
+ "learning_rate": 1.4427815625452964e-05,
1912
+ "loss": 2.5018,
1913
+ "step": 22700
1914
+ },
1915
+ {
1916
+ "epoch": 2.2305923788093724,
1917
+ "grad_norm": 17.510461807250977,
1918
+ "learning_rate": 1.4246629946369039e-05,
1919
+ "loss": 2.4228,
1920
+ "step": 22800
1921
+ },
1922
+ {
1923
+ "epoch": 2.240375678716431,
1924
+ "grad_norm": 24.923967361450195,
1925
+ "learning_rate": 1.4065444267285114e-05,
1926
+ "loss": 2.5249,
1927
+ "step": 22900
1928
+ },
1929
+ {
1930
+ "epoch": 2.2501589786234897,
1931
+ "grad_norm": 17.82384490966797,
1932
+ "learning_rate": 1.388425858820119e-05,
1933
+ "loss": 2.4282,
1934
+ "step": 23000
1935
+ },
1936
+ {
1937
+ "epoch": 2.2501589786234897,
1938
+ "eval_runtime": 182.0459,
1939
+ "eval_samples_per_second": 112.29,
1940
+ "eval_steps_per_second": 14.04,
1941
+ "step": 23000
1942
+ },
1943
+ {
1944
+ "epoch": 2.2599422785305485,
1945
+ "grad_norm": 16.13028335571289,
1946
+ "learning_rate": 1.3703072909117265e-05,
1947
+ "loss": 2.4472,
1948
+ "step": 23100
1949
+ },
1950
+ {
1951
+ "epoch": 2.269725578437607,
1952
+ "grad_norm": 15.137242317199707,
1953
+ "learning_rate": 1.352188723003334e-05,
1954
+ "loss": 2.5985,
1955
+ "step": 23200
1956
+ },
1957
+ {
1958
+ "epoch": 2.2795088783446658,
1959
+ "grad_norm": 16.187530517578125,
1960
+ "learning_rate": 1.3340701550949415e-05,
1961
+ "loss": 2.4862,
1962
+ "step": 23300
1963
+ },
1964
+ {
1965
+ "epoch": 2.289292178251724,
1966
+ "grad_norm": 18.84433937072754,
1967
+ "learning_rate": 1.3159515871865488e-05,
1968
+ "loss": 2.516,
1969
+ "step": 23400
1970
+ },
1971
+ {
1972
+ "epoch": 2.299075478158783,
1973
+ "grad_norm": 20.209121704101562,
1974
+ "learning_rate": 1.2978330192781563e-05,
1975
+ "loss": 2.5031,
1976
+ "step": 23500
1977
+ },
1978
+ {
1979
+ "epoch": 2.299075478158783,
1980
+ "eval_runtime": 181.9806,
1981
+ "eval_samples_per_second": 112.331,
1982
+ "eval_steps_per_second": 14.045,
1983
+ "step": 23500
1984
+ },
1985
+ {
1986
+ "epoch": 2.308858778065842,
1987
+ "grad_norm": 67.4502182006836,
1988
+ "learning_rate": 1.2797144513697637e-05,
1989
+ "loss": 2.4491,
1990
+ "step": 23600
1991
+ },
1992
+ {
1993
+ "epoch": 2.3186420779729002,
1994
+ "grad_norm": 14.940401077270508,
1995
+ "learning_rate": 1.2615958834613712e-05,
1996
+ "loss": 2.5669,
1997
+ "step": 23700
1998
+ },
1999
+ {
2000
+ "epoch": 2.328425377879959,
2001
+ "grad_norm": 16.591793060302734,
2002
+ "learning_rate": 1.2434773155529787e-05,
2003
+ "loss": 2.4565,
2004
+ "step": 23800
2005
+ },
2006
+ {
2007
+ "epoch": 2.3382086777870175,
2008
+ "grad_norm": 16.798791885375977,
2009
+ "learning_rate": 1.2253587476445862e-05,
2010
+ "loss": 2.4046,
2011
+ "step": 23900
2012
+ },
2013
+ {
2014
+ "epoch": 2.3479919776940763,
2015
+ "grad_norm": 17.712255477905273,
2016
+ "learning_rate": 1.2072401797361937e-05,
2017
+ "loss": 2.4453,
2018
+ "step": 24000
2019
+ },
2020
+ {
2021
+ "epoch": 2.3479919776940763,
2022
+ "eval_runtime": 182.0401,
2023
+ "eval_samples_per_second": 112.294,
2024
+ "eval_steps_per_second": 14.041,
2025
+ "step": 24000
2026
+ },
2027
+ {
2028
+ "epoch": 2.3577752776011347,
2029
+ "grad_norm": 18.64284324645996,
2030
+ "learning_rate": 1.1891216118278011e-05,
2031
+ "loss": 2.3973,
2032
+ "step": 24100
2033
+ },
2034
+ {
2035
+ "epoch": 2.3675585775081935,
2036
+ "grad_norm": 18.185895919799805,
2037
+ "learning_rate": 1.1710030439194086e-05,
2038
+ "loss": 2.5045,
2039
+ "step": 24200
2040
+ },
2041
+ {
2042
+ "epoch": 2.377341877415252,
2043
+ "grad_norm": 23.201522827148438,
2044
+ "learning_rate": 1.1528844760110163e-05,
2045
+ "loss": 2.5402,
2046
+ "step": 24300
2047
+ },
2048
+ {
2049
+ "epoch": 2.3871251773223108,
2050
+ "grad_norm": 21.606412887573242,
2051
+ "learning_rate": 1.1347659081026236e-05,
2052
+ "loss": 2.4285,
2053
+ "step": 24400
2054
+ },
2055
+ {
2056
+ "epoch": 2.3969084772293696,
2057
+ "grad_norm": 16.318761825561523,
2058
+ "learning_rate": 1.116647340194231e-05,
2059
+ "loss": 2.5509,
2060
+ "step": 24500
2061
+ },
2062
+ {
2063
+ "epoch": 2.3969084772293696,
2064
+ "eval_runtime": 182.0431,
2065
+ "eval_samples_per_second": 112.292,
2066
+ "eval_steps_per_second": 14.041,
2067
+ "step": 24500
2068
+ },
2069
+ {
2070
+ "epoch": 2.406691777136428,
2071
+ "grad_norm": 17.779014587402344,
2072
+ "learning_rate": 1.0985287722858386e-05,
2073
+ "loss": 2.4245,
2074
+ "step": 24600
2075
+ },
2076
+ {
2077
+ "epoch": 2.416475077043487,
2078
+ "grad_norm": 18.44321060180664,
2079
+ "learning_rate": 1.080410204377446e-05,
2080
+ "loss": 2.5223,
2081
+ "step": 24700
2082
+ },
2083
+ {
2084
+ "epoch": 2.4262583769505452,
2085
+ "grad_norm": 24.017047882080078,
2086
+ "learning_rate": 1.0622916364690535e-05,
2087
+ "loss": 2.4846,
2088
+ "step": 24800
2089
+ },
2090
+ {
2091
+ "epoch": 2.436041676857604,
2092
+ "grad_norm": 14.89560604095459,
2093
+ "learning_rate": 1.044173068560661e-05,
2094
+ "loss": 2.5922,
2095
+ "step": 24900
2096
+ },
2097
+ {
2098
+ "epoch": 2.445824976764663,
2099
+ "grad_norm": 15.532561302185059,
2100
+ "learning_rate": 1.0260545006522685e-05,
2101
+ "loss": 2.3976,
2102
+ "step": 25000
2103
+ },
2104
+ {
2105
+ "epoch": 2.445824976764663,
2106
+ "eval_runtime": 182.1033,
2107
+ "eval_samples_per_second": 112.255,
2108
+ "eval_steps_per_second": 14.036,
2109
+ "step": 25000
2110
+ },
2111
+ {
2112
+ "epoch": 2.4556082766717213,
2113
+ "grad_norm": 18.041282653808594,
2114
+ "learning_rate": 1.007935932743876e-05,
2115
+ "loss": 2.4731,
2116
+ "step": 25100
2117
+ },
2118
+ {
2119
+ "epoch": 2.46539157657878,
2120
+ "grad_norm": 13.40858268737793,
2121
+ "learning_rate": 9.898173648354834e-06,
2122
+ "loss": 2.4838,
2123
+ "step": 25200
2124
+ },
2125
+ {
2126
+ "epoch": 2.4751748764858386,
2127
+ "grad_norm": 17.450841903686523,
2128
+ "learning_rate": 9.71698796927091e-06,
2129
+ "loss": 2.3999,
2130
+ "step": 25300
2131
+ },
2132
+ {
2133
+ "epoch": 2.4849581763928974,
2134
+ "grad_norm": 17.556467056274414,
2135
+ "learning_rate": 9.535802290186984e-06,
2136
+ "loss": 2.3867,
2137
+ "step": 25400
2138
+ },
2139
+ {
2140
+ "epoch": 2.494741476299956,
2141
+ "grad_norm": 18.578310012817383,
2142
+ "learning_rate": 9.354616611103059e-06,
2143
+ "loss": 2.4546,
2144
+ "step": 25500
2145
+ },
2146
+ {
2147
+ "epoch": 2.494741476299956,
2148
+ "eval_runtime": 182.0338,
2149
+ "eval_samples_per_second": 112.298,
2150
+ "eval_steps_per_second": 14.041,
2151
+ "step": 25500
2152
+ },
2153
+ {
2154
+ "epoch": 2.5045247762070146,
2155
+ "grad_norm": 14.936469078063965,
2156
+ "learning_rate": 9.173430932019134e-06,
2157
+ "loss": 2.5562,
2158
+ "step": 25600
2159
+ },
2160
+ {
2161
+ "epoch": 2.514308076114073,
2162
+ "grad_norm": 17.527040481567383,
2163
+ "learning_rate": 8.992245252935209e-06,
2164
+ "loss": 2.4008,
2165
+ "step": 25700
2166
+ },
2167
+ {
2168
+ "epoch": 2.524091376021132,
2169
+ "grad_norm": 12.91336727142334,
2170
+ "learning_rate": 8.811059573851283e-06,
2171
+ "loss": 2.4655,
2172
+ "step": 25800
2173
+ },
2174
+ {
2175
+ "epoch": 2.5338746759281907,
2176
+ "grad_norm": 15.168461799621582,
2177
+ "learning_rate": 8.629873894767358e-06,
2178
+ "loss": 2.4468,
2179
+ "step": 25900
2180
+ },
2181
+ {
2182
+ "epoch": 2.543657975835249,
2183
+ "grad_norm": 17.5390682220459,
2184
+ "learning_rate": 8.448688215683433e-06,
2185
+ "loss": 2.4836,
2186
+ "step": 26000
2187
+ },
2188
+ {
2189
+ "epoch": 2.543657975835249,
2190
+ "eval_runtime": 182.1148,
2191
+ "eval_samples_per_second": 112.248,
2192
+ "eval_steps_per_second": 14.035,
2193
+ "step": 26000
2194
+ },
2195
+ {
2196
+ "epoch": 2.553441275742308,
2197
+ "grad_norm": 15.126510620117188,
2198
+ "learning_rate": 8.267502536599508e-06,
2199
+ "loss": 2.387,
2200
+ "step": 26100
2201
+ },
2202
+ {
2203
+ "epoch": 2.5632245756493663,
2204
+ "grad_norm": 15.374293327331543,
2205
+ "learning_rate": 8.086316857515583e-06,
2206
+ "loss": 2.3652,
2207
+ "step": 26200
2208
+ },
2209
+ {
2210
+ "epoch": 2.573007875556425,
2211
+ "grad_norm": 15.498108863830566,
2212
+ "learning_rate": 7.905131178431657e-06,
2213
+ "loss": 2.4749,
2214
+ "step": 26300
2215
+ },
2216
+ {
2217
+ "epoch": 2.582791175463484,
2218
+ "grad_norm": 16.221315383911133,
2219
+ "learning_rate": 7.723945499347732e-06,
2220
+ "loss": 2.4567,
2221
+ "step": 26400
2222
+ },
2223
+ {
2224
+ "epoch": 2.5925744753705424,
2225
+ "grad_norm": 18.839122772216797,
2226
+ "learning_rate": 7.542759820263806e-06,
2227
+ "loss": 2.3554,
2228
+ "step": 26500
2229
+ },
2230
+ {
2231
+ "epoch": 2.5925744753705424,
2232
+ "eval_runtime": 181.9597,
2233
+ "eval_samples_per_second": 112.344,
2234
+ "eval_steps_per_second": 14.047,
2235
+ "step": 26500
2236
+ },
2237
+ {
2238
+ "epoch": 2.6023577752776013,
2239
+ "grad_norm": 22.626708984375,
2240
+ "learning_rate": 7.361574141179882e-06,
2241
+ "loss": 2.502,
2242
+ "step": 26600
2243
+ },
2244
+ {
2245
+ "epoch": 2.6121410751846597,
2246
+ "grad_norm": 16.519880294799805,
2247
+ "learning_rate": 7.180388462095957e-06,
2248
+ "loss": 2.5034,
2249
+ "step": 26700
2250
+ },
2251
+ {
2252
+ "epoch": 2.6219243750917185,
2253
+ "grad_norm": 27.421489715576172,
2254
+ "learning_rate": 6.999202783012031e-06,
2255
+ "loss": 2.5276,
2256
+ "step": 26800
2257
+ },
2258
+ {
2259
+ "epoch": 2.6317076749987773,
2260
+ "grad_norm": 15.274630546569824,
2261
+ "learning_rate": 6.8180171039281055e-06,
2262
+ "loss": 2.4121,
2263
+ "step": 26900
2264
+ },
2265
+ {
2266
+ "epoch": 2.6414909749058357,
2267
+ "grad_norm": 15.751582145690918,
2268
+ "learning_rate": 6.636831424844181e-06,
2269
+ "loss": 2.5799,
2270
+ "step": 27000
2271
+ },
2272
+ {
2273
+ "epoch": 2.6414909749058357,
2274
+ "eval_runtime": 182.0873,
2275
+ "eval_samples_per_second": 112.265,
2276
+ "eval_steps_per_second": 14.037,
2277
+ "step": 27000
2278
+ },
2279
+ {
2280
+ "epoch": 2.651274274812894,
2281
+ "grad_norm": 16.674850463867188,
2282
+ "learning_rate": 6.455645745760255e-06,
2283
+ "loss": 2.3872,
2284
+ "step": 27100
2285
+ },
2286
+ {
2287
+ "epoch": 2.661057574719953,
2288
+ "grad_norm": 12.62803840637207,
2289
+ "learning_rate": 6.27446006667633e-06,
2290
+ "loss": 2.4,
2291
+ "step": 27200
2292
+ },
2293
+ {
2294
+ "epoch": 2.670840874627012,
2295
+ "grad_norm": 18.055158615112305,
2296
+ "learning_rate": 6.093274387592405e-06,
2297
+ "loss": 2.4681,
2298
+ "step": 27300
2299
+ },
2300
+ {
2301
+ "epoch": 2.68062417453407,
2302
+ "grad_norm": 17.21278190612793,
2303
+ "learning_rate": 5.91208870850848e-06,
2304
+ "loss": 2.5441,
2305
+ "step": 27400
2306
+ },
2307
+ {
2308
+ "epoch": 2.690407474441129,
2309
+ "grad_norm": 20.945236206054688,
2310
+ "learning_rate": 5.7309030294245544e-06,
2311
+ "loss": 2.4388,
2312
+ "step": 27500
2313
+ },
2314
+ {
2315
+ "epoch": 2.690407474441129,
2316
+ "eval_runtime": 182.1279,
2317
+ "eval_samples_per_second": 112.24,
2318
+ "eval_steps_per_second": 14.034,
2319
+ "step": 27500
2320
+ },
2321
+ {
2322
+ "epoch": 2.7001907743481874,
2323
+ "grad_norm": 23.483661651611328,
2324
+ "learning_rate": 5.549717350340629e-06,
2325
+ "loss": 2.4589,
2326
+ "step": 27600
2327
+ },
2328
+ {
2329
+ "epoch": 2.7099740742552463,
2330
+ "grad_norm": 17.954036712646484,
2331
+ "learning_rate": 5.368531671256704e-06,
2332
+ "loss": 2.4477,
2333
+ "step": 27700
2334
+ },
2335
+ {
2336
+ "epoch": 2.719757374162305,
2337
+ "grad_norm": 16.187314987182617,
2338
+ "learning_rate": 5.187345992172779e-06,
2339
+ "loss": 2.4967,
2340
+ "step": 27800
2341
+ },
2342
+ {
2343
+ "epoch": 2.7295406740693635,
2344
+ "grad_norm": 14.324910163879395,
2345
+ "learning_rate": 5.006160313088854e-06,
2346
+ "loss": 2.3921,
2347
+ "step": 27900
2348
+ },
2349
+ {
2350
+ "epoch": 2.7393239739764224,
2351
+ "grad_norm": 20.81557846069336,
2352
+ "learning_rate": 4.8249746340049285e-06,
2353
+ "loss": 2.5201,
2354
+ "step": 28000
2355
+ },
2356
+ {
2357
+ "epoch": 2.7393239739764224,
2358
+ "eval_runtime": 182.146,
2359
+ "eval_samples_per_second": 112.229,
2360
+ "eval_steps_per_second": 14.033,
2361
+ "step": 28000
2362
+ },
2363
+ {
2364
+ "epoch": 2.7491072738834808,
2365
+ "grad_norm": 18.682844161987305,
2366
+ "learning_rate": 4.643788954921003e-06,
2367
+ "loss": 2.4325,
2368
+ "step": 28100
2369
+ },
2370
+ {
2371
+ "epoch": 2.7588905737905396,
2372
+ "grad_norm": 16.227272033691406,
2373
+ "learning_rate": 4.462603275837078e-06,
2374
+ "loss": 2.3864,
2375
+ "step": 28200
2376
+ },
2377
+ {
2378
+ "epoch": 2.7686738736975984,
2379
+ "grad_norm": 16.20302963256836,
2380
+ "learning_rate": 4.281417596753152e-06,
2381
+ "loss": 2.5296,
2382
+ "step": 28300
2383
+ },
2384
+ {
2385
+ "epoch": 2.778457173604657,
2386
+ "grad_norm": 18.634096145629883,
2387
+ "learning_rate": 4.100231917669228e-06,
2388
+ "loss": 2.4514,
2389
+ "step": 28400
2390
+ },
2391
+ {
2392
+ "epoch": 2.7882404735117157,
2393
+ "grad_norm": 13.040008544921875,
2394
+ "learning_rate": 3.919046238585303e-06,
2395
+ "loss": 2.3661,
2396
+ "step": 28500
2397
+ },
2398
+ {
2399
+ "epoch": 2.7882404735117157,
2400
+ "eval_runtime": 181.9164,
2401
+ "eval_samples_per_second": 112.37,
2402
+ "eval_steps_per_second": 14.05,
2403
+ "step": 28500
2404
+ },
2405
+ {
2406
+ "epoch": 2.798023773418774,
2407
+ "grad_norm": 14.142943382263184,
2408
+ "learning_rate": 3.737860559501377e-06,
2409
+ "loss": 2.5074,
2410
+ "step": 28600
2411
+ },
2412
+ {
2413
+ "epoch": 2.807807073325833,
2414
+ "grad_norm": 17.934324264526367,
2415
+ "learning_rate": 3.5566748804174523e-06,
2416
+ "loss": 2.4224,
2417
+ "step": 28700
2418
+ },
2419
+ {
2420
+ "epoch": 2.8175903732328913,
2421
+ "grad_norm": 14.450194358825684,
2422
+ "learning_rate": 3.3754892013335267e-06,
2423
+ "loss": 2.4949,
2424
+ "step": 28800
2425
+ },
2426
+ {
2427
+ "epoch": 2.82737367313995,
2428
+ "grad_norm": 17.746837615966797,
2429
+ "learning_rate": 3.194303522249602e-06,
2430
+ "loss": 2.4153,
2431
+ "step": 28900
2432
+ },
2433
+ {
2434
+ "epoch": 2.8371569730470085,
2435
+ "grad_norm": 13.962541580200195,
2436
+ "learning_rate": 3.0131178431656763e-06,
2437
+ "loss": 2.4804,
2438
+ "step": 29000
2439
+ },
2440
+ {
2441
+ "epoch": 2.8371569730470085,
2442
+ "eval_runtime": 182.0262,
2443
+ "eval_samples_per_second": 112.303,
2444
+ "eval_steps_per_second": 14.042,
2445
+ "step": 29000
2446
+ },
2447
+ {
2448
+ "epoch": 2.8469402729540674,
2449
+ "grad_norm": 16.669286727905273,
2450
+ "learning_rate": 2.831932164081751e-06,
2451
+ "loss": 2.5397,
2452
+ "step": 29100
2453
+ },
2454
+ {
2455
+ "epoch": 2.856723572861126,
2456
+ "grad_norm": 15.421733856201172,
2457
+ "learning_rate": 2.650746484997826e-06,
2458
+ "loss": 2.4175,
2459
+ "step": 29200
2460
+ },
2461
+ {
2462
+ "epoch": 2.8665068727681846,
2463
+ "grad_norm": 14.135702133178711,
2464
+ "learning_rate": 2.4695608059139007e-06,
2465
+ "loss": 2.5069,
2466
+ "step": 29300
2467
+ },
2468
+ {
2469
+ "epoch": 2.8762901726752435,
2470
+ "grad_norm": 17.41412925720215,
2471
+ "learning_rate": 2.2883751268299756e-06,
2472
+ "loss": 2.3997,
2473
+ "step": 29400
2474
+ },
2475
+ {
2476
+ "epoch": 2.886073472582302,
2477
+ "grad_norm": 14.824533462524414,
2478
+ "learning_rate": 2.1071894477460504e-06,
2479
+ "loss": 2.3945,
2480
+ "step": 29500
2481
+ },
2482
+ {
2483
+ "epoch": 2.886073472582302,
2484
+ "eval_runtime": 181.9299,
2485
+ "eval_samples_per_second": 112.362,
2486
+ "eval_steps_per_second": 14.049,
2487
+ "step": 29500
2488
+ },
2489
+ {
2490
+ "epoch": 2.8958567724893607,
2491
+ "grad_norm": 27.31865119934082,
2492
+ "learning_rate": 1.926003768662125e-06,
2493
+ "loss": 2.45,
2494
+ "step": 29600
2495
+ },
2496
+ {
2497
+ "epoch": 2.9056400723964195,
2498
+ "grad_norm": 18.966655731201172,
2499
+ "learning_rate": 1.7448180895781998e-06,
2500
+ "loss": 2.3916,
2501
+ "step": 29700
2502
+ },
2503
+ {
2504
+ "epoch": 2.915423372303478,
2505
+ "grad_norm": 18.538440704345703,
2506
+ "learning_rate": 1.5636324104942746e-06,
2507
+ "loss": 2.4625,
2508
+ "step": 29800
2509
+ },
2510
+ {
2511
+ "epoch": 2.9252066722105368,
2512
+ "grad_norm": 21.757272720336914,
2513
+ "learning_rate": 1.3824467314103494e-06,
2514
+ "loss": 2.3722,
2515
+ "step": 29900
2516
+ },
2517
+ {
2518
+ "epoch": 2.934989972117595,
2519
+ "grad_norm": 16.907358169555664,
2520
+ "learning_rate": 1.201261052326424e-06,
2521
+ "loss": 2.464,
2522
+ "step": 30000
2523
+ },
2524
+ {
2525
+ "epoch": 2.934989972117595,
2526
+ "eval_runtime": 181.9148,
2527
+ "eval_samples_per_second": 112.371,
2528
+ "eval_steps_per_second": 14.051,
2529
+ "step": 30000
2530
+ },
2531
+ {
2532
+ "epoch": 2.944773272024654,
2533
+ "grad_norm": 13.88399600982666,
2534
+ "learning_rate": 1.0200753732424989e-06,
2535
+ "loss": 2.5005,
2536
+ "step": 30100
2537
+ },
2538
+ {
2539
+ "epoch": 2.954556571931713,
2540
+ "grad_norm": 19.77507781982422,
2541
+ "learning_rate": 8.388896941585737e-07,
2542
+ "loss": 2.3829,
2543
+ "step": 30200
2544
+ },
2545
+ {
2546
+ "epoch": 2.9643398718387712,
2547
+ "grad_norm": 16.535932540893555,
2548
+ "learning_rate": 6.577040150746485e-07,
2549
+ "loss": 2.4788,
2550
+ "step": 30300
2551
+ },
2552
+ {
2553
+ "epoch": 2.9741231717458296,
2554
+ "grad_norm": 15.027000427246094,
2555
+ "learning_rate": 4.765183359907233e-07,
2556
+ "loss": 2.5007,
2557
+ "step": 30400
2558
+ },
2559
+ {
2560
+ "epoch": 2.9839064716528885,
2561
+ "grad_norm": 14.9392671585083,
2562
+ "learning_rate": 2.953326569067981e-07,
2563
+ "loss": 2.4847,
2564
+ "step": 30500
2565
+ },
2566
+ {
2567
+ "epoch": 2.9839064716528885,
2568
+ "eval_runtime": 181.9853,
2569
+ "eval_samples_per_second": 112.328,
2570
+ "eval_steps_per_second": 14.045,
2571
+ "step": 30500
2572
+ },
2573
+ {
2574
+ "epoch": 2.9936897715599473,
2575
+ "grad_norm": 15.128337860107422,
2576
+ "learning_rate": 1.1414697782287289e-07,
2577
+ "loss": 2.4209,
2578
+ "step": 30600
2579
+ }
2580
+ ],
2581
+ "logging_steps": 100,
2582
+ "max_steps": 30663,
2583
+ "num_input_tokens_seen": 0,
2584
+ "num_train_epochs": 3,
2585
+ "save_steps": 500,
2586
+ "stateful_callbacks": {
2587
+ "TrainerControl": {
2588
+ "args": {
2589
+ "should_epoch_stop": false,
2590
+ "should_evaluate": false,
2591
+ "should_log": false,
2592
+ "should_save": true,
2593
+ "should_training_stop": true
2594
+ },
2595
+ "attributes": {}
2596
+ }
2597
+ },
2598
+ "total_flos": 1.0701267610290972e+16,
2599
+ "train_batch_size": 8,
2600
+ "trial_name": null,
2601
+ "trial_params": null
2602
+ }