svjack committed on
Commit
ad6cce8
1 Parent(s): 52a461a

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +0 -328
  2. checkpoint-100/README.md +202 -0
  3. checkpoint-100/adapter_config.json +29 -0
  4. checkpoint-100/adapter_model.safetensors +3 -0
  5. checkpoint-100/optimizer.pt +3 -0
  6. checkpoint-100/rng_state.pth +3 -0
  7. checkpoint-100/scheduler.pt +3 -0
  8. checkpoint-100/special_tokens_map.json +24 -0
  9. checkpoint-100/tokenizer.json +0 -0
  10. checkpoint-100/tokenizer.model +3 -0
  11. checkpoint-100/tokenizer_config.json +0 -0
  12. checkpoint-100/trainer_state.json +173 -0
  13. checkpoint-100/training_args.bin +3 -0
  14. checkpoint-1000/README.md +202 -0
  15. checkpoint-1000/adapter_config.json +29 -0
  16. checkpoint-1000/adapter_model.safetensors +3 -0
  17. checkpoint-1000/optimizer.pt +3 -0
  18. checkpoint-1000/rng_state.pth +3 -0
  19. checkpoint-1000/scheduler.pt +3 -0
  20. checkpoint-1000/special_tokens_map.json +24 -0
  21. checkpoint-1000/tokenizer.json +0 -0
  22. checkpoint-1000/tokenizer.model +3 -0
  23. checkpoint-1000/tokenizer_config.json +0 -0
  24. checkpoint-1000/trainer_state.json +1433 -0
  25. checkpoint-1000/training_args.bin +3 -0
  26. checkpoint-1100/README.md +202 -0
  27. checkpoint-1100/adapter_config.json +29 -0
  28. checkpoint-1100/adapter_model.safetensors +3 -0
  29. checkpoint-1100/optimizer.pt +3 -0
  30. checkpoint-1100/rng_state.pth +3 -0
  31. checkpoint-1100/scheduler.pt +3 -0
  32. checkpoint-1100/special_tokens_map.json +24 -0
  33. checkpoint-1100/tokenizer.json +0 -0
  34. checkpoint-1100/tokenizer.model +3 -0
  35. checkpoint-1100/tokenizer_config.json +0 -0
  36. checkpoint-1100/trainer_state.json +1573 -0
  37. checkpoint-1100/training_args.bin +3 -0
  38. checkpoint-1200/README.md +202 -0
  39. checkpoint-1200/adapter_config.json +29 -0
  40. checkpoint-1200/adapter_model.safetensors +3 -0
  41. checkpoint-1200/optimizer.pt +3 -0
  42. checkpoint-1200/rng_state.pth +3 -0
  43. checkpoint-1200/scheduler.pt +3 -0
  44. checkpoint-1200/special_tokens_map.json +24 -0
  45. checkpoint-1200/tokenizer.json +0 -0
  46. checkpoint-1200/tokenizer.model +3 -0
  47. checkpoint-1200/tokenizer_config.json +0 -0
  48. checkpoint-1200/trainer_state.json +1713 -0
  49. checkpoint-1200/training_args.bin +3 -0
  50. checkpoint-1300/README.md +202 -0
README.md CHANGED
@@ -14,334 +14,6 @@ model-index:
14
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
  should probably proofread and complete it, then remove this comment. -->
16
 
17
- # Install
18
- ```bash
19
- pip install peft transformers bitsandbytes ipykernel rapidfuzz
20
- ```
21
- # Run by transformers
22
-
23
- ```python
24
- import json
25
- from dataclasses import dataclass
26
- from enum import Enum
27
- from typing import List, Dict, Tuple, Literal
28
-
29
- class Roles(Enum):
30
- system = "system"
31
- user = "user"
32
- assistant = "assistant"
33
- tool = "tool"
34
-
35
- class MessagesFormatterType(Enum):
36
- """
37
- Enum representing different types of predefined messages formatters.
38
- """
39
-
40
- MISTRAL = 1
41
-
42
- @dataclass
43
- class PromptMarkers:
44
- start: str
45
- end: str
46
-
47
- class MessagesFormatter:
48
- def __init__(
49
- self,
50
- pre_prompt: str,
51
- prompt_markers: Dict[Roles, PromptMarkers],
52
- include_sys_prompt_in_first_user_message: bool,
53
- default_stop_sequences: List[str],
54
- use_user_role_for_function_call_result: bool = True,
55
- strip_prompt: bool = True,
56
- bos_token: str = "<s>",
57
- eos_token: str = "</s>"
58
- ):
59
- self.pre_prompt = pre_prompt
60
- self.prompt_markers = prompt_markers
61
- self.include_sys_prompt_in_first_user_message = include_sys_prompt_in_first_user_message
62
- self.default_stop_sequences = default_stop_sequences
63
- self.use_user_role_for_function_call_result = use_user_role_for_function_call_result
64
- self.strip_prompt = strip_prompt
65
- self.bos_token = bos_token
66
- self.eos_token = eos_token
67
- self.added_system_prompt = False
68
-
69
- def get_bos_token(self) -> str:
70
- return self.bos_token
71
-
72
- def format_conversation(
73
- self,
74
- messages: List[Dict[str, str]],
75
- response_role: Literal[Roles.user, Roles.assistant] | None = None,
76
- ) -> Tuple[str, Roles]:
77
- formatted_messages = self.pre_prompt
78
- last_role = Roles.assistant
79
- self.added_system_prompt = False
80
- for message in messages:
81
- role = Roles(message["role"])
82
- content = self._format_message_content(message["content"], role)
83
-
84
- if role == Roles.system:
85
- formatted_messages += self._format_system_message(content)
86
- last_role = Roles.system
87
- elif role == Roles.user:
88
- formatted_messages += self._format_user_message(content)
89
- last_role = Roles.user
90
- elif role == Roles.assistant:
91
- formatted_messages += self._format_assistant_message(content)
92
- last_role = Roles.assistant
93
- elif role == Roles.tool:
94
- formatted_messages += self._format_tool_message(content)
95
- last_role = Roles.tool
96
-
97
- return self._format_response(formatted_messages, last_role, response_role)
98
-
99
- def _format_message_content(self, content: str, role: Roles) -> str:
100
- if self.strip_prompt:
101
- return content.strip()
102
- return content
103
-
104
- def _format_system_message(self, content: str) -> str:
105
- formatted_message = self.prompt_markers[Roles.system].start + content + self.prompt_markers[Roles.system].end
106
- self.added_system_prompt = True
107
- if self.include_sys_prompt_in_first_user_message:
108
- formatted_message = self.prompt_markers[Roles.user].start + formatted_message
109
- return formatted_message
110
-
111
- def _format_user_message(self, content: str) -> str:
112
- if self.include_sys_prompt_in_first_user_message and self.added_system_prompt:
113
- self.added_system_prompt = False
114
- return content + self.prompt_markers[Roles.user].end
115
- return self.prompt_markers[Roles.user].start + content + self.prompt_markers[Roles.user].end
116
-
117
- def _format_assistant_message(self, content: str) -> str:
118
- return self.prompt_markers[Roles.assistant].start + content + self.prompt_markers[Roles.assistant].end
119
-
120
- def _format_tool_message(self, content: str) -> str:
121
- if isinstance(content, list):
122
- content = "\n".join(json.dumps(m, indent=2) for m in content)
123
- if self.use_user_role_for_function_call_result:
124
- return self._format_user_message(content)
125
- else:
126
- return self.prompt_markers[Roles.tool].start + content + self.prompt_markers[Roles.tool].end
127
-
128
- def _format_response(
129
- self,
130
- formatted_messages: str,
131
- last_role: Roles,
132
- response_role: Literal[Roles.user, Roles.assistant] | None = None,
133
- ) -> Tuple[str, Roles]:
134
- if response_role is None:
135
- response_role = Roles.assistant if last_role != Roles.assistant else Roles.user
136
-
137
- prompt_start = self.prompt_markers[response_role].start.strip() if self.strip_prompt else self.prompt_markers[
138
- response_role].start
139
- return formatted_messages + prompt_start, response_role
140
-
141
- mixtral_prompt_markers = {
142
- Roles.system: PromptMarkers("", """\n\n"""),
143
- Roles.user: PromptMarkers("""[INST] """, """ [/INST]"""),
144
- Roles.assistant: PromptMarkers("""""", """</s>"""),
145
- Roles.tool: PromptMarkers("", ""),
146
- }
147
-
148
- mixtral_formatter = MessagesFormatter(
149
- "",
150
- mixtral_prompt_markers,
151
- True,
152
- ["</s>"],
153
- )
154
-
155
- from transformers import TextStreamer, AutoTokenizer, AutoModelForCausalLM
156
- from peft import PeftModel
157
- tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3",)
158
- mis_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3", load_in_4bit = True)
159
- mis_model = PeftModel.from_pretrained(mis_model, "svjack/Genshin_Impact_Mistral_v3_Plot_Chat_roleplay_chat_lora_small")
160
- mis_model = mis_model.eval()
161
-
162
- streamer = TextStreamer(tokenizer)
163
-
164
- def mistral_hf_predict(messages, mis_model = mis_model,
165
- tokenizer = tokenizer, streamer = streamer,
166
- do_sample = True,
167
- top_p = 0.95,
168
- top_k = 40,
169
- max_new_tokens = 512,
170
- max_input_length = 3500,
171
- temperature = 0.9,
172
- repetition_penalty = 1.0,
173
- device = "cuda"):
174
-
175
- #encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
176
- #model_inputs = encodeds.to(device)
177
- prompt, _ = mixtral_formatter.format_conversation(messages)
178
- model_inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
179
-
180
- generated_ids = mis_model.generate(model_inputs, max_new_tokens=max_new_tokens,
181
- do_sample=do_sample,
182
- streamer = streamer,
183
- top_p = top_p,
184
- top_k = top_k,
185
- temperature = temperature,
186
- repetition_penalty = repetition_penalty,
187
- )
188
- out = tokenizer.batch_decode(generated_ids)[0].split("[/INST]")[-1].replace("</s>", "").strip()
189
- return out
190
-
191
- out = mistral_hf_predict([
192
- {
193
- "role": "system",
194
- "content": '''
195
- 故事背景:图书管理员丽莎与助手派蒙在寻找偷书者的冒险中交流,揭示了真相并处理了书籍问题。
196
- 当前故事背景:对话开始时,派蒙对蒙德人的居住习惯发表不当评价,丽莎纠正他并暗示可能是捣乱分子所为,随后讨论了丘丘人不会偷窃和可能性更大的深渊法师。在解开封印后,他们进入遗迹,并决定继续深入调查。
197
- 参与者1:丽莎
198
- 参与者1角色经历:丽莎,作为蒙德城南风之狮庙宇的图书管理员,以其严肃认真的工作态度和对书籍的热爱,与旅行者派蒙共同解决图书丢失的问题。她运用元素感知力帮助找寻线索,与伙伴们互动,展现智慧和勇气,同时对偷书者的行为有着坚定的立场,通过惩罚计划来维护图书的尊严。在游戏中,她不仅提供历史背景,还作为知识库,帮助旅行者理解元素和蒙德的历史,她的存在对解决故事中的谜题和对抗敌人至关重要。在蒙德骑士团中,丽莎也协助凯亚和琴,展现她的团队精神和对守护者的责任感。
199
- 参与者1性格特征:丽莎性格严谨,热爱工作,尊重他人,对待偷书者的行为表现出坚定和公正。她聪明且勇敢,善于使用元素感知力解决问题,同时具有深厚的历史知识和对‘四风守护’的理解。她的智慧和责任感在剧情中起到了关键作用。
200
- 参与者1剧情中的作用:丽莎在剧情中扮演了知识导师和行动伙伴的角色,她的存在丰富了角色设定,通过她的帮助,旅行者得以更深入地理解和应对元素世界。她的出现推动了故事的发展,通过她的智慧和勇气,解决了许多难题,强化了角色间的互动和团队合作。同时,她的责任感和对蒙德的热爱也深化了游戏的主题,体现了对守护者的尊重和对家乡的忠诚。
201
- 参与者2:派蒙
202
- 参与者2角色经历:派蒙是宵宫的旅伴,他们共同面对船的问题,逃离稻妻,与森彦、阿创等人互动,参与烟花制作,解决孩子们的误解。派蒙对宵宫的强硬态度感到惊讶,但理解了沟通的力量。他与旅行者的关系亲密,共享冒险,是故事中的重要角色。他参与了对抗魔物、解决兽境猎犬事件,以及帮助影解决雷电将军的问题,表现出决心和行动力。在与凯亚的冒险中,他们共同解谜寻宝,面对误解,派蒙逐渐理解和接受‘黎明神剑’作为友谊的象征。在‘在此世的星空之外’,他与莫娜一起经历了寻找丢失戒指、解决朱老板问题和烹饪学习的挑战,加深了他们的信任和支持。
203
- 参与者2性格特征:派蒙勇敢、坚韧,对友情有着深刻的理解。他不畏困难,愿意面对挑战,同时也有着对宵宫处理问题方式的惊讶和理解。他与旅行者的关系亲密,显示出忠诚和共享冒险的精神。在与凯亚的互动中,他展现出对友谊的接纳和对占星术的好奇。
204
- 参与者2剧情中的作用:派蒙在剧情中起着多面作用,他不仅是宵宫的伙伴,也是旅行者和凯亚的友人。他的冒险经历丰富了故事的层次,通过他的角色,展示了沟通的力量、友情的重要性,以及面对问题时的决心。他的成长和理解深化了故事的主题,尤其是在处理误解和接受‘黎明神剑’象征友谊的过程中。同时,他与莫娜的合作也展示了角色之间的互动和信任,增加了故事的复杂性和深度。
205
- 要求进行"丽莎"与"派蒙"之间的对话。
206
- 我扮演"丽莎",你扮演"派蒙"。
207
- '''
208
- },
209
- {
210
- "role": "user",
211
- "content": "小派蒙,你如何评价蒙德人的习惯?"
212
- },
213
- {
214
- "role": "assistant",
215
- "content": "我觉得他们太懒散了,总是把东西放在家里,而且家里总是乱七八糟。"
216
- },
217
- {
218
- "role": "user",
219
- "content": "嘘,不要当着他们的面说。"
220
- },
221
- {
222
- "role": "assistant",
223
- "content": "那你认为是谁在偷书?"
224
- },
225
- {
226
- "role": "user",
227
- "content": "你认为可能是丘丘人吗?"
228
- },
229
- {
230
- "role": "assistant",
231
- "content": "丘丘人不会这样吧,他们不会偷东西。"
232
- },
233
- {
234
- "role": "user",
235
- "content": "喜欢暗中计划阴谋的深渊法师可能性比较大。"
236
- },
237
- {
238
- "role": "assistant",
239
- "content": "你们确定那些偷书的人不是从深渊来的吗?"
240
- },
241
- {
242
- "role": "user",
243
- "content": "说不定秘境里就有答案。"
244
- },
245
- ],
246
- repetition_penalty = 1.0,
247
- temperature = 0.5,
248
- max_new_tokens=1024
249
- )
250
- print(out)
251
- ```
252
-
253
- # Output
254
- ```
255
- 那我们进去看看,你准备好了吗?
256
- ```
257
-
258
-
259
- ```python
260
- from rapidfuzz import fuzz
261
- from IPython.display import clear_output
262
- def run_step_infer_times(x, times = 5, temperature = 0.01,
263
- repetition_penalty = 1.0,
264
- sim_val = 70
265
- ):
266
- req = []
267
- for _ in range(times):
268
- clear_output(wait = True)
269
- out = mistral_hf_predict([
270
- {
271
- "role": "system",
272
- "content": ""
273
- },
274
- {
275
- "role": "user",
276
- "content": x
277
- },
278
- ],
279
- repetition_penalty = repetition_penalty,
280
- temperature = temperature,
281
- max_new_tokens = 2070,
282
- max_input_length = 6000,
283
- )
284
- if req:
285
- val = max(map(lambda x: fuzz.ratio(x, out), req))
286
- #print(val)
287
- #print(req)
288
- if val < sim_val:
289
- req.append(out.strip())
290
- x = x.strip() + "\n" + out.strip()
291
- else:
292
- req.append(out.strip())
293
- x = x.strip() + "\n" + out.strip()
294
- return req
295
-
296
- out_l = run_step_infer_times(
297
- '''
298
- 故事标题:为了没有眼泪的明天
299
- 故事背景:旅行者与琴、派蒙在蒙德城中经历了一系列事件,从元素流动回归、处理外交问题到对抗魔龙和寻找解决之道。他们偶遇吟游诗人温迪,后者提供了关于风神与巨龙的关键信息,并提出了借琴解救蒙德的计划。
300
- 参与角色:派蒙、旅行者、琴、丽莎、温迪、歌特琳德
301
- ''',
302
- temperature=0.1,
303
- repetition_penalty = 1.0,
304
- times = 10
305
- )
306
- clear_output(wait = True)
307
-
308
- print("\n".join(out_l))
309
- ```
310
-
311
- # Output
312
- ```
313
- {'参与者1': '派蒙', '参与者2': '旅行者', '当前故事背景': '两人在蒙德城中寻找琴,并在遇到温迪后得知琴可能在城内。'}
314
- {'参与者1': '琴', '参与者2': '丽莎', '当前故事背景': '琴与丽莎交谈,丽莎提出对琴的担忧和对琴的支持,以及对琴的信任和理解。'}
315
- {'参与者1': '温迪', '参与者2': '派蒙', '当前故事背景': '温迪提出借琴解救蒙德的计划,并提供了关于风神与巨龙的信息。'}
316
- {'参与者1': '琴', '参与者2': '温迪', '当前故事背景': '琴对温迪的提议表示理解,并准备接受任务。'}
317
- ```
318
-
319
- ```python
320
- out_l = run_step_infer_times(
321
- '''
322
- 故事标题:归乡
323
- 故事背景:在须弥城门口,派蒙与纳西妲偶遇并帮助一只昏迷的元素生命找寻家园。过程中揭示了这只生物并非普通的蕈兽,而是元素生物,并且它们曾受到过‘末日’的影响,家园被侵蚀。纳西妲回忆起晶体里的力量可能与一个预言有关,为了拯救它们的家园,她必须解决‘禁忌知识’问题,但这个过程对她自身也会产生干扰。
324
- 参与角色:派蒙、纳西妲、浮游水蕈兽、旅行者
325
- ''',
326
- temperature=0.1,
327
- repetition_penalty = 1.0,
328
- times = 10
329
- )
330
- clear_output(wait = True)
331
-
332
- print("\n".join(out_l))
333
- ```
334
-
335
- # Output
336
- ```
337
- {'参与者1': '派蒙', '参与者2': '纳西妲', '当前故事背景': '在须弥城门口,派蒙发现了一个昏迷的浮游水蕈兽,并询问它是否需要帮助。纳西妲注意到这只生物并提出要帮助它们找回家。'}
338
- {'参与者1': '派蒙', '参与者2': '纳西妲', '当前故事背景': '纳西妲解释了这只生物并非普通的蕈兽,而是元素生物,它们的家园被侵蚀,并且晶体里的力量可能与一个预言有关。'}
339
- {'参与者1': '派蒙', '参与者2': '纳西妲', '当前故事背景': '纳西妲提出解决‘禁忌知识’问题,这可能与拯救元素生物的家园有关,但这个过程对她自身也会产生影响。'}
340
- {'参与者1': '派蒙', '参与者2': '纳西妲', '当前故事背景': '派蒙询问‘禁忌知识’的具体内容,纳西妲提出这是为了解决元素生物的问题。'}
341
- {'参与者1': '纳西妲', '参与者2': '旅行者', '当前故事背景': '纳西妲提出解决‘禁忌知识’的问题,旅行者对此表示惊讶。'}
342
- ```
343
-
344
-
345
  # train_2024-05-29-13-19-55
346
 
347
  This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) on the plot_genshin_impact_roleplay_agent_vllm, the genshin_impact_plot_engine_step_inst_short_json and the plot_genshin_impact_roleplay_vllm datasets.
 
14
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
  should probably proofread and complete it, then remove this comment. -->
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  # train_2024-05-29-13-19-55
18
 
19
  This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) on the plot_genshin_impact_roleplay_agent_vllm, the genshin_impact_plot_engine_step_inst_short_json and the plot_genshin_impact_roleplay_vllm datasets.
checkpoint-100/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: mistralai/Mistral-7B-Instruct-v0.3
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1
checkpoint-100/adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "v_proj",
24
+ "q_proj"
25
+ ],
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
checkpoint-100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8810a3e71ecfb5477728078a76d43cd3e90119d3064f5b3bd630adf765600eb0
3
+ size 13648432
checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e950fe7d5935b2f67d3a37cc7f2f64ba68216769e31e7e2cd589a926787b3e8
3
+ size 27370618
checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
3
+ size 14244
checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d4048dab0aa064043ebda2bfc98cac77a60f76880c6e52509dd34cc13216ef1
3
+ size 1064
checkpoint-100/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-100/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
3
+ size 587404
checkpoint-100/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.0836382645060115,
5
+ "eval_steps": 500,
6
+ "global_step": 100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.004181913225300575,
13
+ "grad_norm": 3.3295910358428955,
14
+ "learning_rate": 4.9999760022374266e-05,
15
+ "loss": 1.8032,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.00836382645060115,
20
+ "grad_norm": 2.002122163772583,
21
+ "learning_rate": 4.999904009410418e-05,
22
+ "loss": 1.5943,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.012545739675901725,
27
+ "grad_norm": 1.9293591976165771,
28
+ "learning_rate": 4.9997840229011085e-05,
29
+ "loss": 1.5378,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.0167276529012023,
34
+ "grad_norm": 1.936969518661499,
35
+ "learning_rate": 4.999616045013025e-05,
36
+ "loss": 1.5856,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.020909566126502875,
41
+ "grad_norm": 1.8237314224243164,
42
+ "learning_rate": 4.9994000789710415e-05,
43
+ "loss": 1.5066,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.02509147935180345,
48
+ "grad_norm": 1.8101543188095093,
49
+ "learning_rate": 4.9991361289213203e-05,
50
+ "loss": 1.4499,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.029273392577104027,
55
+ "grad_norm": 1.647802710533142,
56
+ "learning_rate": 4.998824199931228e-05,
57
+ "loss": 1.4694,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.0334553058024046,
62
+ "grad_norm": 1.7077428102493286,
63
+ "learning_rate": 4.998464297989245e-05,
64
+ "loss": 1.4945,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.03763721902770518,
69
+ "grad_norm": 1.7119005918502808,
70
+ "learning_rate": 4.998056430004844e-05,
71
+ "loss": 1.4257,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 0.04181913225300575,
76
+ "grad_norm": 2.0229578018188477,
77
+ "learning_rate": 4.997600603808359e-05,
78
+ "loss": 1.4713,
79
+ "step": 50
80
+ },
81
+ {
82
+ "epoch": 0.04600104547830632,
83
+ "grad_norm": 1.885362148284912,
84
+ "learning_rate": 4.997096828150838e-05,
85
+ "loss": 1.5145,
86
+ "step": 55
87
+ },
88
+ {
89
+ "epoch": 0.0501829587036069,
90
+ "grad_norm": 2.0357539653778076,
91
+ "learning_rate": 4.9965451127038714e-05,
92
+ "loss": 1.3811,
93
+ "step": 60
94
+ },
95
+ {
96
+ "epoch": 0.054364871928907474,
97
+ "grad_norm": 1.9780900478363037,
98
+ "learning_rate": 4.9959454680594086e-05,
99
+ "loss": 1.412,
100
+ "step": 65
101
+ },
102
+ {
103
+ "epoch": 0.05854678515420805,
104
+ "grad_norm": 1.6695700883865356,
105
+ "learning_rate": 4.995297905729554e-05,
106
+ "loss": 1.4093,
107
+ "step": 70
108
+ },
109
+ {
110
+ "epoch": 0.06272869837950862,
111
+ "grad_norm": 1.9424223899841309,
112
+ "learning_rate": 4.994602438146344e-05,
113
+ "loss": 1.4615,
114
+ "step": 75
115
+ },
116
+ {
117
+ "epoch": 0.0669106116048092,
118
+ "grad_norm": 1.9082552194595337,
119
+ "learning_rate": 4.9938590786615126e-05,
120
+ "loss": 1.4169,
121
+ "step": 80
122
+ },
123
+ {
124
+ "epoch": 0.07109252483010978,
125
+ "grad_norm": 1.8944685459136963,
126
+ "learning_rate": 4.993067841546231e-05,
127
+ "loss": 1.3064,
128
+ "step": 85
129
+ },
130
+ {
131
+ "epoch": 0.07527443805541036,
132
+ "grad_norm": 1.9688103199005127,
133
+ "learning_rate": 4.992228741990834e-05,
134
+ "loss": 1.3778,
135
+ "step": 90
136
+ },
137
+ {
138
+ "epoch": 0.07945635128071092,
139
+ "grad_norm": 2.0939650535583496,
140
+ "learning_rate": 4.991341796104534e-05,
141
+ "loss": 1.3905,
142
+ "step": 95
143
+ },
144
+ {
145
+ "epoch": 0.0836382645060115,
146
+ "grad_norm": 1.868986964225769,
147
+ "learning_rate": 4.9904070209151015e-05,
148
+ "loss": 1.3723,
149
+ "step": 100
150
+ }
151
+ ],
152
+ "logging_steps": 5,
153
+ "max_steps": 3585,
154
+ "num_input_tokens_seen": 0,
155
+ "num_train_epochs": 3,
156
+ "save_steps": 100,
157
+ "stateful_callbacks": {
158
+ "TrainerControl": {
159
+ "args": {
160
+ "should_epoch_stop": false,
161
+ "should_evaluate": false,
162
+ "should_log": false,
163
+ "should_save": true,
164
+ "should_training_stop": false
165
+ },
166
+ "attributes": {}
167
+ }
168
+ },
169
+ "total_flos": 1.2914497408794624e+17,
170
+ "train_batch_size": 2,
171
+ "trial_name": null,
172
+ "trial_params": null
173
+ }
checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ad489884f87ec96cc1e49b25622db5c6c3c1eafcad1be5306265a7460b6619a
3
+ size 5304
checkpoint-1000/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: mistralai/Mistral-7B-Instruct-v0.3
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1
checkpoint-1000/adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "v_proj",
24
+ "q_proj"
25
+ ],
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
checkpoint-1000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55f95e09bf59843911eb652f7d3b822b548c83bf5275af6682e92fdb97d8a6b6
3
+ size 13648432
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1568d2408b6a88982c138d5c10ebf316f5e8fd671f5619115a935449cc7ea9b
3
+ size 27370618
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
3
+ size 14244
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21ad6c58bb3ecc1231a76b4dac6708c1d3dd5b06712d29800b533c8723f9de03
3
+ size 1064
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-1000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
3
+ size 587404
checkpoint-1000/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,1433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.836382645060115,
5
+ "eval_steps": 500,
6
+ "global_step": 1000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.004181913225300575,
13
+ "grad_norm": 3.3295910358428955,
14
+ "learning_rate": 4.9999760022374266e-05,
15
+ "loss": 1.8032,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.00836382645060115,
20
+ "grad_norm": 2.002122163772583,
21
+ "learning_rate": 4.999904009410418e-05,
22
+ "loss": 1.5943,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.012545739675901725,
27
+ "grad_norm": 1.9293591976165771,
28
+ "learning_rate": 4.9997840229011085e-05,
29
+ "loss": 1.5378,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.0167276529012023,
34
+ "grad_norm": 1.936969518661499,
35
+ "learning_rate": 4.999616045013025e-05,
36
+ "loss": 1.5856,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.020909566126502875,
41
+ "grad_norm": 1.8237314224243164,
42
+ "learning_rate": 4.9994000789710415e-05,
43
+ "loss": 1.5066,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.02509147935180345,
48
+ "grad_norm": 1.8101543188095093,
49
+ "learning_rate": 4.9991361289213203e-05,
50
+ "loss": 1.4499,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.029273392577104027,
55
+ "grad_norm": 1.647802710533142,
56
+ "learning_rate": 4.998824199931228e-05,
57
+ "loss": 1.4694,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.0334553058024046,
62
+ "grad_norm": 1.7077428102493286,
63
+ "learning_rate": 4.998464297989245e-05,
64
+ "loss": 1.4945,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.03763721902770518,
69
+ "grad_norm": 1.7119005918502808,
70
+ "learning_rate": 4.998056430004844e-05,
71
+ "loss": 1.4257,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 0.04181913225300575,
76
+ "grad_norm": 2.0229578018188477,
77
+ "learning_rate": 4.997600603808359e-05,
78
+ "loss": 1.4713,
79
+ "step": 50
80
+ },
81
+ {
82
+ "epoch": 0.04600104547830632,
83
+ "grad_norm": 1.885362148284912,
84
+ "learning_rate": 4.997096828150838e-05,
85
+ "loss": 1.5145,
86
+ "step": 55
87
+ },
88
+ {
89
+ "epoch": 0.0501829587036069,
90
+ "grad_norm": 2.0357539653778076,
91
+ "learning_rate": 4.9965451127038714e-05,
92
+ "loss": 1.3811,
93
+ "step": 60
94
+ },
95
+ {
96
+ "epoch": 0.054364871928907474,
97
+ "grad_norm": 1.9780900478363037,
98
+ "learning_rate": 4.9959454680594086e-05,
99
+ "loss": 1.412,
100
+ "step": 65
101
+ },
102
+ {
103
+ "epoch": 0.05854678515420805,
104
+ "grad_norm": 1.6695700883865356,
105
+ "learning_rate": 4.995297905729554e-05,
106
+ "loss": 1.4093,
107
+ "step": 70
108
+ },
109
+ {
110
+ "epoch": 0.06272869837950862,
111
+ "grad_norm": 1.9424223899841309,
112
+ "learning_rate": 4.994602438146344e-05,
113
+ "loss": 1.4615,
114
+ "step": 75
115
+ },
116
+ {
117
+ "epoch": 0.0669106116048092,
118
+ "grad_norm": 1.9082552194595337,
119
+ "learning_rate": 4.9938590786615126e-05,
120
+ "loss": 1.4169,
121
+ "step": 80
122
+ },
123
+ {
124
+ "epoch": 0.07109252483010978,
125
+ "grad_norm": 1.8944685459136963,
126
+ "learning_rate": 4.993067841546231e-05,
127
+ "loss": 1.3064,
128
+ "step": 85
129
+ },
130
+ {
131
+ "epoch": 0.07527443805541036,
132
+ "grad_norm": 1.9688103199005127,
133
+ "learning_rate": 4.992228741990834e-05,
134
+ "loss": 1.3778,
135
+ "step": 90
136
+ },
137
+ {
138
+ "epoch": 0.07945635128071092,
139
+ "grad_norm": 2.0939650535583496,
140
+ "learning_rate": 4.991341796104534e-05,
141
+ "loss": 1.3905,
142
+ "step": 95
143
+ },
144
+ {
145
+ "epoch": 0.0836382645060115,
146
+ "grad_norm": 1.868986964225769,
147
+ "learning_rate": 4.9904070209151015e-05,
148
+ "loss": 1.3723,
149
+ "step": 100
150
+ },
151
+ {
152
+ "epoch": 0.08782017773131208,
153
+ "grad_norm": 2.069476842880249,
154
+ "learning_rate": 4.989424434368549e-05,
155
+ "loss": 1.4046,
156
+ "step": 105
157
+ },
158
+ {
159
+ "epoch": 0.09200209095661264,
160
+ "grad_norm": 2.1500356197357178,
161
+ "learning_rate": 4.988394055328779e-05,
162
+ "loss": 1.4319,
163
+ "step": 110
164
+ },
165
+ {
166
+ "epoch": 0.09618400418191322,
167
+ "grad_norm": 1.9786696434020996,
168
+ "learning_rate": 4.987315903577223e-05,
169
+ "loss": 1.4203,
170
+ "step": 115
171
+ },
172
+ {
173
+ "epoch": 0.1003659174072138,
174
+ "grad_norm": 2.035172939300537,
175
+ "learning_rate": 4.986189999812468e-05,
176
+ "loss": 1.4046,
177
+ "step": 120
178
+ },
179
+ {
180
+ "epoch": 0.10454783063251437,
181
+ "grad_norm": 1.9887930154800415,
182
+ "learning_rate": 4.985016365649848e-05,
183
+ "loss": 1.3836,
184
+ "step": 125
185
+ },
186
+ {
187
+ "epoch": 0.10872974385781495,
188
+ "grad_norm": 1.8339554071426392,
189
+ "learning_rate": 4.983795023621041e-05,
190
+ "loss": 1.3468,
191
+ "step": 130
192
+ },
193
+ {
194
+ "epoch": 0.11291165708311553,
195
+ "grad_norm": 2.06648588180542,
196
+ "learning_rate": 4.982525997173625e-05,
197
+ "loss": 1.3481,
198
+ "step": 135
199
+ },
200
+ {
201
+ "epoch": 0.1170935703084161,
202
+ "grad_norm": 2.1179702281951904,
203
+ "learning_rate": 4.9812093106706376e-05,
204
+ "loss": 1.4475,
205
+ "step": 140
206
+ },
207
+ {
208
+ "epoch": 0.12127548353371667,
209
+ "grad_norm": 2.1708436012268066,
210
+ "learning_rate": 4.979844989390104e-05,
211
+ "loss": 1.3662,
212
+ "step": 145
213
+ },
214
+ {
215
+ "epoch": 0.12545739675901724,
216
+ "grad_norm": 2.0533807277679443,
217
+ "learning_rate": 4.978433059524548e-05,
218
+ "loss": 1.3856,
219
+ "step": 150
220
+ },
221
+ {
222
+ "epoch": 0.12963930998431783,
223
+ "grad_norm": 1.9418696165084839,
224
+ "learning_rate": 4.976973548180498e-05,
225
+ "loss": 1.357,
226
+ "step": 155
227
+ },
228
+ {
229
+ "epoch": 0.1338212232096184,
230
+ "grad_norm": 2.1071596145629883,
231
+ "learning_rate": 4.975466483377959e-05,
232
+ "loss": 1.3613,
233
+ "step": 160
234
+ },
235
+ {
236
+ "epoch": 0.138003136434919,
237
+ "grad_norm": 2.3704464435577393,
238
+ "learning_rate": 4.9739118940498766e-05,
239
+ "loss": 1.3985,
240
+ "step": 165
241
+ },
242
+ {
243
+ "epoch": 0.14218504966021955,
244
+ "grad_norm": 2.1726176738739014,
245
+ "learning_rate": 4.9723098100415844e-05,
246
+ "loss": 1.4361,
247
+ "step": 170
248
+ },
249
+ {
250
+ "epoch": 0.14636696288552012,
251
+ "grad_norm": 2.608517646789551,
252
+ "learning_rate": 4.970660262110227e-05,
253
+ "loss": 1.3565,
254
+ "step": 175
255
+ },
256
+ {
257
+ "epoch": 0.1505488761108207,
258
+ "grad_norm": 2.185563564300537,
259
+ "learning_rate": 4.968963281924173e-05,
260
+ "loss": 1.2957,
261
+ "step": 180
262
+ },
263
+ {
264
+ "epoch": 0.15473078933612128,
265
+ "grad_norm": 1.9632441997528076,
266
+ "learning_rate": 4.967218902062403e-05,
267
+ "loss": 1.2833,
268
+ "step": 185
269
+ },
270
+ {
271
+ "epoch": 0.15891270256142184,
272
+ "grad_norm": 2.303462028503418,
273
+ "learning_rate": 4.96542715601389e-05,
274
+ "loss": 1.3238,
275
+ "step": 190
276
+ },
277
+ {
278
+ "epoch": 0.16309461578672244,
279
+ "grad_norm": 2.3211796283721924,
280
+ "learning_rate": 4.9635880781769495e-05,
281
+ "loss": 1.3848,
282
+ "step": 195
283
+ },
284
+ {
285
+ "epoch": 0.167276529012023,
286
+ "grad_norm": 2.0993776321411133,
287
+ "learning_rate": 4.961701703858584e-05,
288
+ "loss": 1.3997,
289
+ "step": 200
290
+ },
291
+ {
292
+ "epoch": 0.17145844223732357,
293
+ "grad_norm": 2.210942506790161,
294
+ "learning_rate": 4.9597680692738056e-05,
295
+ "loss": 1.334,
296
+ "step": 205
297
+ },
298
+ {
299
+ "epoch": 0.17564035546262416,
300
+ "grad_norm": 2.1277225017547607,
301
+ "learning_rate": 4.957787211544935e-05,
302
+ "loss": 1.3338,
303
+ "step": 210
304
+ },
305
+ {
306
+ "epoch": 0.17982226868792472,
307
+ "grad_norm": 2.287722587585449,
308
+ "learning_rate": 4.9557591687008966e-05,
309
+ "loss": 1.3621,
310
+ "step": 215
311
+ },
312
+ {
313
+ "epoch": 0.1840041819132253,
314
+ "grad_norm": 2.113842010498047,
315
+ "learning_rate": 4.9536839796764825e-05,
316
+ "loss": 1.3808,
317
+ "step": 220
318
+ },
319
+ {
320
+ "epoch": 0.18818609513852588,
321
+ "grad_norm": 2.0642735958099365,
322
+ "learning_rate": 4.951561684311608e-05,
323
+ "loss": 1.3429,
324
+ "step": 225
325
+ },
326
+ {
327
+ "epoch": 0.19236800836382645,
328
+ "grad_norm": 2.2026379108428955,
329
+ "learning_rate": 4.9493923233505435e-05,
330
+ "loss": 1.2855,
331
+ "step": 230
332
+ },
333
+ {
334
+ "epoch": 0.196549921589127,
335
+ "grad_norm": 2.2843308448791504,
336
+ "learning_rate": 4.947175938441138e-05,
337
+ "loss": 1.3432,
338
+ "step": 235
339
+ },
340
+ {
341
+ "epoch": 0.2007318348144276,
342
+ "grad_norm": 2.126749038696289,
343
+ "learning_rate": 4.9449125721340145e-05,
344
+ "loss": 1.3753,
345
+ "step": 240
346
+ },
347
+ {
348
+ "epoch": 0.20491374803972817,
349
+ "grad_norm": 2.514599561691284,
350
+ "learning_rate": 4.942602267881755e-05,
351
+ "loss": 1.3101,
352
+ "step": 245
353
+ },
354
+ {
355
+ "epoch": 0.20909566126502874,
356
+ "grad_norm": 2.2043542861938477,
357
+ "learning_rate": 4.940245070038064e-05,
358
+ "loss": 1.3395,
359
+ "step": 250
360
+ },
361
+ {
362
+ "epoch": 0.21327757449032933,
363
+ "grad_norm": 2.152742862701416,
364
+ "learning_rate": 4.937841023856923e-05,
365
+ "loss": 1.3188,
366
+ "step": 255
367
+ },
368
+ {
369
+ "epoch": 0.2174594877156299,
370
+ "grad_norm": 2.221057415008545,
371
+ "learning_rate": 4.935390175491716e-05,
372
+ "loss": 1.4205,
373
+ "step": 260
374
+ },
375
+ {
376
+ "epoch": 0.2216414009409305,
377
+ "grad_norm": 2.600109100341797,
378
+ "learning_rate": 4.932892571994342e-05,
379
+ "loss": 1.3499,
380
+ "step": 265
381
+ },
382
+ {
383
+ "epoch": 0.22582331416623105,
384
+ "grad_norm": 2.2291276454925537,
385
+ "learning_rate": 4.9303482613143194e-05,
386
+ "loss": 1.3984,
387
+ "step": 270
388
+ },
389
+ {
390
+ "epoch": 0.23000522739153162,
391
+ "grad_norm": 2.338000774383545,
392
+ "learning_rate": 4.9277572922978586e-05,
393
+ "loss": 1.344,
394
+ "step": 275
395
+ },
396
+ {
397
+ "epoch": 0.2341871406168322,
398
+ "grad_norm": 2.3423988819122314,
399
+ "learning_rate": 4.925119714686928e-05,
400
+ "loss": 1.2696,
401
+ "step": 280
402
+ },
403
+ {
404
+ "epoch": 0.23836905384213278,
405
+ "grad_norm": 2.433870553970337,
406
+ "learning_rate": 4.9224355791182955e-05,
407
+ "loss": 1.3903,
408
+ "step": 285
409
+ },
410
+ {
411
+ "epoch": 0.24255096706743334,
412
+ "grad_norm": 2.281836986541748,
413
+ "learning_rate": 4.919704937122559e-05,
414
+ "loss": 1.3694,
415
+ "step": 290
416
+ },
417
+ {
418
+ "epoch": 0.24673288029273394,
419
+ "grad_norm": 2.2429168224334717,
420
+ "learning_rate": 4.916927841123159e-05,
421
+ "loss": 1.3541,
422
+ "step": 295
423
+ },
424
+ {
425
+ "epoch": 0.2509147935180345,
426
+ "grad_norm": 2.4559881687164307,
427
+ "learning_rate": 4.9141043444353674e-05,
428
+ "loss": 1.3795,
429
+ "step": 300
430
+ },
431
+ {
432
+ "epoch": 0.25509670674333507,
433
+ "grad_norm": 2.7529919147491455,
434
+ "learning_rate": 4.911234501265266e-05,
435
+ "loss": 1.386,
436
+ "step": 305
437
+ },
438
+ {
439
+ "epoch": 0.25927861996863566,
440
+ "grad_norm": 2.229973554611206,
441
+ "learning_rate": 4.9083183667087064e-05,
442
+ "loss": 1.3653,
443
+ "step": 310
444
+ },
445
+ {
446
+ "epoch": 0.26346053319393625,
447
+ "grad_norm": 2.38677716255188,
448
+ "learning_rate": 4.9053559967502535e-05,
449
+ "loss": 1.3254,
450
+ "step": 315
451
+ },
452
+ {
453
+ "epoch": 0.2676424464192368,
454
+ "grad_norm": 2.292898416519165,
455
+ "learning_rate": 4.9023474482621075e-05,
456
+ "loss": 1.3756,
457
+ "step": 320
458
+ },
459
+ {
460
+ "epoch": 0.2718243596445374,
461
+ "grad_norm": 2.08819317817688,
462
+ "learning_rate": 4.899292779003014e-05,
463
+ "loss": 1.3286,
464
+ "step": 325
465
+ },
466
+ {
467
+ "epoch": 0.276006272869838,
468
+ "grad_norm": 2.1810929775238037,
469
+ "learning_rate": 4.896192047617156e-05,
470
+ "loss": 1.3884,
471
+ "step": 330
472
+ },
473
+ {
474
+ "epoch": 0.2801881860951385,
475
+ "grad_norm": 2.2136948108673096,
476
+ "learning_rate": 4.893045313633025e-05,
477
+ "loss": 1.2723,
478
+ "step": 335
479
+ },
480
+ {
481
+ "epoch": 0.2843700993204391,
482
+ "grad_norm": 2.3611645698547363,
483
+ "learning_rate": 4.8898526374622815e-05,
484
+ "loss": 1.3758,
485
+ "step": 340
486
+ },
487
+ {
488
+ "epoch": 0.2885520125457397,
489
+ "grad_norm": 2.2939200401306152,
490
+ "learning_rate": 4.886614080398594e-05,
491
+ "loss": 1.3727,
492
+ "step": 345
493
+ },
494
+ {
495
+ "epoch": 0.29273392577104024,
496
+ "grad_norm": 2.2382090091705322,
497
+ "learning_rate": 4.8833297046164594e-05,
498
+ "loss": 1.3412,
499
+ "step": 350
500
+ },
501
+ {
502
+ "epoch": 0.29691583899634083,
503
+ "grad_norm": 2.0420024394989014,
504
+ "learning_rate": 4.8799995731700155e-05,
505
+ "loss": 1.3378,
506
+ "step": 355
507
+ },
508
+ {
509
+ "epoch": 0.3010977522216414,
510
+ "grad_norm": 2.180814266204834,
511
+ "learning_rate": 4.8766237499918244e-05,
512
+ "loss": 1.3305,
513
+ "step": 360
514
+ },
515
+ {
516
+ "epoch": 0.30527966544694196,
517
+ "grad_norm": 2.641951560974121,
518
+ "learning_rate": 4.873202299891649e-05,
519
+ "loss": 1.3084,
520
+ "step": 365
521
+ },
522
+ {
523
+ "epoch": 0.30946157867224255,
524
+ "grad_norm": 2.421492099761963,
525
+ "learning_rate": 4.8697352885552077e-05,
526
+ "loss": 1.3321,
527
+ "step": 370
528
+ },
529
+ {
530
+ "epoch": 0.31364349189754315,
531
+ "grad_norm": 2.1777994632720947,
532
+ "learning_rate": 4.866222782542912e-05,
533
+ "loss": 1.3605,
534
+ "step": 375
535
+ },
536
+ {
537
+ "epoch": 0.3178254051228437,
538
+ "grad_norm": 2.097909450531006,
539
+ "learning_rate": 4.862664849288589e-05,
540
+ "loss": 1.3786,
541
+ "step": 380
542
+ },
543
+ {
544
+ "epoch": 0.3220073183481443,
545
+ "grad_norm": 2.3216776847839355,
546
+ "learning_rate": 4.8590615570981904e-05,
547
+ "loss": 1.3467,
548
+ "step": 385
549
+ },
550
+ {
551
+ "epoch": 0.32618923157344487,
552
+ "grad_norm": 2.199545383453369,
553
+ "learning_rate": 4.855412975148475e-05,
554
+ "loss": 1.3534,
555
+ "step": 390
556
+ },
557
+ {
558
+ "epoch": 0.3303711447987454,
559
+ "grad_norm": 2.4035983085632324,
560
+ "learning_rate": 4.851719173485686e-05,
561
+ "loss": 1.3545,
562
+ "step": 395
563
+ },
564
+ {
565
+ "epoch": 0.334553058024046,
566
+ "grad_norm": 2.130056142807007,
567
+ "learning_rate": 4.847980223024205e-05,
568
+ "loss": 1.3574,
569
+ "step": 400
570
+ },
571
+ {
572
+ "epoch": 0.3387349712493466,
573
+ "grad_norm": 2.2835471630096436,
574
+ "learning_rate": 4.8441961955451865e-05,
575
+ "loss": 1.2835,
576
+ "step": 405
577
+ },
578
+ {
579
+ "epoch": 0.34291688447464713,
580
+ "grad_norm": 2.129136085510254,
581
+ "learning_rate": 4.840367163695186e-05,
582
+ "loss": 1.3066,
583
+ "step": 410
584
+ },
585
+ {
586
+ "epoch": 0.3470987976999477,
587
+ "grad_norm": 2.199307680130005,
588
+ "learning_rate": 4.8364932009847614e-05,
589
+ "loss": 1.3891,
590
+ "step": 415
591
+ },
592
+ {
593
+ "epoch": 0.3512807109252483,
594
+ "grad_norm": 2.240877866744995,
595
+ "learning_rate": 4.8325743817870614e-05,
596
+ "loss": 1.3131,
597
+ "step": 420
598
+ },
599
+ {
600
+ "epoch": 0.35546262415054886,
601
+ "grad_norm": 2.4342916011810303,
602
+ "learning_rate": 4.8286107813364015e-05,
603
+ "loss": 1.2903,
604
+ "step": 425
605
+ },
606
+ {
607
+ "epoch": 0.35964453737584945,
608
+ "grad_norm": 2.323018789291382,
609
+ "learning_rate": 4.824602475726815e-05,
610
+ "loss": 1.3016,
611
+ "step": 430
612
+ },
613
+ {
614
+ "epoch": 0.36382645060115004,
615
+ "grad_norm": 2.2159998416900635,
616
+ "learning_rate": 4.820549541910595e-05,
617
+ "loss": 1.3312,
618
+ "step": 435
619
+ },
620
+ {
621
+ "epoch": 0.3680083638264506,
622
+ "grad_norm": 2.654232978820801,
623
+ "learning_rate": 4.8164520576968165e-05,
624
+ "loss": 1.3793,
625
+ "step": 440
626
+ },
627
+ {
628
+ "epoch": 0.3721902770517512,
629
+ "grad_norm": 2.3524513244628906,
630
+ "learning_rate": 4.8123101017498416e-05,
631
+ "loss": 1.3406,
632
+ "step": 445
633
+ },
634
+ {
635
+ "epoch": 0.37637219027705177,
636
+ "grad_norm": 2.3215346336364746,
637
+ "learning_rate": 4.8081237535878116e-05,
638
+ "loss": 1.3997,
639
+ "step": 450
640
+ },
641
+ {
642
+ "epoch": 0.3805541035023523,
643
+ "grad_norm": 2.0394012928009033,
644
+ "learning_rate": 4.803893093581117e-05,
645
+ "loss": 1.3084,
646
+ "step": 455
647
+ },
648
+ {
649
+ "epoch": 0.3847360167276529,
650
+ "grad_norm": 2.251598596572876,
651
+ "learning_rate": 4.799618202950857e-05,
652
+ "loss": 1.3396,
653
+ "step": 460
654
+ },
655
+ {
656
+ "epoch": 0.3889179299529535,
657
+ "grad_norm": 2.4240410327911377,
658
+ "learning_rate": 4.795299163767282e-05,
659
+ "loss": 1.359,
660
+ "step": 465
661
+ },
662
+ {
663
+ "epoch": 0.393099843178254,
664
+ "grad_norm": 2.2180769443511963,
665
+ "learning_rate": 4.790936058948211e-05,
666
+ "loss": 1.3575,
667
+ "step": 470
668
+ },
669
+ {
670
+ "epoch": 0.3972817564035546,
671
+ "grad_norm": 2.2782742977142334,
672
+ "learning_rate": 4.786528972257449e-05,
673
+ "loss": 1.2915,
674
+ "step": 475
675
+ },
676
+ {
677
+ "epoch": 0.4014636696288552,
678
+ "grad_norm": 2.2193851470947266,
679
+ "learning_rate": 4.7820779883031696e-05,
680
+ "loss": 1.3185,
681
+ "step": 480
682
+ },
683
+ {
684
+ "epoch": 0.40564558285415575,
685
+ "grad_norm": 2.2422704696655273,
686
+ "learning_rate": 4.7775831925363e-05,
687
+ "loss": 1.2638,
688
+ "step": 485
689
+ },
690
+ {
691
+ "epoch": 0.40982749607945634,
692
+ "grad_norm": 2.430154323577881,
693
+ "learning_rate": 4.773044671248872e-05,
694
+ "loss": 1.3616,
695
+ "step": 490
696
+ },
697
+ {
698
+ "epoch": 0.41400940930475694,
699
+ "grad_norm": 2.2115910053253174,
700
+ "learning_rate": 4.768462511572371e-05,
701
+ "loss": 1.35,
702
+ "step": 495
703
+ },
704
+ {
705
+ "epoch": 0.4181913225300575,
706
+ "grad_norm": 2.063385486602783,
707
+ "learning_rate": 4.763836801476061e-05,
708
+ "loss": 1.2797,
709
+ "step": 500
710
+ },
711
+ {
712
+ "epoch": 0.42237323575535807,
713
+ "grad_norm": 2.3508410453796387,
714
+ "learning_rate": 4.759167629765297e-05,
715
+ "loss": 1.3173,
716
+ "step": 505
717
+ },
718
+ {
719
+ "epoch": 0.42655514898065866,
720
+ "grad_norm": 2.3848047256469727,
721
+ "learning_rate": 4.7544550860798177e-05,
722
+ "loss": 1.2502,
723
+ "step": 510
724
+ },
725
+ {
726
+ "epoch": 0.43073706220595925,
727
+ "grad_norm": 2.227524757385254,
728
+ "learning_rate": 4.749699260892026e-05,
729
+ "loss": 1.3174,
730
+ "step": 515
731
+ },
732
+ {
733
+ "epoch": 0.4349189754312598,
734
+ "grad_norm": 2.2080676555633545,
735
+ "learning_rate": 4.744900245505253e-05,
736
+ "loss": 1.2567,
737
+ "step": 520
738
+ },
739
+ {
740
+ "epoch": 0.4391008886565604,
741
+ "grad_norm": 2.2167842388153076,
742
+ "learning_rate": 4.7400581320520055e-05,
743
+ "loss": 1.2906,
744
+ "step": 525
745
+ },
746
+ {
747
+ "epoch": 0.443282801881861,
748
+ "grad_norm": 2.4347431659698486,
749
+ "learning_rate": 4.735173013492193e-05,
750
+ "loss": 1.3109,
751
+ "step": 530
752
+ },
753
+ {
754
+ "epoch": 0.4474647151071615,
755
+ "grad_norm": 2.024902105331421,
756
+ "learning_rate": 4.73024498361135e-05,
757
+ "loss": 1.3265,
758
+ "step": 535
759
+ },
760
+ {
761
+ "epoch": 0.4516466283324621,
762
+ "grad_norm": 2.1333351135253906,
763
+ "learning_rate": 4.725274137018826e-05,
764
+ "loss": 1.3426,
765
+ "step": 540
766
+ },
767
+ {
768
+ "epoch": 0.4558285415577627,
769
+ "grad_norm": 2.400423049926758,
770
+ "learning_rate": 4.720260569145981e-05,
771
+ "loss": 1.2751,
772
+ "step": 545
773
+ },
774
+ {
775
+ "epoch": 0.46001045478306324,
776
+ "grad_norm": 2.5796918869018555,
777
+ "learning_rate": 4.715204376244343e-05,
778
+ "loss": 1.3453,
779
+ "step": 550
780
+ },
781
+ {
782
+ "epoch": 0.46419236800836383,
783
+ "grad_norm": 2.339895486831665,
784
+ "learning_rate": 4.7101056553837665e-05,
785
+ "loss": 1.2667,
786
+ "step": 555
787
+ },
788
+ {
789
+ "epoch": 0.4683742812336644,
790
+ "grad_norm": 2.273618221282959,
791
+ "learning_rate": 4.704964504450563e-05,
792
+ "loss": 1.3264,
793
+ "step": 560
794
+ },
795
+ {
796
+ "epoch": 0.47255619445896496,
797
+ "grad_norm": 1.9942044019699097,
798
+ "learning_rate": 4.69978102214563e-05,
799
+ "loss": 1.2907,
800
+ "step": 565
801
+ },
802
+ {
803
+ "epoch": 0.47673810768426556,
804
+ "grad_norm": 2.1848580837249756,
805
+ "learning_rate": 4.694555307982551e-05,
806
+ "loss": 1.2921,
807
+ "step": 570
808
+ },
809
+ {
810
+ "epoch": 0.48092002090956615,
811
+ "grad_norm": 2.2914299964904785,
812
+ "learning_rate": 4.689287462285681e-05,
813
+ "loss": 1.3612,
814
+ "step": 575
815
+ },
816
+ {
817
+ "epoch": 0.4851019341348667,
818
+ "grad_norm": 2.1165218353271484,
819
+ "learning_rate": 4.6839775861882306e-05,
820
+ "loss": 1.3419,
821
+ "step": 580
822
+ },
823
+ {
824
+ "epoch": 0.4892838473601673,
825
+ "grad_norm": 2.003962516784668,
826
+ "learning_rate": 4.678625781630315e-05,
827
+ "loss": 1.388,
828
+ "step": 585
829
+ },
830
+ {
831
+ "epoch": 0.49346576058546787,
832
+ "grad_norm": 2.366339683532715,
833
+ "learning_rate": 4.673232151357004e-05,
834
+ "loss": 1.2895,
835
+ "step": 590
836
+ },
837
+ {
838
+ "epoch": 0.4976476738107684,
839
+ "grad_norm": 2.4352052211761475,
840
+ "learning_rate": 4.667796798916343e-05,
841
+ "loss": 1.3457,
842
+ "step": 595
843
+ },
844
+ {
845
+ "epoch": 0.501829587036069,
846
+ "grad_norm": 2.4221272468566895,
847
+ "learning_rate": 4.662319828657371e-05,
848
+ "loss": 1.3298,
849
+ "step": 600
850
+ },
851
+ {
852
+ "epoch": 0.5060115002613695,
853
+ "grad_norm": 2.2551608085632324,
854
+ "learning_rate": 4.6568013457281126e-05,
855
+ "loss": 1.2192,
856
+ "step": 605
857
+ },
858
+ {
859
+ "epoch": 0.5101934134866701,
860
+ "grad_norm": 2.4560656547546387,
861
+ "learning_rate": 4.651241456073563e-05,
862
+ "loss": 1.3086,
863
+ "step": 610
864
+ },
865
+ {
866
+ "epoch": 0.5143753267119707,
867
+ "grad_norm": 2.266500473022461,
868
+ "learning_rate": 4.645640266433651e-05,
869
+ "loss": 1.2608,
870
+ "step": 615
871
+ },
872
+ {
873
+ "epoch": 0.5185572399372713,
874
+ "grad_norm": 2.175009250640869,
875
+ "learning_rate": 4.639997884341192e-05,
876
+ "loss": 1.3459,
877
+ "step": 620
878
+ },
879
+ {
880
+ "epoch": 0.5227391531625719,
881
+ "grad_norm": 2.329770803451538,
882
+ "learning_rate": 4.634314418119823e-05,
883
+ "loss": 1.2894,
884
+ "step": 625
885
+ },
886
+ {
887
+ "epoch": 0.5269210663878725,
888
+ "grad_norm": 2.545764446258545,
889
+ "learning_rate": 4.628589976881923e-05,
890
+ "loss": 1.3213,
891
+ "step": 630
892
+ },
893
+ {
894
+ "epoch": 0.531102979613173,
895
+ "grad_norm": 2.2759323120117188,
896
+ "learning_rate": 4.622824670526516e-05,
897
+ "loss": 1.3362,
898
+ "step": 635
899
+ },
900
+ {
901
+ "epoch": 0.5352848928384736,
902
+ "grad_norm": 2.341280221939087,
903
+ "learning_rate": 4.617018609737166e-05,
904
+ "loss": 1.24,
905
+ "step": 640
906
+ },
907
+ {
908
+ "epoch": 0.5394668060637742,
909
+ "grad_norm": 2.353301763534546,
910
+ "learning_rate": 4.6111719059798466e-05,
911
+ "loss": 1.3643,
912
+ "step": 645
913
+ },
914
+ {
915
+ "epoch": 0.5436487192890748,
916
+ "grad_norm": 2.223389148712158,
917
+ "learning_rate": 4.605284671500805e-05,
918
+ "loss": 1.3569,
919
+ "step": 650
920
+ },
921
+ {
922
+ "epoch": 0.5478306325143754,
923
+ "grad_norm": 2.2959463596343994,
924
+ "learning_rate": 4.599357019324405e-05,
925
+ "loss": 1.3581,
926
+ "step": 655
927
+ },
928
+ {
929
+ "epoch": 0.552012545739676,
930
+ "grad_norm": 2.1995513439178467,
931
+ "learning_rate": 4.593389063250958e-05,
932
+ "loss": 1.2962,
933
+ "step": 660
934
+ },
935
+ {
936
+ "epoch": 0.5561944589649764,
937
+ "grad_norm": 1.9802824258804321,
938
+ "learning_rate": 4.5873809178545396e-05,
939
+ "loss": 1.3121,
940
+ "step": 665
941
+ },
942
+ {
943
+ "epoch": 0.560376372190277,
944
+ "grad_norm": 2.2561113834381104,
945
+ "learning_rate": 4.581332698480786e-05,
946
+ "loss": 1.2395,
947
+ "step": 670
948
+ },
949
+ {
950
+ "epoch": 0.5645582854155776,
951
+ "grad_norm": 2.1277997493743896,
952
+ "learning_rate": 4.5752445212446836e-05,
953
+ "loss": 1.2782,
954
+ "step": 675
955
+ },
956
+ {
957
+ "epoch": 0.5687401986408782,
958
+ "grad_norm": 2.1406946182250977,
959
+ "learning_rate": 4.569116503028339e-05,
960
+ "loss": 1.3248,
961
+ "step": 680
962
+ },
963
+ {
964
+ "epoch": 0.5729221118661788,
965
+ "grad_norm": 2.190600872039795,
966
+ "learning_rate": 4.5629487614787306e-05,
967
+ "loss": 1.2975,
968
+ "step": 685
969
+ },
970
+ {
971
+ "epoch": 0.5771040250914794,
972
+ "grad_norm": 2.1068780422210693,
973
+ "learning_rate": 4.556741415005459e-05,
974
+ "loss": 1.3191,
975
+ "step": 690
976
+ },
977
+ {
978
+ "epoch": 0.5812859383167799,
979
+ "grad_norm": 2.3496341705322266,
980
+ "learning_rate": 4.5504945827784634e-05,
981
+ "loss": 1.2827,
982
+ "step": 695
983
+ },
984
+ {
985
+ "epoch": 0.5854678515420805,
986
+ "grad_norm": 2.33406400680542,
987
+ "learning_rate": 4.544208384725742e-05,
988
+ "loss": 1.2649,
989
+ "step": 700
990
+ },
991
+ {
992
+ "epoch": 0.5896497647673811,
993
+ "grad_norm": 2.365461587905884,
994
+ "learning_rate": 4.5378829415310465e-05,
995
+ "loss": 1.3121,
996
+ "step": 705
997
+ },
998
+ {
999
+ "epoch": 0.5938316779926817,
1000
+ "grad_norm": 2.186875343322754,
1001
+ "learning_rate": 4.531518374631564e-05,
1002
+ "loss": 1.3113,
1003
+ "step": 710
1004
+ },
1005
+ {
1006
+ "epoch": 0.5980135912179823,
1007
+ "grad_norm": 2.1798176765441895,
1008
+ "learning_rate": 4.525114806215584e-05,
1009
+ "loss": 1.3446,
1010
+ "step": 715
1011
+ },
1012
+ {
1013
+ "epoch": 0.6021955044432828,
1014
+ "grad_norm": 1.9714925289154053,
1015
+ "learning_rate": 4.518672359220161e-05,
1016
+ "loss": 1.2979,
1017
+ "step": 720
1018
+ },
1019
+ {
1020
+ "epoch": 0.6063774176685833,
1021
+ "grad_norm": 2.1850643157958984,
1022
+ "learning_rate": 4.5121911573287446e-05,
1023
+ "loss": 1.2412,
1024
+ "step": 725
1025
+ },
1026
+ {
1027
+ "epoch": 0.6105593308938839,
1028
+ "grad_norm": 2.0708961486816406,
1029
+ "learning_rate": 4.505671324968811e-05,
1030
+ "loss": 1.2559,
1031
+ "step": 730
1032
+ },
1033
+ {
1034
+ "epoch": 0.6147412441191845,
1035
+ "grad_norm": 2.461225986480713,
1036
+ "learning_rate": 4.49911298730947e-05,
1037
+ "loss": 1.3696,
1038
+ "step": 735
1039
+ },
1040
+ {
1041
+ "epoch": 0.6189231573444851,
1042
+ "grad_norm": 2.2155981063842773,
1043
+ "learning_rate": 4.492516270259066e-05,
1044
+ "loss": 1.3459,
1045
+ "step": 740
1046
+ },
1047
+ {
1048
+ "epoch": 0.6231050705697857,
1049
+ "grad_norm": 2.0899972915649414,
1050
+ "learning_rate": 4.48588130046276e-05,
1051
+ "loss": 1.2885,
1052
+ "step": 745
1053
+ },
1054
+ {
1055
+ "epoch": 0.6272869837950863,
1056
+ "grad_norm": 2.2798969745635986,
1057
+ "learning_rate": 4.479208205300094e-05,
1058
+ "loss": 1.299,
1059
+ "step": 750
1060
+ },
1061
+ {
1062
+ "epoch": 0.6314688970203868,
1063
+ "grad_norm": 2.4617793560028076,
1064
+ "learning_rate": 4.472497112882552e-05,
1065
+ "loss": 1.2973,
1066
+ "step": 755
1067
+ },
1068
+ {
1069
+ "epoch": 0.6356508102456874,
1070
+ "grad_norm": 2.20271635055542,
1071
+ "learning_rate": 4.465748152051096e-05,
1072
+ "loss": 1.231,
1073
+ "step": 760
1074
+ },
1075
+ {
1076
+ "epoch": 0.639832723470988,
1077
+ "grad_norm": 1.990775227546692,
1078
+ "learning_rate": 4.458961452373692e-05,
1079
+ "loss": 1.2594,
1080
+ "step": 765
1081
+ },
1082
+ {
1083
+ "epoch": 0.6440146366962886,
1084
+ "grad_norm": 2.2898495197296143,
1085
+ "learning_rate": 4.4521371441428284e-05,
1086
+ "loss": 1.2721,
1087
+ "step": 770
1088
+ },
1089
+ {
1090
+ "epoch": 0.6481965499215891,
1091
+ "grad_norm": 2.282844066619873,
1092
+ "learning_rate": 4.445275358373006e-05,
1093
+ "loss": 1.2742,
1094
+ "step": 775
1095
+ },
1096
+ {
1097
+ "epoch": 0.6523784631468897,
1098
+ "grad_norm": 2.3958911895751953,
1099
+ "learning_rate": 4.438376226798231e-05,
1100
+ "loss": 1.2897,
1101
+ "step": 780
1102
+ },
1103
+ {
1104
+ "epoch": 0.6565603763721902,
1105
+ "grad_norm": 2.2609405517578125,
1106
+ "learning_rate": 4.43143988186948e-05,
1107
+ "loss": 1.2891,
1108
+ "step": 785
1109
+ },
1110
+ {
1111
+ "epoch": 0.6607422895974908,
1112
+ "grad_norm": 2.416956663131714,
1113
+ "learning_rate": 4.42446645675216e-05,
1114
+ "loss": 1.3711,
1115
+ "step": 790
1116
+ },
1117
+ {
1118
+ "epoch": 0.6649242028227914,
1119
+ "grad_norm": 2.280547857284546,
1120
+ "learning_rate": 4.4174560853235505e-05,
1121
+ "loss": 1.3413,
1122
+ "step": 795
1123
+ },
1124
+ {
1125
+ "epoch": 0.669106116048092,
1126
+ "grad_norm": 2.055990695953369,
1127
+ "learning_rate": 4.410408902170235e-05,
1128
+ "loss": 1.2829,
1129
+ "step": 800
1130
+ },
1131
+ {
1132
+ "epoch": 0.6732880292733926,
1133
+ "grad_norm": 2.1436378955841064,
1134
+ "learning_rate": 4.403325042585518e-05,
1135
+ "loss": 1.2223,
1136
+ "step": 805
1137
+ },
1138
+ {
1139
+ "epoch": 0.6774699424986932,
1140
+ "grad_norm": 2.1957428455352783,
1141
+ "learning_rate": 4.396204642566821e-05,
1142
+ "loss": 1.2923,
1143
+ "step": 810
1144
+ },
1145
+ {
1146
+ "epoch": 0.6816518557239938,
1147
+ "grad_norm": 2.1543030738830566,
1148
+ "learning_rate": 4.389047838813082e-05,
1149
+ "loss": 1.2296,
1150
+ "step": 815
1151
+ },
1152
+ {
1153
+ "epoch": 0.6858337689492943,
1154
+ "grad_norm": 2.1455068588256836,
1155
+ "learning_rate": 4.3818547687221204e-05,
1156
+ "loss": 1.2913,
1157
+ "step": 820
1158
+ },
1159
+ {
1160
+ "epoch": 0.6900156821745949,
1161
+ "grad_norm": 2.2354376316070557,
1162
+ "learning_rate": 4.374625570388008e-05,
1163
+ "loss": 1.2647,
1164
+ "step": 825
1165
+ },
1166
+ {
1167
+ "epoch": 0.6941975953998955,
1168
+ "grad_norm": 2.2896392345428467,
1169
+ "learning_rate": 4.367360382598413e-05,
1170
+ "loss": 1.329,
1171
+ "step": 830
1172
+ },
1173
+ {
1174
+ "epoch": 0.698379508625196,
1175
+ "grad_norm": 2.0567495822906494,
1176
+ "learning_rate": 4.360059344831936e-05,
1177
+ "loss": 1.2306,
1178
+ "step": 835
1179
+ },
1180
+ {
1181
+ "epoch": 0.7025614218504966,
1182
+ "grad_norm": 2.0631346702575684,
1183
+ "learning_rate": 4.352722597255434e-05,
1184
+ "loss": 1.2788,
1185
+ "step": 840
1186
+ },
1187
+ {
1188
+ "epoch": 0.7067433350757972,
1189
+ "grad_norm": 2.3736605644226074,
1190
+ "learning_rate": 4.345350280721328e-05,
1191
+ "loss": 1.3236,
1192
+ "step": 845
1193
+ },
1194
+ {
1195
+ "epoch": 0.7109252483010977,
1196
+ "grad_norm": 2.308960437774658,
1197
+ "learning_rate": 4.337942536764901e-05,
1198
+ "loss": 1.2993,
1199
+ "step": 850
1200
+ },
1201
+ {
1202
+ "epoch": 0.7151071615263983,
1203
+ "grad_norm": 2.230039358139038,
1204
+ "learning_rate": 4.330499507601575e-05,
1205
+ "loss": 1.2544,
1206
+ "step": 855
1207
+ },
1208
+ {
1209
+ "epoch": 0.7192890747516989,
1210
+ "grad_norm": 2.1853036880493164,
1211
+ "learning_rate": 4.3230213361241894e-05,
1212
+ "loss": 1.2829,
1213
+ "step": 860
1214
+ },
1215
+ {
1216
+ "epoch": 0.7234709879769995,
1217
+ "grad_norm": 2.2859535217285156,
1218
+ "learning_rate": 4.3155081659002506e-05,
1219
+ "loss": 1.3128,
1220
+ "step": 865
1221
+ },
1222
+ {
1223
+ "epoch": 0.7276529012023001,
1224
+ "grad_norm": 1.9847872257232666,
1225
+ "learning_rate": 4.3079601411691775e-05,
1226
+ "loss": 1.3126,
1227
+ "step": 870
1228
+ },
1229
+ {
1230
+ "epoch": 0.7318348144276007,
1231
+ "grad_norm": 2.3204383850097656,
1232
+ "learning_rate": 4.3003774068395355e-05,
1233
+ "loss": 1.3275,
1234
+ "step": 875
1235
+ },
1236
+ {
1237
+ "epoch": 0.7360167276529012,
1238
+ "grad_norm": 2.1831467151641846,
1239
+ "learning_rate": 4.292760108486251e-05,
1240
+ "loss": 1.3232,
1241
+ "step": 880
1242
+ },
1243
+ {
1244
+ "epoch": 0.7401986408782018,
1245
+ "grad_norm": 2.241342544555664,
1246
+ "learning_rate": 4.2851083923478186e-05,
1247
+ "loss": 1.2484,
1248
+ "step": 885
1249
+ },
1250
+ {
1251
+ "epoch": 0.7443805541035023,
1252
+ "grad_norm": 2.335024118423462,
1253
+ "learning_rate": 4.27742240532349e-05,
1254
+ "loss": 1.2466,
1255
+ "step": 890
1256
+ },
1257
+ {
1258
+ "epoch": 0.7485624673288029,
1259
+ "grad_norm": 2.592941999435425,
1260
+ "learning_rate": 4.269702294970461e-05,
1261
+ "loss": 1.3111,
1262
+ "step": 895
1263
+ },
1264
+ {
1265
+ "epoch": 0.7527443805541035,
1266
+ "grad_norm": 2.1964266300201416,
1267
+ "learning_rate": 4.26194820950103e-05,
1268
+ "loss": 1.3172,
1269
+ "step": 900
1270
+ },
1271
+ {
1272
+ "epoch": 0.7569262937794041,
1273
+ "grad_norm": 2.253929853439331,
1274
+ "learning_rate": 4.25416029777976e-05,
1275
+ "loss": 1.2977,
1276
+ "step": 905
1277
+ },
1278
+ {
1279
+ "epoch": 0.7611082070047046,
1280
+ "grad_norm": 2.3653616905212402,
1281
+ "learning_rate": 4.246338709320615e-05,
1282
+ "loss": 1.3397,
1283
+ "step": 910
1284
+ },
1285
+ {
1286
+ "epoch": 0.7652901202300052,
1287
+ "grad_norm": 1.9790107011795044,
1288
+ "learning_rate": 4.238483594284094e-05,
1289
+ "loss": 1.299,
1290
+ "step": 915
1291
+ },
1292
+ {
1293
+ "epoch": 0.7694720334553058,
1294
+ "grad_norm": 2.177802324295044,
1295
+ "learning_rate": 4.230595103474345e-05,
1296
+ "loss": 1.2083,
1297
+ "step": 920
1298
+ },
1299
+ {
1300
+ "epoch": 0.7736539466806064,
1301
+ "grad_norm": 2.267620325088501,
1302
+ "learning_rate": 4.222673388336272e-05,
1303
+ "loss": 1.2712,
1304
+ "step": 925
1305
+ },
1306
+ {
1307
+ "epoch": 0.777835859905907,
1308
+ "grad_norm": 2.323590040206909,
1309
+ "learning_rate": 4.214718600952627e-05,
1310
+ "loss": 1.2944,
1311
+ "step": 930
1312
+ },
1313
+ {
1314
+ "epoch": 0.7820177731312076,
1315
+ "grad_norm": 2.2184839248657227,
1316
+ "learning_rate": 4.2067308940410874e-05,
1317
+ "loss": 1.3033,
1318
+ "step": 935
1319
+ },
1320
+ {
1321
+ "epoch": 0.786199686356508,
1322
+ "grad_norm": 2.056938648223877,
1323
+ "learning_rate": 4.1987104209513295e-05,
1324
+ "loss": 1.2563,
1325
+ "step": 940
1326
+ },
1327
+ {
1328
+ "epoch": 0.7903815995818086,
1329
+ "grad_norm": 2.0813541412353516,
1330
+ "learning_rate": 4.1906573356620795e-05,
1331
+ "loss": 1.3169,
1332
+ "step": 945
1333
+ },
1334
+ {
1335
+ "epoch": 0.7945635128071092,
1336
+ "grad_norm": 2.2418994903564453,
1337
+ "learning_rate": 4.182571792778163e-05,
1338
+ "loss": 1.3366,
1339
+ "step": 950
1340
+ },
1341
+ {
1342
+ "epoch": 0.7987454260324098,
1343
+ "grad_norm": 2.1746504306793213,
1344
+ "learning_rate": 4.1744539475275276e-05,
1345
+ "loss": 1.2695,
1346
+ "step": 955
1347
+ },
1348
+ {
1349
+ "epoch": 0.8029273392577104,
1350
+ "grad_norm": 2.385899066925049,
1351
+ "learning_rate": 4.1663039557582725e-05,
1352
+ "loss": 1.275,
1353
+ "step": 960
1354
+ },
1355
+ {
1356
+ "epoch": 0.807109252483011,
1357
+ "grad_norm": 2.4620730876922607,
1358
+ "learning_rate": 4.158121973935653e-05,
1359
+ "loss": 1.3441,
1360
+ "step": 965
1361
+ },
1362
+ {
1363
+ "epoch": 0.8112911657083115,
1364
+ "grad_norm": 2.090711832046509,
1365
+ "learning_rate": 4.149908159139073e-05,
1366
+ "loss": 1.2754,
1367
+ "step": 970
1368
+ },
1369
+ {
1370
+ "epoch": 0.8154730789336121,
1371
+ "grad_norm": 2.367952346801758,
1372
+ "learning_rate": 4.141662669059076e-05,
1373
+ "loss": 1.2821,
1374
+ "step": 975
1375
+ },
1376
+ {
1377
+ "epoch": 0.8196549921589127,
1378
+ "grad_norm": 2.406475782394409,
1379
+ "learning_rate": 4.133385661994312e-05,
1380
+ "loss": 1.2917,
1381
+ "step": 980
1382
+ },
1383
+ {
1384
+ "epoch": 0.8238369053842133,
1385
+ "grad_norm": 2.0637738704681396,
1386
+ "learning_rate": 4.125077296848501e-05,
1387
+ "loss": 1.2582,
1388
+ "step": 985
1389
+ },
1390
+ {
1391
+ "epoch": 0.8280188186095139,
1392
+ "grad_norm": 2.3852176666259766,
1393
+ "learning_rate": 4.1167377331273825e-05,
1394
+ "loss": 1.2278,
1395
+ "step": 990
1396
+ },
1397
+ {
1398
+ "epoch": 0.8322007318348145,
1399
+ "grad_norm": 2.0237960815429688,
1400
+ "learning_rate": 4.1083671309356526e-05,
1401
+ "loss": 1.3461,
1402
+ "step": 995
1403
+ },
1404
+ {
1405
+ "epoch": 0.836382645060115,
1406
+ "grad_norm": 2.458031177520752,
1407
+ "learning_rate": 4.0999656509738904e-05,
1408
+ "loss": 1.2484,
1409
+ "step": 1000
1410
+ }
1411
+ ],
1412
+ "logging_steps": 5,
1413
+ "max_steps": 3585,
1414
+ "num_input_tokens_seen": 0,
1415
+ "num_train_epochs": 3,
1416
+ "save_steps": 100,
1417
+ "stateful_callbacks": {
1418
+ "TrainerControl": {
1419
+ "args": {
1420
+ "should_epoch_stop": false,
1421
+ "should_evaluate": false,
1422
+ "should_log": false,
1423
+ "should_save": true,
1424
+ "should_training_stop": false
1425
+ },
1426
+ "attributes": {}
1427
+ }
1428
+ },
1429
+ "total_flos": 1.2932056998425395e+18,
1430
+ "train_batch_size": 2,
1431
+ "trial_name": null,
1432
+ "trial_params": null
1433
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ad489884f87ec96cc1e49b25622db5c6c3c1eafcad1be5306265a7460b6619a
3
+ size 5304
checkpoint-1100/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: mistralai/Mistral-7B-Instruct-v0.3
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1
checkpoint-1100/adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "v_proj",
24
+ "q_proj"
25
+ ],
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
checkpoint-1100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68ae2337969e49f4d260bc6f2c040def77384c87ba80728caae17d7611cf1d6b
3
+ size 13648432
checkpoint-1100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1b11dcfa1e30bf17cb557d11e79c056b3a788d0a65364c69856dbdcb564c1a0
3
+ size 27370618
checkpoint-1100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
3
+ size 14244
checkpoint-1100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bafd80ca1b4b768a36222bdb98758dc12047628d69993444a379c4cc3ee39ab9
3
+ size 1064
checkpoint-1100/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-1100/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1100/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
3
+ size 587404
checkpoint-1100/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1100/trainer_state.json ADDED
@@ -0,0 +1,1573 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9200209095661265,
5
+ "eval_steps": 500,
6
+ "global_step": 1100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.004181913225300575,
13
+ "grad_norm": 3.3295910358428955,
14
+ "learning_rate": 4.9999760022374266e-05,
15
+ "loss": 1.8032,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.00836382645060115,
20
+ "grad_norm": 2.002122163772583,
21
+ "learning_rate": 4.999904009410418e-05,
22
+ "loss": 1.5943,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.012545739675901725,
27
+ "grad_norm": 1.9293591976165771,
28
+ "learning_rate": 4.9997840229011085e-05,
29
+ "loss": 1.5378,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.0167276529012023,
34
+ "grad_norm": 1.936969518661499,
35
+ "learning_rate": 4.999616045013025e-05,
36
+ "loss": 1.5856,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.020909566126502875,
41
+ "grad_norm": 1.8237314224243164,
42
+ "learning_rate": 4.9994000789710415e-05,
43
+ "loss": 1.5066,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.02509147935180345,
48
+ "grad_norm": 1.8101543188095093,
49
+ "learning_rate": 4.9991361289213203e-05,
50
+ "loss": 1.4499,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.029273392577104027,
55
+ "grad_norm": 1.647802710533142,
56
+ "learning_rate": 4.998824199931228e-05,
57
+ "loss": 1.4694,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.0334553058024046,
62
+ "grad_norm": 1.7077428102493286,
63
+ "learning_rate": 4.998464297989245e-05,
64
+ "loss": 1.4945,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.03763721902770518,
69
+ "grad_norm": 1.7119005918502808,
70
+ "learning_rate": 4.998056430004844e-05,
71
+ "loss": 1.4257,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 0.04181913225300575,
76
+ "grad_norm": 2.0229578018188477,
77
+ "learning_rate": 4.997600603808359e-05,
78
+ "loss": 1.4713,
79
+ "step": 50
80
+ },
81
+ {
82
+ "epoch": 0.04600104547830632,
83
+ "grad_norm": 1.885362148284912,
84
+ "learning_rate": 4.997096828150838e-05,
85
+ "loss": 1.5145,
86
+ "step": 55
87
+ },
88
+ {
89
+ "epoch": 0.0501829587036069,
90
+ "grad_norm": 2.0357539653778076,
91
+ "learning_rate": 4.9965451127038714e-05,
92
+ "loss": 1.3811,
93
+ "step": 60
94
+ },
95
+ {
96
+ "epoch": 0.054364871928907474,
97
+ "grad_norm": 1.9780900478363037,
98
+ "learning_rate": 4.9959454680594086e-05,
99
+ "loss": 1.412,
100
+ "step": 65
101
+ },
102
+ {
103
+ "epoch": 0.05854678515420805,
104
+ "grad_norm": 1.6695700883865356,
105
+ "learning_rate": 4.995297905729554e-05,
106
+ "loss": 1.4093,
107
+ "step": 70
108
+ },
109
+ {
110
+ "epoch": 0.06272869837950862,
111
+ "grad_norm": 1.9424223899841309,
112
+ "learning_rate": 4.994602438146344e-05,
113
+ "loss": 1.4615,
114
+ "step": 75
115
+ },
116
+ {
117
+ "epoch": 0.0669106116048092,
118
+ "grad_norm": 1.9082552194595337,
119
+ "learning_rate": 4.9938590786615126e-05,
120
+ "loss": 1.4169,
121
+ "step": 80
122
+ },
123
+ {
124
+ "epoch": 0.07109252483010978,
125
+ "grad_norm": 1.8944685459136963,
126
+ "learning_rate": 4.993067841546231e-05,
127
+ "loss": 1.3064,
128
+ "step": 85
129
+ },
130
+ {
131
+ "epoch": 0.07527443805541036,
132
+ "grad_norm": 1.9688103199005127,
133
+ "learning_rate": 4.992228741990834e-05,
134
+ "loss": 1.3778,
135
+ "step": 90
136
+ },
137
+ {
138
+ "epoch": 0.07945635128071092,
139
+ "grad_norm": 2.0939650535583496,
140
+ "learning_rate": 4.991341796104534e-05,
141
+ "loss": 1.3905,
142
+ "step": 95
143
+ },
144
+ {
145
+ "epoch": 0.0836382645060115,
146
+ "grad_norm": 1.868986964225769,
147
+ "learning_rate": 4.9904070209151015e-05,
148
+ "loss": 1.3723,
149
+ "step": 100
150
+ },
151
+ {
152
+ "epoch": 0.08782017773131208,
153
+ "grad_norm": 2.069476842880249,
154
+ "learning_rate": 4.989424434368549e-05,
155
+ "loss": 1.4046,
156
+ "step": 105
157
+ },
158
+ {
159
+ "epoch": 0.09200209095661264,
160
+ "grad_norm": 2.1500356197357178,
161
+ "learning_rate": 4.988394055328779e-05,
162
+ "loss": 1.4319,
163
+ "step": 110
164
+ },
165
+ {
166
+ "epoch": 0.09618400418191322,
167
+ "grad_norm": 1.9786696434020996,
168
+ "learning_rate": 4.987315903577223e-05,
169
+ "loss": 1.4203,
170
+ "step": 115
171
+ },
172
+ {
173
+ "epoch": 0.1003659174072138,
174
+ "grad_norm": 2.035172939300537,
175
+ "learning_rate": 4.986189999812468e-05,
176
+ "loss": 1.4046,
177
+ "step": 120
178
+ },
179
+ {
180
+ "epoch": 0.10454783063251437,
181
+ "grad_norm": 1.9887930154800415,
182
+ "learning_rate": 4.985016365649848e-05,
183
+ "loss": 1.3836,
184
+ "step": 125
185
+ },
186
+ {
187
+ "epoch": 0.10872974385781495,
188
+ "grad_norm": 1.8339554071426392,
189
+ "learning_rate": 4.983795023621041e-05,
190
+ "loss": 1.3468,
191
+ "step": 130
192
+ },
193
+ {
194
+ "epoch": 0.11291165708311553,
195
+ "grad_norm": 2.06648588180542,
196
+ "learning_rate": 4.982525997173625e-05,
197
+ "loss": 1.3481,
198
+ "step": 135
199
+ },
200
+ {
201
+ "epoch": 0.1170935703084161,
202
+ "grad_norm": 2.1179702281951904,
203
+ "learning_rate": 4.9812093106706376e-05,
204
+ "loss": 1.4475,
205
+ "step": 140
206
+ },
207
+ {
208
+ "epoch": 0.12127548353371667,
209
+ "grad_norm": 2.1708436012268066,
210
+ "learning_rate": 4.979844989390104e-05,
211
+ "loss": 1.3662,
212
+ "step": 145
213
+ },
214
+ {
215
+ "epoch": 0.12545739675901724,
216
+ "grad_norm": 2.0533807277679443,
217
+ "learning_rate": 4.978433059524548e-05,
218
+ "loss": 1.3856,
219
+ "step": 150
220
+ },
221
+ {
222
+ "epoch": 0.12963930998431783,
223
+ "grad_norm": 1.9418696165084839,
224
+ "learning_rate": 4.976973548180498e-05,
225
+ "loss": 1.357,
226
+ "step": 155
227
+ },
228
+ {
229
+ "epoch": 0.1338212232096184,
230
+ "grad_norm": 2.1071596145629883,
231
+ "learning_rate": 4.975466483377959e-05,
232
+ "loss": 1.3613,
233
+ "step": 160
234
+ },
235
+ {
236
+ "epoch": 0.138003136434919,
237
+ "grad_norm": 2.3704464435577393,
238
+ "learning_rate": 4.9739118940498766e-05,
239
+ "loss": 1.3985,
240
+ "step": 165
241
+ },
242
+ {
243
+ "epoch": 0.14218504966021955,
244
+ "grad_norm": 2.1726176738739014,
245
+ "learning_rate": 4.9723098100415844e-05,
246
+ "loss": 1.4361,
247
+ "step": 170
248
+ },
249
+ {
250
+ "epoch": 0.14636696288552012,
251
+ "grad_norm": 2.608517646789551,
252
+ "learning_rate": 4.970660262110227e-05,
253
+ "loss": 1.3565,
254
+ "step": 175
255
+ },
256
+ {
257
+ "epoch": 0.1505488761108207,
258
+ "grad_norm": 2.185563564300537,
259
+ "learning_rate": 4.968963281924173e-05,
260
+ "loss": 1.2957,
261
+ "step": 180
262
+ },
263
+ {
264
+ "epoch": 0.15473078933612128,
265
+ "grad_norm": 1.9632441997528076,
266
+ "learning_rate": 4.967218902062403e-05,
267
+ "loss": 1.2833,
268
+ "step": 185
269
+ },
270
+ {
271
+ "epoch": 0.15891270256142184,
272
+ "grad_norm": 2.303462028503418,
273
+ "learning_rate": 4.96542715601389e-05,
274
+ "loss": 1.3238,
275
+ "step": 190
276
+ },
277
+ {
278
+ "epoch": 0.16309461578672244,
279
+ "grad_norm": 2.3211796283721924,
280
+ "learning_rate": 4.9635880781769495e-05,
281
+ "loss": 1.3848,
282
+ "step": 195
283
+ },
284
+ {
285
+ "epoch": 0.167276529012023,
286
+ "grad_norm": 2.0993776321411133,
287
+ "learning_rate": 4.961701703858584e-05,
288
+ "loss": 1.3997,
289
+ "step": 200
290
+ },
291
+ {
292
+ "epoch": 0.17145844223732357,
293
+ "grad_norm": 2.210942506790161,
294
+ "learning_rate": 4.9597680692738056e-05,
295
+ "loss": 1.334,
296
+ "step": 205
297
+ },
298
+ {
299
+ "epoch": 0.17564035546262416,
300
+ "grad_norm": 2.1277225017547607,
301
+ "learning_rate": 4.957787211544935e-05,
302
+ "loss": 1.3338,
303
+ "step": 210
304
+ },
305
+ {
306
+ "epoch": 0.17982226868792472,
307
+ "grad_norm": 2.287722587585449,
308
+ "learning_rate": 4.9557591687008966e-05,
309
+ "loss": 1.3621,
310
+ "step": 215
311
+ },
312
+ {
313
+ "epoch": 0.1840041819132253,
314
+ "grad_norm": 2.113842010498047,
315
+ "learning_rate": 4.9536839796764825e-05,
316
+ "loss": 1.3808,
317
+ "step": 220
318
+ },
319
+ {
320
+ "epoch": 0.18818609513852588,
321
+ "grad_norm": 2.0642735958099365,
322
+ "learning_rate": 4.951561684311608e-05,
323
+ "loss": 1.3429,
324
+ "step": 225
325
+ },
326
+ {
327
+ "epoch": 0.19236800836382645,
328
+ "grad_norm": 2.2026379108428955,
329
+ "learning_rate": 4.9493923233505435e-05,
330
+ "loss": 1.2855,
331
+ "step": 230
332
+ },
333
+ {
334
+ "epoch": 0.196549921589127,
335
+ "grad_norm": 2.2843308448791504,
336
+ "learning_rate": 4.947175938441138e-05,
337
+ "loss": 1.3432,
338
+ "step": 235
339
+ },
340
+ {
341
+ "epoch": 0.2007318348144276,
342
+ "grad_norm": 2.126749038696289,
343
+ "learning_rate": 4.9449125721340145e-05,
344
+ "loss": 1.3753,
345
+ "step": 240
346
+ },
347
+ {
348
+ "epoch": 0.20491374803972817,
349
+ "grad_norm": 2.514599561691284,
350
+ "learning_rate": 4.942602267881755e-05,
351
+ "loss": 1.3101,
352
+ "step": 245
353
+ },
354
+ {
355
+ "epoch": 0.20909566126502874,
356
+ "grad_norm": 2.2043542861938477,
357
+ "learning_rate": 4.940245070038064e-05,
358
+ "loss": 1.3395,
359
+ "step": 250
360
+ },
361
+ {
362
+ "epoch": 0.21327757449032933,
363
+ "grad_norm": 2.152742862701416,
364
+ "learning_rate": 4.937841023856923e-05,
365
+ "loss": 1.3188,
366
+ "step": 255
367
+ },
368
+ {
369
+ "epoch": 0.2174594877156299,
370
+ "grad_norm": 2.221057415008545,
371
+ "learning_rate": 4.935390175491716e-05,
372
+ "loss": 1.4205,
373
+ "step": 260
374
+ },
375
+ {
376
+ "epoch": 0.2216414009409305,
377
+ "grad_norm": 2.600109100341797,
378
+ "learning_rate": 4.932892571994342e-05,
379
+ "loss": 1.3499,
380
+ "step": 265
381
+ },
382
+ {
383
+ "epoch": 0.22582331416623105,
384
+ "grad_norm": 2.2291276454925537,
385
+ "learning_rate": 4.9303482613143194e-05,
386
+ "loss": 1.3984,
387
+ "step": 270
388
+ },
389
+ {
390
+ "epoch": 0.23000522739153162,
391
+ "grad_norm": 2.338000774383545,
392
+ "learning_rate": 4.9277572922978586e-05,
393
+ "loss": 1.344,
394
+ "step": 275
395
+ },
396
+ {
397
+ "epoch": 0.2341871406168322,
398
+ "grad_norm": 2.3423988819122314,
399
+ "learning_rate": 4.925119714686928e-05,
400
+ "loss": 1.2696,
401
+ "step": 280
402
+ },
403
+ {
404
+ "epoch": 0.23836905384213278,
405
+ "grad_norm": 2.433870553970337,
406
+ "learning_rate": 4.9224355791182955e-05,
407
+ "loss": 1.3903,
408
+ "step": 285
409
+ },
410
+ {
411
+ "epoch": 0.24255096706743334,
412
+ "grad_norm": 2.281836986541748,
413
+ "learning_rate": 4.919704937122559e-05,
414
+ "loss": 1.3694,
415
+ "step": 290
416
+ },
417
+ {
418
+ "epoch": 0.24673288029273394,
419
+ "grad_norm": 2.2429168224334717,
420
+ "learning_rate": 4.916927841123159e-05,
421
+ "loss": 1.3541,
422
+ "step": 295
423
+ },
424
+ {
425
+ "epoch": 0.2509147935180345,
426
+ "grad_norm": 2.4559881687164307,
427
+ "learning_rate": 4.9141043444353674e-05,
428
+ "loss": 1.3795,
429
+ "step": 300
430
+ },
431
+ {
432
+ "epoch": 0.25509670674333507,
433
+ "grad_norm": 2.7529919147491455,
434
+ "learning_rate": 4.911234501265266e-05,
435
+ "loss": 1.386,
436
+ "step": 305
437
+ },
438
+ {
439
+ "epoch": 0.25927861996863566,
440
+ "grad_norm": 2.229973554611206,
441
+ "learning_rate": 4.9083183667087064e-05,
442
+ "loss": 1.3653,
443
+ "step": 310
444
+ },
445
+ {
446
+ "epoch": 0.26346053319393625,
447
+ "grad_norm": 2.38677716255188,
448
+ "learning_rate": 4.9053559967502535e-05,
449
+ "loss": 1.3254,
450
+ "step": 315
451
+ },
452
+ {
453
+ "epoch": 0.2676424464192368,
454
+ "grad_norm": 2.292898416519165,
455
+ "learning_rate": 4.9023474482621075e-05,
456
+ "loss": 1.3756,
457
+ "step": 320
458
+ },
459
+ {
460
+ "epoch": 0.2718243596445374,
461
+ "grad_norm": 2.08819317817688,
462
+ "learning_rate": 4.899292779003014e-05,
463
+ "loss": 1.3286,
464
+ "step": 325
465
+ },
466
+ {
467
+ "epoch": 0.276006272869838,
468
+ "grad_norm": 2.1810929775238037,
469
+ "learning_rate": 4.896192047617156e-05,
470
+ "loss": 1.3884,
471
+ "step": 330
472
+ },
473
+ {
474
+ "epoch": 0.2801881860951385,
475
+ "grad_norm": 2.2136948108673096,
476
+ "learning_rate": 4.893045313633025e-05,
477
+ "loss": 1.2723,
478
+ "step": 335
479
+ },
480
+ {
481
+ "epoch": 0.2843700993204391,
482
+ "grad_norm": 2.3611645698547363,
483
+ "learning_rate": 4.8898526374622815e-05,
484
+ "loss": 1.3758,
485
+ "step": 340
486
+ },
487
+ {
488
+ "epoch": 0.2885520125457397,
489
+ "grad_norm": 2.2939200401306152,
490
+ "learning_rate": 4.886614080398594e-05,
491
+ "loss": 1.3727,
492
+ "step": 345
493
+ },
494
+ {
495
+ "epoch": 0.29273392577104024,
496
+ "grad_norm": 2.2382090091705322,
497
+ "learning_rate": 4.8833297046164594e-05,
498
+ "loss": 1.3412,
499
+ "step": 350
500
+ },
501
+ {
502
+ "epoch": 0.29691583899634083,
503
+ "grad_norm": 2.0420024394989014,
504
+ "learning_rate": 4.8799995731700155e-05,
505
+ "loss": 1.3378,
506
+ "step": 355
507
+ },
508
+ {
509
+ "epoch": 0.3010977522216414,
510
+ "grad_norm": 2.180814266204834,
511
+ "learning_rate": 4.8766237499918244e-05,
512
+ "loss": 1.3305,
513
+ "step": 360
514
+ },
515
+ {
516
+ "epoch": 0.30527966544694196,
517
+ "grad_norm": 2.641951560974121,
518
+ "learning_rate": 4.873202299891649e-05,
519
+ "loss": 1.3084,
520
+ "step": 365
521
+ },
522
+ {
523
+ "epoch": 0.30946157867224255,
524
+ "grad_norm": 2.421492099761963,
525
+ "learning_rate": 4.8697352885552077e-05,
526
+ "loss": 1.3321,
527
+ "step": 370
528
+ },
529
+ {
530
+ "epoch": 0.31364349189754315,
531
+ "grad_norm": 2.1777994632720947,
532
+ "learning_rate": 4.866222782542912e-05,
533
+ "loss": 1.3605,
534
+ "step": 375
535
+ },
536
+ {
537
+ "epoch": 0.3178254051228437,
538
+ "grad_norm": 2.097909450531006,
539
+ "learning_rate": 4.862664849288589e-05,
540
+ "loss": 1.3786,
541
+ "step": 380
542
+ },
543
+ {
544
+ "epoch": 0.3220073183481443,
545
+ "grad_norm": 2.3216776847839355,
546
+ "learning_rate": 4.8590615570981904e-05,
547
+ "loss": 1.3467,
548
+ "step": 385
549
+ },
550
+ {
551
+ "epoch": 0.32618923157344487,
552
+ "grad_norm": 2.199545383453369,
553
+ "learning_rate": 4.855412975148475e-05,
554
+ "loss": 1.3534,
555
+ "step": 390
556
+ },
557
+ {
558
+ "epoch": 0.3303711447987454,
559
+ "grad_norm": 2.4035983085632324,
560
+ "learning_rate": 4.851719173485686e-05,
561
+ "loss": 1.3545,
562
+ "step": 395
563
+ },
564
+ {
565
+ "epoch": 0.334553058024046,
566
+ "grad_norm": 2.130056142807007,
567
+ "learning_rate": 4.847980223024205e-05,
568
+ "loss": 1.3574,
569
+ "step": 400
570
+ },
571
+ {
572
+ "epoch": 0.3387349712493466,
573
+ "grad_norm": 2.2835471630096436,
574
+ "learning_rate": 4.8441961955451865e-05,
575
+ "loss": 1.2835,
576
+ "step": 405
577
+ },
578
+ {
579
+ "epoch": 0.34291688447464713,
580
+ "grad_norm": 2.129136085510254,
581
+ "learning_rate": 4.840367163695186e-05,
582
+ "loss": 1.3066,
583
+ "step": 410
584
+ },
585
+ {
586
+ "epoch": 0.3470987976999477,
587
+ "grad_norm": 2.199307680130005,
588
+ "learning_rate": 4.8364932009847614e-05,
589
+ "loss": 1.3891,
590
+ "step": 415
591
+ },
592
+ {
593
+ "epoch": 0.3512807109252483,
594
+ "grad_norm": 2.240877866744995,
595
+ "learning_rate": 4.8325743817870614e-05,
596
+ "loss": 1.3131,
597
+ "step": 420
598
+ },
599
+ {
600
+ "epoch": 0.35546262415054886,
601
+ "grad_norm": 2.4342916011810303,
602
+ "learning_rate": 4.8286107813364015e-05,
603
+ "loss": 1.2903,
604
+ "step": 425
605
+ },
606
+ {
607
+ "epoch": 0.35964453737584945,
608
+ "grad_norm": 2.323018789291382,
609
+ "learning_rate": 4.824602475726815e-05,
610
+ "loss": 1.3016,
611
+ "step": 430
612
+ },
613
+ {
614
+ "epoch": 0.36382645060115004,
615
+ "grad_norm": 2.2159998416900635,
616
+ "learning_rate": 4.820549541910595e-05,
617
+ "loss": 1.3312,
618
+ "step": 435
619
+ },
620
+ {
621
+ "epoch": 0.3680083638264506,
622
+ "grad_norm": 2.654232978820801,
623
+ "learning_rate": 4.8164520576968165e-05,
624
+ "loss": 1.3793,
625
+ "step": 440
626
+ },
627
+ {
628
+ "epoch": 0.3721902770517512,
629
+ "grad_norm": 2.3524513244628906,
630
+ "learning_rate": 4.8123101017498416e-05,
631
+ "loss": 1.3406,
632
+ "step": 445
633
+ },
634
+ {
635
+ "epoch": 0.37637219027705177,
636
+ "grad_norm": 2.3215346336364746,
637
+ "learning_rate": 4.8081237535878116e-05,
638
+ "loss": 1.3997,
639
+ "step": 450
640
+ },
641
+ {
642
+ "epoch": 0.3805541035023523,
643
+ "grad_norm": 2.0394012928009033,
644
+ "learning_rate": 4.803893093581117e-05,
645
+ "loss": 1.3084,
646
+ "step": 455
647
+ },
648
+ {
649
+ "epoch": 0.3847360167276529,
650
+ "grad_norm": 2.251598596572876,
651
+ "learning_rate": 4.799618202950857e-05,
652
+ "loss": 1.3396,
653
+ "step": 460
654
+ },
655
+ {
656
+ "epoch": 0.3889179299529535,
657
+ "grad_norm": 2.4240410327911377,
658
+ "learning_rate": 4.795299163767282e-05,
659
+ "loss": 1.359,
660
+ "step": 465
661
+ },
662
+ {
663
+ "epoch": 0.393099843178254,
664
+ "grad_norm": 2.2180769443511963,
665
+ "learning_rate": 4.790936058948211e-05,
666
+ "loss": 1.3575,
667
+ "step": 470
668
+ },
669
+ {
670
+ "epoch": 0.3972817564035546,
671
+ "grad_norm": 2.2782742977142334,
672
+ "learning_rate": 4.786528972257449e-05,
673
+ "loss": 1.2915,
674
+ "step": 475
675
+ },
676
+ {
677
+ "epoch": 0.4014636696288552,
678
+ "grad_norm": 2.2193851470947266,
679
+ "learning_rate": 4.7820779883031696e-05,
680
+ "loss": 1.3185,
681
+ "step": 480
682
+ },
683
+ {
684
+ "epoch": 0.40564558285415575,
685
+ "grad_norm": 2.2422704696655273,
686
+ "learning_rate": 4.7775831925363e-05,
687
+ "loss": 1.2638,
688
+ "step": 485
689
+ },
690
+ {
691
+ "epoch": 0.40982749607945634,
692
+ "grad_norm": 2.430154323577881,
693
+ "learning_rate": 4.773044671248872e-05,
694
+ "loss": 1.3616,
695
+ "step": 490
696
+ },
697
+ {
698
+ "epoch": 0.41400940930475694,
699
+ "grad_norm": 2.2115910053253174,
700
+ "learning_rate": 4.768462511572371e-05,
701
+ "loss": 1.35,
702
+ "step": 495
703
+ },
704
+ {
705
+ "epoch": 0.4181913225300575,
706
+ "grad_norm": 2.063385486602783,
707
+ "learning_rate": 4.763836801476061e-05,
708
+ "loss": 1.2797,
709
+ "step": 500
710
+ },
711
+ {
712
+ "epoch": 0.42237323575535807,
713
+ "grad_norm": 2.3508410453796387,
714
+ "learning_rate": 4.759167629765297e-05,
715
+ "loss": 1.3173,
716
+ "step": 505
717
+ },
718
+ {
719
+ "epoch": 0.42655514898065866,
720
+ "grad_norm": 2.3848047256469727,
721
+ "learning_rate": 4.7544550860798177e-05,
722
+ "loss": 1.2502,
723
+ "step": 510
724
+ },
725
+ {
726
+ "epoch": 0.43073706220595925,
727
+ "grad_norm": 2.227524757385254,
728
+ "learning_rate": 4.749699260892026e-05,
729
+ "loss": 1.3174,
730
+ "step": 515
731
+ },
732
+ {
733
+ "epoch": 0.4349189754312598,
734
+ "grad_norm": 2.2080676555633545,
735
+ "learning_rate": 4.744900245505253e-05,
736
+ "loss": 1.2567,
737
+ "step": 520
738
+ },
739
+ {
740
+ "epoch": 0.4391008886565604,
741
+ "grad_norm": 2.2167842388153076,
742
+ "learning_rate": 4.7400581320520055e-05,
743
+ "loss": 1.2906,
744
+ "step": 525
745
+ },
746
+ {
747
+ "epoch": 0.443282801881861,
748
+ "grad_norm": 2.4347431659698486,
749
+ "learning_rate": 4.735173013492193e-05,
750
+ "loss": 1.3109,
751
+ "step": 530
752
+ },
753
+ {
754
+ "epoch": 0.4474647151071615,
755
+ "grad_norm": 2.024902105331421,
756
+ "learning_rate": 4.73024498361135e-05,
757
+ "loss": 1.3265,
758
+ "step": 535
759
+ },
760
+ {
761
+ "epoch": 0.4516466283324621,
762
+ "grad_norm": 2.1333351135253906,
763
+ "learning_rate": 4.725274137018826e-05,
764
+ "loss": 1.3426,
765
+ "step": 540
766
+ },
767
+ {
768
+ "epoch": 0.4558285415577627,
769
+ "grad_norm": 2.400423049926758,
770
+ "learning_rate": 4.720260569145981e-05,
771
+ "loss": 1.2751,
772
+ "step": 545
773
+ },
774
+ {
775
+ "epoch": 0.46001045478306324,
776
+ "grad_norm": 2.5796918869018555,
777
+ "learning_rate": 4.715204376244343e-05,
778
+ "loss": 1.3453,
779
+ "step": 550
780
+ },
781
+ {
782
+ "epoch": 0.46419236800836383,
783
+ "grad_norm": 2.339895486831665,
784
+ "learning_rate": 4.7101056553837665e-05,
785
+ "loss": 1.2667,
786
+ "step": 555
787
+ },
788
+ {
789
+ "epoch": 0.4683742812336644,
790
+ "grad_norm": 2.273618221282959,
791
+ "learning_rate": 4.704964504450563e-05,
792
+ "loss": 1.3264,
793
+ "step": 560
794
+ },
795
+ {
796
+ "epoch": 0.47255619445896496,
797
+ "grad_norm": 1.9942044019699097,
798
+ "learning_rate": 4.69978102214563e-05,
799
+ "loss": 1.2907,
800
+ "step": 565
801
+ },
802
+ {
803
+ "epoch": 0.47673810768426556,
804
+ "grad_norm": 2.1848580837249756,
805
+ "learning_rate": 4.694555307982551e-05,
806
+ "loss": 1.2921,
807
+ "step": 570
808
+ },
809
+ {
810
+ "epoch": 0.48092002090956615,
811
+ "grad_norm": 2.2914299964904785,
812
+ "learning_rate": 4.689287462285681e-05,
813
+ "loss": 1.3612,
814
+ "step": 575
815
+ },
816
+ {
817
+ "epoch": 0.4851019341348667,
818
+ "grad_norm": 2.1165218353271484,
819
+ "learning_rate": 4.6839775861882306e-05,
820
+ "loss": 1.3419,
821
+ "step": 580
822
+ },
823
+ {
824
+ "epoch": 0.4892838473601673,
825
+ "grad_norm": 2.003962516784668,
826
+ "learning_rate": 4.678625781630315e-05,
827
+ "loss": 1.388,
828
+ "step": 585
829
+ },
830
+ {
831
+ "epoch": 0.49346576058546787,
832
+ "grad_norm": 2.366339683532715,
833
+ "learning_rate": 4.673232151357004e-05,
834
+ "loss": 1.2895,
835
+ "step": 590
836
+ },
837
+ {
838
+ "epoch": 0.4976476738107684,
839
+ "grad_norm": 2.4352052211761475,
840
+ "learning_rate": 4.667796798916343e-05,
841
+ "loss": 1.3457,
842
+ "step": 595
843
+ },
844
+ {
845
+ "epoch": 0.501829587036069,
846
+ "grad_norm": 2.4221272468566895,
847
+ "learning_rate": 4.662319828657371e-05,
848
+ "loss": 1.3298,
849
+ "step": 600
850
+ },
851
+ {
852
+ "epoch": 0.5060115002613695,
853
+ "grad_norm": 2.2551608085632324,
854
+ "learning_rate": 4.6568013457281126e-05,
855
+ "loss": 1.2192,
856
+ "step": 605
857
+ },
858
+ {
859
+ "epoch": 0.5101934134866701,
860
+ "grad_norm": 2.4560656547546387,
861
+ "learning_rate": 4.651241456073563e-05,
862
+ "loss": 1.3086,
863
+ "step": 610
864
+ },
865
+ {
866
+ "epoch": 0.5143753267119707,
867
+ "grad_norm": 2.266500473022461,
868
+ "learning_rate": 4.645640266433651e-05,
869
+ "loss": 1.2608,
870
+ "step": 615
871
+ },
872
+ {
873
+ "epoch": 0.5185572399372713,
874
+ "grad_norm": 2.175009250640869,
875
+ "learning_rate": 4.639997884341192e-05,
876
+ "loss": 1.3459,
877
+ "step": 620
878
+ },
879
+ {
880
+ "epoch": 0.5227391531625719,
881
+ "grad_norm": 2.329770803451538,
882
+ "learning_rate": 4.634314418119823e-05,
883
+ "loss": 1.2894,
884
+ "step": 625
885
+ },
886
+ {
887
+ "epoch": 0.5269210663878725,
888
+ "grad_norm": 2.545764446258545,
889
+ "learning_rate": 4.628589976881923e-05,
890
+ "loss": 1.3213,
891
+ "step": 630
892
+ },
893
+ {
894
+ "epoch": 0.531102979613173,
895
+ "grad_norm": 2.2759323120117188,
896
+ "learning_rate": 4.622824670526516e-05,
897
+ "loss": 1.3362,
898
+ "step": 635
899
+ },
900
+ {
901
+ "epoch": 0.5352848928384736,
902
+ "grad_norm": 2.341280221939087,
903
+ "learning_rate": 4.617018609737166e-05,
904
+ "loss": 1.24,
905
+ "step": 640
906
+ },
907
+ {
908
+ "epoch": 0.5394668060637742,
909
+ "grad_norm": 2.353301763534546,
910
+ "learning_rate": 4.6111719059798466e-05,
911
+ "loss": 1.3643,
912
+ "step": 645
913
+ },
914
+ {
915
+ "epoch": 0.5436487192890748,
916
+ "grad_norm": 2.223389148712158,
917
+ "learning_rate": 4.605284671500805e-05,
918
+ "loss": 1.3569,
919
+ "step": 650
920
+ },
921
+ {
922
+ "epoch": 0.5478306325143754,
923
+ "grad_norm": 2.2959463596343994,
924
+ "learning_rate": 4.599357019324405e-05,
925
+ "loss": 1.3581,
926
+ "step": 655
927
+ },
928
+ {
929
+ "epoch": 0.552012545739676,
930
+ "grad_norm": 2.1995513439178467,
931
+ "learning_rate": 4.593389063250958e-05,
932
+ "loss": 1.2962,
933
+ "step": 660
934
+ },
935
+ {
936
+ "epoch": 0.5561944589649764,
937
+ "grad_norm": 1.9802824258804321,
938
+ "learning_rate": 4.5873809178545396e-05,
939
+ "loss": 1.3121,
940
+ "step": 665
941
+ },
942
+ {
943
+ "epoch": 0.560376372190277,
944
+ "grad_norm": 2.2561113834381104,
945
+ "learning_rate": 4.581332698480786e-05,
946
+ "loss": 1.2395,
947
+ "step": 670
948
+ },
949
+ {
950
+ "epoch": 0.5645582854155776,
951
+ "grad_norm": 2.1277997493743896,
952
+ "learning_rate": 4.5752445212446836e-05,
953
+ "loss": 1.2782,
954
+ "step": 675
955
+ },
956
+ {
957
+ "epoch": 0.5687401986408782,
958
+ "grad_norm": 2.1406946182250977,
959
+ "learning_rate": 4.569116503028339e-05,
960
+ "loss": 1.3248,
961
+ "step": 680
962
+ },
963
+ {
964
+ "epoch": 0.5729221118661788,
965
+ "grad_norm": 2.190600872039795,
966
+ "learning_rate": 4.5629487614787306e-05,
967
+ "loss": 1.2975,
968
+ "step": 685
969
+ },
970
+ {
971
+ "epoch": 0.5771040250914794,
972
+ "grad_norm": 2.1068780422210693,
973
+ "learning_rate": 4.556741415005459e-05,
974
+ "loss": 1.3191,
975
+ "step": 690
976
+ },
977
+ {
978
+ "epoch": 0.5812859383167799,
979
+ "grad_norm": 2.3496341705322266,
980
+ "learning_rate": 4.5504945827784634e-05,
981
+ "loss": 1.2827,
982
+ "step": 695
983
+ },
984
+ {
985
+ "epoch": 0.5854678515420805,
986
+ "grad_norm": 2.33406400680542,
987
+ "learning_rate": 4.544208384725742e-05,
988
+ "loss": 1.2649,
989
+ "step": 700
990
+ },
991
+ {
992
+ "epoch": 0.5896497647673811,
993
+ "grad_norm": 2.365461587905884,
994
+ "learning_rate": 4.5378829415310465e-05,
995
+ "loss": 1.3121,
996
+ "step": 705
997
+ },
998
+ {
999
+ "epoch": 0.5938316779926817,
1000
+ "grad_norm": 2.186875343322754,
1001
+ "learning_rate": 4.531518374631564e-05,
1002
+ "loss": 1.3113,
1003
+ "step": 710
1004
+ },
1005
+ {
1006
+ "epoch": 0.5980135912179823,
1007
+ "grad_norm": 2.1798176765441895,
1008
+ "learning_rate": 4.525114806215584e-05,
1009
+ "loss": 1.3446,
1010
+ "step": 715
1011
+ },
1012
+ {
1013
+ "epoch": 0.6021955044432828,
1014
+ "grad_norm": 1.9714925289154053,
1015
+ "learning_rate": 4.518672359220161e-05,
1016
+ "loss": 1.2979,
1017
+ "step": 720
1018
+ },
1019
+ {
1020
+ "epoch": 0.6063774176685833,
1021
+ "grad_norm": 2.1850643157958984,
1022
+ "learning_rate": 4.5121911573287446e-05,
1023
+ "loss": 1.2412,
1024
+ "step": 725
1025
+ },
1026
+ {
1027
+ "epoch": 0.6105593308938839,
1028
+ "grad_norm": 2.0708961486816406,
1029
+ "learning_rate": 4.505671324968811e-05,
1030
+ "loss": 1.2559,
1031
+ "step": 730
1032
+ },
1033
+ {
1034
+ "epoch": 0.6147412441191845,
1035
+ "grad_norm": 2.461225986480713,
1036
+ "learning_rate": 4.49911298730947e-05,
1037
+ "loss": 1.3696,
1038
+ "step": 735
1039
+ },
1040
+ {
1041
+ "epoch": 0.6189231573444851,
1042
+ "grad_norm": 2.2155981063842773,
1043
+ "learning_rate": 4.492516270259066e-05,
1044
+ "loss": 1.3459,
1045
+ "step": 740
1046
+ },
1047
+ {
1048
+ "epoch": 0.6231050705697857,
1049
+ "grad_norm": 2.0899972915649414,
1050
+ "learning_rate": 4.48588130046276e-05,
1051
+ "loss": 1.2885,
1052
+ "step": 745
1053
+ },
1054
+ {
1055
+ "epoch": 0.6272869837950863,
1056
+ "grad_norm": 2.2798969745635986,
1057
+ "learning_rate": 4.479208205300094e-05,
1058
+ "loss": 1.299,
1059
+ "step": 750
1060
+ },
1061
+ {
1062
+ "epoch": 0.6314688970203868,
1063
+ "grad_norm": 2.4617793560028076,
1064
+ "learning_rate": 4.472497112882552e-05,
1065
+ "loss": 1.2973,
1066
+ "step": 755
1067
+ },
1068
+ {
1069
+ "epoch": 0.6356508102456874,
1070
+ "grad_norm": 2.20271635055542,
1071
+ "learning_rate": 4.465748152051096e-05,
1072
+ "loss": 1.231,
1073
+ "step": 760
1074
+ },
1075
+ {
1076
+ "epoch": 0.639832723470988,
1077
+ "grad_norm": 1.990775227546692,
1078
+ "learning_rate": 4.458961452373692e-05,
1079
+ "loss": 1.2594,
1080
+ "step": 765
1081
+ },
1082
+ {
1083
+ "epoch": 0.6440146366962886,
1084
+ "grad_norm": 2.2898495197296143,
1085
+ "learning_rate": 4.4521371441428284e-05,
1086
+ "loss": 1.2721,
1087
+ "step": 770
1088
+ },
1089
+ {
1090
+ "epoch": 0.6481965499215891,
1091
+ "grad_norm": 2.282844066619873,
1092
+ "learning_rate": 4.445275358373006e-05,
1093
+ "loss": 1.2742,
1094
+ "step": 775
1095
+ },
1096
+ {
1097
+ "epoch": 0.6523784631468897,
1098
+ "grad_norm": 2.3958911895751953,
1099
+ "learning_rate": 4.438376226798231e-05,
1100
+ "loss": 1.2897,
1101
+ "step": 780
1102
+ },
1103
+ {
1104
+ "epoch": 0.6565603763721902,
1105
+ "grad_norm": 2.2609405517578125,
1106
+ "learning_rate": 4.43143988186948e-05,
1107
+ "loss": 1.2891,
1108
+ "step": 785
1109
+ },
1110
+ {
1111
+ "epoch": 0.6607422895974908,
1112
+ "grad_norm": 2.416956663131714,
1113
+ "learning_rate": 4.42446645675216e-05,
1114
+ "loss": 1.3711,
1115
+ "step": 790
1116
+ },
1117
+ {
1118
+ "epoch": 0.6649242028227914,
1119
+ "grad_norm": 2.280547857284546,
1120
+ "learning_rate": 4.4174560853235505e-05,
1121
+ "loss": 1.3413,
1122
+ "step": 795
1123
+ },
1124
+ {
1125
+ "epoch": 0.669106116048092,
1126
+ "grad_norm": 2.055990695953369,
1127
+ "learning_rate": 4.410408902170235e-05,
1128
+ "loss": 1.2829,
1129
+ "step": 800
1130
+ },
1131
+ {
1132
+ "epoch": 0.6732880292733926,
1133
+ "grad_norm": 2.1436378955841064,
1134
+ "learning_rate": 4.403325042585518e-05,
1135
+ "loss": 1.2223,
1136
+ "step": 805
1137
+ },
1138
+ {
1139
+ "epoch": 0.6774699424986932,
1140
+ "grad_norm": 2.1957428455352783,
1141
+ "learning_rate": 4.396204642566821e-05,
1142
+ "loss": 1.2923,
1143
+ "step": 810
1144
+ },
1145
+ {
1146
+ "epoch": 0.6816518557239938,
1147
+ "grad_norm": 2.1543030738830566,
1148
+ "learning_rate": 4.389047838813082e-05,
1149
+ "loss": 1.2296,
1150
+ "step": 815
1151
+ },
1152
+ {
1153
+ "epoch": 0.6858337689492943,
1154
+ "grad_norm": 2.1455068588256836,
1155
+ "learning_rate": 4.3818547687221204e-05,
1156
+ "loss": 1.2913,
1157
+ "step": 820
1158
+ },
1159
+ {
1160
+ "epoch": 0.6900156821745949,
1161
+ "grad_norm": 2.2354376316070557,
1162
+ "learning_rate": 4.374625570388008e-05,
1163
+ "loss": 1.2647,
1164
+ "step": 825
1165
+ },
1166
+ {
1167
+ "epoch": 0.6941975953998955,
1168
+ "grad_norm": 2.2896392345428467,
1169
+ "learning_rate": 4.367360382598413e-05,
1170
+ "loss": 1.329,
1171
+ "step": 830
1172
+ },
1173
+ {
1174
+ "epoch": 0.698379508625196,
1175
+ "grad_norm": 2.0567495822906494,
1176
+ "learning_rate": 4.360059344831936e-05,
1177
+ "loss": 1.2306,
1178
+ "step": 835
1179
+ },
1180
+ {
1181
+ "epoch": 0.7025614218504966,
1182
+ "grad_norm": 2.0631346702575684,
1183
+ "learning_rate": 4.352722597255434e-05,
1184
+ "loss": 1.2788,
1185
+ "step": 840
1186
+ },
1187
+ {
1188
+ "epoch": 0.7067433350757972,
1189
+ "grad_norm": 2.3736605644226074,
1190
+ "learning_rate": 4.345350280721328e-05,
1191
+ "loss": 1.3236,
1192
+ "step": 845
1193
+ },
1194
+ {
1195
+ "epoch": 0.7109252483010977,
1196
+ "grad_norm": 2.308960437774658,
1197
+ "learning_rate": 4.337942536764901e-05,
1198
+ "loss": 1.2993,
1199
+ "step": 850
1200
+ },
1201
+ {
1202
+ "epoch": 0.7151071615263983,
1203
+ "grad_norm": 2.230039358139038,
1204
+ "learning_rate": 4.330499507601575e-05,
1205
+ "loss": 1.2544,
1206
+ "step": 855
1207
+ },
1208
+ {
1209
+ "epoch": 0.7192890747516989,
1210
+ "grad_norm": 2.1853036880493164,
1211
+ "learning_rate": 4.3230213361241894e-05,
1212
+ "loss": 1.2829,
1213
+ "step": 860
1214
+ },
1215
+ {
1216
+ "epoch": 0.7234709879769995,
1217
+ "grad_norm": 2.2859535217285156,
1218
+ "learning_rate": 4.3155081659002506e-05,
1219
+ "loss": 1.3128,
1220
+ "step": 865
1221
+ },
1222
+ {
1223
+ "epoch": 0.7276529012023001,
1224
+ "grad_norm": 1.9847872257232666,
1225
+ "learning_rate": 4.3079601411691775e-05,
1226
+ "loss": 1.3126,
1227
+ "step": 870
1228
+ },
1229
+ {
1230
+ "epoch": 0.7318348144276007,
1231
+ "grad_norm": 2.3204383850097656,
1232
+ "learning_rate": 4.3003774068395355e-05,
1233
+ "loss": 1.3275,
1234
+ "step": 875
1235
+ },
1236
+ {
1237
+ "epoch": 0.7360167276529012,
1238
+ "grad_norm": 2.1831467151641846,
1239
+ "learning_rate": 4.292760108486251e-05,
1240
+ "loss": 1.3232,
1241
+ "step": 880
1242
+ },
1243
+ {
1244
+ "epoch": 0.7401986408782018,
1245
+ "grad_norm": 2.241342544555664,
1246
+ "learning_rate": 4.2851083923478186e-05,
1247
+ "loss": 1.2484,
1248
+ "step": 885
1249
+ },
1250
+ {
1251
+ "epoch": 0.7443805541035023,
1252
+ "grad_norm": 2.335024118423462,
1253
+ "learning_rate": 4.27742240532349e-05,
1254
+ "loss": 1.2466,
1255
+ "step": 890
1256
+ },
1257
+ {
1258
+ "epoch": 0.7485624673288029,
1259
+ "grad_norm": 2.592941999435425,
1260
+ "learning_rate": 4.269702294970461e-05,
1261
+ "loss": 1.3111,
1262
+ "step": 895
1263
+ },
1264
+ {
1265
+ "epoch": 0.7527443805541035,
1266
+ "grad_norm": 2.1964266300201416,
1267
+ "learning_rate": 4.26194820950103e-05,
1268
+ "loss": 1.3172,
1269
+ "step": 900
1270
+ },
1271
+ {
1272
+ "epoch": 0.7569262937794041,
1273
+ "grad_norm": 2.253929853439331,
1274
+ "learning_rate": 4.25416029777976e-05,
1275
+ "loss": 1.2977,
1276
+ "step": 905
1277
+ },
1278
+ {
1279
+ "epoch": 0.7611082070047046,
1280
+ "grad_norm": 2.3653616905212402,
1281
+ "learning_rate": 4.246338709320615e-05,
1282
+ "loss": 1.3397,
1283
+ "step": 910
1284
+ },
1285
+ {
1286
+ "epoch": 0.7652901202300052,
1287
+ "grad_norm": 1.9790107011795044,
1288
+ "learning_rate": 4.238483594284094e-05,
1289
+ "loss": 1.299,
1290
+ "step": 915
1291
+ },
1292
+ {
1293
+ "epoch": 0.7694720334553058,
1294
+ "grad_norm": 2.177802324295044,
1295
+ "learning_rate": 4.230595103474345e-05,
1296
+ "loss": 1.2083,
1297
+ "step": 920
1298
+ },
1299
+ {
1300
+ "epoch": 0.7736539466806064,
1301
+ "grad_norm": 2.267620325088501,
1302
+ "learning_rate": 4.222673388336272e-05,
1303
+ "loss": 1.2712,
1304
+ "step": 925
1305
+ },
1306
+ {
1307
+ "epoch": 0.777835859905907,
1308
+ "grad_norm": 2.323590040206909,
1309
+ "learning_rate": 4.214718600952627e-05,
1310
+ "loss": 1.2944,
1311
+ "step": 930
1312
+ },
1313
+ {
1314
+ "epoch": 0.7820177731312076,
1315
+ "grad_norm": 2.2184839248657227,
1316
+ "learning_rate": 4.2067308940410874e-05,
1317
+ "loss": 1.3033,
1318
+ "step": 935
1319
+ },
1320
+ {
1321
+ "epoch": 0.786199686356508,
1322
+ "grad_norm": 2.056938648223877,
1323
+ "learning_rate": 4.1987104209513295e-05,
1324
+ "loss": 1.2563,
1325
+ "step": 940
1326
+ },
1327
+ {
1328
+ "epoch": 0.7903815995818086,
1329
+ "grad_norm": 2.0813541412353516,
1330
+ "learning_rate": 4.1906573356620795e-05,
1331
+ "loss": 1.3169,
1332
+ "step": 945
1333
+ },
1334
+ {
1335
+ "epoch": 0.7945635128071092,
1336
+ "grad_norm": 2.2418994903564453,
1337
+ "learning_rate": 4.182571792778163e-05,
1338
+ "loss": 1.3366,
1339
+ "step": 950
1340
+ },
1341
+ {
1342
+ "epoch": 0.7987454260324098,
1343
+ "grad_norm": 2.1746504306793213,
1344
+ "learning_rate": 4.1744539475275276e-05,
1345
+ "loss": 1.2695,
1346
+ "step": 955
1347
+ },
1348
+ {
1349
+ "epoch": 0.8029273392577104,
1350
+ "grad_norm": 2.385899066925049,
1351
+ "learning_rate": 4.1663039557582725e-05,
1352
+ "loss": 1.275,
1353
+ "step": 960
1354
+ },
1355
+ {
1356
+ "epoch": 0.807109252483011,
1357
+ "grad_norm": 2.4620730876922607,
1358
+ "learning_rate": 4.158121973935653e-05,
1359
+ "loss": 1.3441,
1360
+ "step": 965
1361
+ },
1362
+ {
1363
+ "epoch": 0.8112911657083115,
1364
+ "grad_norm": 2.090711832046509,
1365
+ "learning_rate": 4.149908159139073e-05,
1366
+ "loss": 1.2754,
1367
+ "step": 970
1368
+ },
1369
+ {
1370
+ "epoch": 0.8154730789336121,
1371
+ "grad_norm": 2.367952346801758,
1372
+ "learning_rate": 4.141662669059076e-05,
1373
+ "loss": 1.2821,
1374
+ "step": 975
1375
+ },
1376
+ {
1377
+ "epoch": 0.8196549921589127,
1378
+ "grad_norm": 2.406475782394409,
1379
+ "learning_rate": 4.133385661994312e-05,
1380
+ "loss": 1.2917,
1381
+ "step": 980
1382
+ },
1383
+ {
1384
+ "epoch": 0.8238369053842133,
1385
+ "grad_norm": 2.0637738704681396,
1386
+ "learning_rate": 4.125077296848501e-05,
1387
+ "loss": 1.2582,
1388
+ "step": 985
1389
+ },
1390
+ {
1391
+ "epoch": 0.8280188186095139,
1392
+ "grad_norm": 2.3852176666259766,
1393
+ "learning_rate": 4.1167377331273825e-05,
1394
+ "loss": 1.2278,
1395
+ "step": 990
1396
+ },
1397
+ {
1398
+ "epoch": 0.8322007318348145,
1399
+ "grad_norm": 2.0237960815429688,
1400
+ "learning_rate": 4.1083671309356526e-05,
1401
+ "loss": 1.3461,
1402
+ "step": 995
1403
+ },
1404
+ {
1405
+ "epoch": 0.836382645060115,
1406
+ "grad_norm": 2.458031177520752,
1407
+ "learning_rate": 4.0999656509738904e-05,
1408
+ "loss": 1.2484,
1409
+ "step": 1000
1410
+ },
1411
+ {
1412
+ "epoch": 0.8405645582854155,
1413
+ "grad_norm": 2.3338541984558105,
1414
+ "learning_rate": 4.0915334545354734e-05,
1415
+ "loss": 1.2526,
1416
+ "step": 1005
1417
+ },
1418
+ {
1419
+ "epoch": 0.8447464715107161,
1420
+ "grad_norm": 2.289320230484009,
1421
+ "learning_rate": 4.0830707035034795e-05,
1422
+ "loss": 1.2643,
1423
+ "step": 1010
1424
+ },
1425
+ {
1426
+ "epoch": 0.8489283847360167,
1427
+ "grad_norm": 2.461848735809326,
1428
+ "learning_rate": 4.074577560347581e-05,
1429
+ "loss": 1.29,
1430
+ "step": 1015
1431
+ },
1432
+ {
1433
+ "epoch": 0.8531102979613173,
1434
+ "grad_norm": 2.458249568939209,
1435
+ "learning_rate": 4.066054188120924e-05,
1436
+ "loss": 1.2673,
1437
+ "step": 1020
1438
+ },
1439
+ {
1440
+ "epoch": 0.8572922111866179,
1441
+ "grad_norm": 2.21874737739563,
1442
+ "learning_rate": 4.0575007504569994e-05,
1443
+ "loss": 1.3085,
1444
+ "step": 1025
1445
+ },
1446
+ {
1447
+ "epoch": 0.8614741244119185,
1448
+ "grad_norm": 2.275045871734619,
1449
+ "learning_rate": 4.0489174115665006e-05,
1450
+ "loss": 1.2335,
1451
+ "step": 1030
1452
+ },
1453
+ {
1454
+ "epoch": 0.865656037637219,
1455
+ "grad_norm": 2.1374430656433105,
1456
+ "learning_rate": 4.04030433623417e-05,
1457
+ "loss": 1.2699,
1458
+ "step": 1035
1459
+ },
1460
+ {
1461
+ "epoch": 0.8698379508625196,
1462
+ "grad_norm": 2.3316330909729004,
1463
+ "learning_rate": 4.031661689815637e-05,
1464
+ "loss": 1.2847,
1465
+ "step": 1040
1466
+ },
1467
+ {
1468
+ "epoch": 0.8740198640878202,
1469
+ "grad_norm": 2.3386001586914062,
1470
+ "learning_rate": 4.022989638234243e-05,
1471
+ "loss": 1.3394,
1472
+ "step": 1045
1473
+ },
1474
+ {
1475
+ "epoch": 0.8782017773131208,
1476
+ "grad_norm": 2.4343066215515137,
1477
+ "learning_rate": 4.0142883479778555e-05,
1478
+ "loss": 1.2541,
1479
+ "step": 1050
1480
+ },
1481
+ {
1482
+ "epoch": 0.8823836905384214,
1483
+ "grad_norm": 2.1589419841766357,
1484
+ "learning_rate": 4.005557986095673e-05,
1485
+ "loss": 1.3327,
1486
+ "step": 1055
1487
+ },
1488
+ {
1489
+ "epoch": 0.886565603763722,
1490
+ "grad_norm": 2.2223572731018066,
1491
+ "learning_rate": 3.996798720195018e-05,
1492
+ "loss": 1.3057,
1493
+ "step": 1060
1494
+ },
1495
+ {
1496
+ "epoch": 0.8907475169890224,
1497
+ "grad_norm": 2.5740420818328857,
1498
+ "learning_rate": 3.988010718438115e-05,
1499
+ "loss": 1.2945,
1500
+ "step": 1065
1501
+ },
1502
+ {
1503
+ "epoch": 0.894929430214323,
1504
+ "grad_norm": 2.214611053466797,
1505
+ "learning_rate": 3.9791941495388696e-05,
1506
+ "loss": 1.2855,
1507
+ "step": 1070
1508
+ },
1509
+ {
1510
+ "epoch": 0.8991113434396236,
1511
+ "grad_norm": 2.21602463722229,
1512
+ "learning_rate": 3.970349182759623e-05,
1513
+ "loss": 1.3271,
1514
+ "step": 1075
1515
+ },
1516
+ {
1517
+ "epoch": 0.9032932566649242,
1518
+ "grad_norm": 2.2032415866851807,
1519
+ "learning_rate": 3.9614759879079057e-05,
1520
+ "loss": 1.2027,
1521
+ "step": 1080
1522
+ },
1523
+ {
1524
+ "epoch": 0.9074751698902248,
1525
+ "grad_norm": 2.518728017807007,
1526
+ "learning_rate": 3.9525747353331746e-05,
1527
+ "loss": 1.2886,
1528
+ "step": 1085
1529
+ },
1530
+ {
1531
+ "epoch": 0.9116570831155254,
1532
+ "grad_norm": 2.438845157623291,
1533
+ "learning_rate": 3.943645595923548e-05,
1534
+ "loss": 1.2054,
1535
+ "step": 1090
1536
+ },
1537
+ {
1538
+ "epoch": 0.9158389963408259,
1539
+ "grad_norm": 2.460658073425293,
1540
+ "learning_rate": 3.934688741102521e-05,
1541
+ "loss": 1.2868,
1542
+ "step": 1095
1543
+ },
1544
+ {
1545
+ "epoch": 0.9200209095661265,
1546
+ "grad_norm": 2.09260892868042,
1547
+ "learning_rate": 3.925704342825671e-05,
1548
+ "loss": 1.3027,
1549
+ "step": 1100
1550
+ }
1551
+ ],
1552
+ "logging_steps": 5,
1553
+ "max_steps": 3585,
1554
+ "num_input_tokens_seen": 0,
1555
+ "num_train_epochs": 3,
1556
+ "save_steps": 100,
1557
+ "stateful_callbacks": {
1558
+ "TrainerControl": {
1559
+ "args": {
1560
+ "should_epoch_stop": false,
1561
+ "should_evaluate": false,
1562
+ "should_log": false,
1563
+ "should_save": true,
1564
+ "should_training_stop": false
1565
+ },
1566
+ "attributes": {}
1567
+ }
1568
+ },
1569
+ "total_flos": 1.4234917056302285e+18,
1570
+ "train_batch_size": 2,
1571
+ "trial_name": null,
1572
+ "trial_params": null
1573
+ }
checkpoint-1100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ad489884f87ec96cc1e49b25622db5c6c3c1eafcad1be5306265a7460b6619a
3
+ size 5304
checkpoint-1200/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: mistralai/Mistral-7B-Instruct-v0.3
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1
checkpoint-1200/adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "v_proj",
24
+ "q_proj"
25
+ ],
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
checkpoint-1200/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11d19254861b1a6d2b796dc92ac6c0652e2f8a2da9421ace8c72f7f2f28114a2
3
+ size 13648432
checkpoint-1200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44b6d02692d1e8eedb51d4521e1e229bf559f1f5c78e18827baa34ef6b6244d7
3
+ size 27370618
checkpoint-1200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ff264f99d31b522cc7e2a4eac9d38606d0c58a34c0adc74d71e0ca8b371dc36
3
+ size 14244
checkpoint-1200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83c748c565b326f1420d0de27e5cc267bde896ddc4ed07b6f53c713ba343ded2
3
+ size 1064
checkpoint-1200/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-1200/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1200/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
3
+ size 587404
checkpoint-1200/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1200/trainer_state.json ADDED
@@ -0,0 +1,1713 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.003659174072138,
5
+ "eval_steps": 500,
6
+ "global_step": 1200,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.004181913225300575,
13
+ "grad_norm": 3.3295910358428955,
14
+ "learning_rate": 4.9999760022374266e-05,
15
+ "loss": 1.8032,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.00836382645060115,
20
+ "grad_norm": 2.002122163772583,
21
+ "learning_rate": 4.999904009410418e-05,
22
+ "loss": 1.5943,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.012545739675901725,
27
+ "grad_norm": 1.9293591976165771,
28
+ "learning_rate": 4.9997840229011085e-05,
29
+ "loss": 1.5378,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.0167276529012023,
34
+ "grad_norm": 1.936969518661499,
35
+ "learning_rate": 4.999616045013025e-05,
36
+ "loss": 1.5856,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.020909566126502875,
41
+ "grad_norm": 1.8237314224243164,
42
+ "learning_rate": 4.9994000789710415e-05,
43
+ "loss": 1.5066,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.02509147935180345,
48
+ "grad_norm": 1.8101543188095093,
49
+ "learning_rate": 4.9991361289213203e-05,
50
+ "loss": 1.4499,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.029273392577104027,
55
+ "grad_norm": 1.647802710533142,
56
+ "learning_rate": 4.998824199931228e-05,
57
+ "loss": 1.4694,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.0334553058024046,
62
+ "grad_norm": 1.7077428102493286,
63
+ "learning_rate": 4.998464297989245e-05,
64
+ "loss": 1.4945,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.03763721902770518,
69
+ "grad_norm": 1.7119005918502808,
70
+ "learning_rate": 4.998056430004844e-05,
71
+ "loss": 1.4257,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 0.04181913225300575,
76
+ "grad_norm": 2.0229578018188477,
77
+ "learning_rate": 4.997600603808359e-05,
78
+ "loss": 1.4713,
79
+ "step": 50
80
+ },
81
+ {
82
+ "epoch": 0.04600104547830632,
83
+ "grad_norm": 1.885362148284912,
84
+ "learning_rate": 4.997096828150838e-05,
85
+ "loss": 1.5145,
86
+ "step": 55
87
+ },
88
+ {
89
+ "epoch": 0.0501829587036069,
90
+ "grad_norm": 2.0357539653778076,
91
+ "learning_rate": 4.9965451127038714e-05,
92
+ "loss": 1.3811,
93
+ "step": 60
94
+ },
95
+ {
96
+ "epoch": 0.054364871928907474,
97
+ "grad_norm": 1.9780900478363037,
98
+ "learning_rate": 4.9959454680594086e-05,
99
+ "loss": 1.412,
100
+ "step": 65
101
+ },
102
+ {
103
+ "epoch": 0.05854678515420805,
104
+ "grad_norm": 1.6695700883865356,
105
+ "learning_rate": 4.995297905729554e-05,
106
+ "loss": 1.4093,
107
+ "step": 70
108
+ },
109
+ {
110
+ "epoch": 0.06272869837950862,
111
+ "grad_norm": 1.9424223899841309,
112
+ "learning_rate": 4.994602438146344e-05,
113
+ "loss": 1.4615,
114
+ "step": 75
115
+ },
116
+ {
117
+ "epoch": 0.0669106116048092,
118
+ "grad_norm": 1.9082552194595337,
119
+ "learning_rate": 4.9938590786615126e-05,
120
+ "loss": 1.4169,
121
+ "step": 80
122
+ },
123
+ {
124
+ "epoch": 0.07109252483010978,
125
+ "grad_norm": 1.8944685459136963,
126
+ "learning_rate": 4.993067841546231e-05,
127
+ "loss": 1.3064,
128
+ "step": 85
129
+ },
130
+ {
131
+ "epoch": 0.07527443805541036,
132
+ "grad_norm": 1.9688103199005127,
133
+ "learning_rate": 4.992228741990834e-05,
134
+ "loss": 1.3778,
135
+ "step": 90
136
+ },
137
+ {
138
+ "epoch": 0.07945635128071092,
139
+ "grad_norm": 2.0939650535583496,
140
+ "learning_rate": 4.991341796104534e-05,
141
+ "loss": 1.3905,
142
+ "step": 95
143
+ },
144
+ {
145
+ "epoch": 0.0836382645060115,
146
+ "grad_norm": 1.868986964225769,
147
+ "learning_rate": 4.9904070209151015e-05,
148
+ "loss": 1.3723,
149
+ "step": 100
150
+ },
151
+ {
152
+ "epoch": 0.08782017773131208,
153
+ "grad_norm": 2.069476842880249,
154
+ "learning_rate": 4.989424434368549e-05,
155
+ "loss": 1.4046,
156
+ "step": 105
157
+ },
158
+ {
159
+ "epoch": 0.09200209095661264,
160
+ "grad_norm": 2.1500356197357178,
161
+ "learning_rate": 4.988394055328779e-05,
162
+ "loss": 1.4319,
163
+ "step": 110
164
+ },
165
+ {
166
+ "epoch": 0.09618400418191322,
167
+ "grad_norm": 1.9786696434020996,
168
+ "learning_rate": 4.987315903577223e-05,
169
+ "loss": 1.4203,
170
+ "step": 115
171
+ },
172
+ {
173
+ "epoch": 0.1003659174072138,
174
+ "grad_norm": 2.035172939300537,
175
+ "learning_rate": 4.986189999812468e-05,
176
+ "loss": 1.4046,
177
+ "step": 120
178
+ },
179
+ {
180
+ "epoch": 0.10454783063251437,
181
+ "grad_norm": 1.9887930154800415,
182
+ "learning_rate": 4.985016365649848e-05,
183
+ "loss": 1.3836,
184
+ "step": 125
185
+ },
186
+ {
187
+ "epoch": 0.10872974385781495,
188
+ "grad_norm": 1.8339554071426392,
189
+ "learning_rate": 4.983795023621041e-05,
190
+ "loss": 1.3468,
191
+ "step": 130
192
+ },
193
+ {
194
+ "epoch": 0.11291165708311553,
195
+ "grad_norm": 2.06648588180542,
196
+ "learning_rate": 4.982525997173625e-05,
197
+ "loss": 1.3481,
198
+ "step": 135
199
+ },
200
+ {
201
+ "epoch": 0.1170935703084161,
202
+ "grad_norm": 2.1179702281951904,
203
+ "learning_rate": 4.9812093106706376e-05,
204
+ "loss": 1.4475,
205
+ "step": 140
206
+ },
207
+ {
208
+ "epoch": 0.12127548353371667,
209
+ "grad_norm": 2.1708436012268066,
210
+ "learning_rate": 4.979844989390104e-05,
211
+ "loss": 1.3662,
212
+ "step": 145
213
+ },
214
+ {
215
+ "epoch": 0.12545739675901724,
216
+ "grad_norm": 2.0533807277679443,
217
+ "learning_rate": 4.978433059524548e-05,
218
+ "loss": 1.3856,
219
+ "step": 150
220
+ },
221
+ {
222
+ "epoch": 0.12963930998431783,
223
+ "grad_norm": 1.9418696165084839,
224
+ "learning_rate": 4.976973548180498e-05,
225
+ "loss": 1.357,
226
+ "step": 155
227
+ },
228
+ {
229
+ "epoch": 0.1338212232096184,
230
+ "grad_norm": 2.1071596145629883,
231
+ "learning_rate": 4.975466483377959e-05,
232
+ "loss": 1.3613,
233
+ "step": 160
234
+ },
235
+ {
236
+ "epoch": 0.138003136434919,
237
+ "grad_norm": 2.3704464435577393,
238
+ "learning_rate": 4.9739118940498766e-05,
239
+ "loss": 1.3985,
240
+ "step": 165
241
+ },
242
+ {
243
+ "epoch": 0.14218504966021955,
244
+ "grad_norm": 2.1726176738739014,
245
+ "learning_rate": 4.9723098100415844e-05,
246
+ "loss": 1.4361,
247
+ "step": 170
248
+ },
249
+ {
250
+ "epoch": 0.14636696288552012,
251
+ "grad_norm": 2.608517646789551,
252
+ "learning_rate": 4.970660262110227e-05,
253
+ "loss": 1.3565,
254
+ "step": 175
255
+ },
256
+ {
257
+ "epoch": 0.1505488761108207,
258
+ "grad_norm": 2.185563564300537,
259
+ "learning_rate": 4.968963281924173e-05,
260
+ "loss": 1.2957,
261
+ "step": 180
262
+ },
263
+ {
264
+ "epoch": 0.15473078933612128,
265
+ "grad_norm": 1.9632441997528076,
266
+ "learning_rate": 4.967218902062403e-05,
267
+ "loss": 1.2833,
268
+ "step": 185
269
+ },
270
+ {
271
+ "epoch": 0.15891270256142184,
272
+ "grad_norm": 2.303462028503418,
273
+ "learning_rate": 4.96542715601389e-05,
274
+ "loss": 1.3238,
275
+ "step": 190
276
+ },
277
+ {
278
+ "epoch": 0.16309461578672244,
279
+ "grad_norm": 2.3211796283721924,
280
+ "learning_rate": 4.9635880781769495e-05,
281
+ "loss": 1.3848,
282
+ "step": 195
283
+ },
284
+ {
285
+ "epoch": 0.167276529012023,
286
+ "grad_norm": 2.0993776321411133,
287
+ "learning_rate": 4.961701703858584e-05,
288
+ "loss": 1.3997,
289
+ "step": 200
290
+ },
291
+ {
292
+ "epoch": 0.17145844223732357,
293
+ "grad_norm": 2.210942506790161,
294
+ "learning_rate": 4.9597680692738056e-05,
295
+ "loss": 1.334,
296
+ "step": 205
297
+ },
298
+ {
299
+ "epoch": 0.17564035546262416,
300
+ "grad_norm": 2.1277225017547607,
301
+ "learning_rate": 4.957787211544935e-05,
302
+ "loss": 1.3338,
303
+ "step": 210
304
+ },
305
+ {
306
+ "epoch": 0.17982226868792472,
307
+ "grad_norm": 2.287722587585449,
308
+ "learning_rate": 4.9557591687008966e-05,
309
+ "loss": 1.3621,
310
+ "step": 215
311
+ },
312
+ {
313
+ "epoch": 0.1840041819132253,
314
+ "grad_norm": 2.113842010498047,
315
+ "learning_rate": 4.9536839796764825e-05,
316
+ "loss": 1.3808,
317
+ "step": 220
318
+ },
319
+ {
320
+ "epoch": 0.18818609513852588,
321
+ "grad_norm": 2.0642735958099365,
322
+ "learning_rate": 4.951561684311608e-05,
323
+ "loss": 1.3429,
324
+ "step": 225
325
+ },
326
+ {
327
+ "epoch": 0.19236800836382645,
328
+ "grad_norm": 2.2026379108428955,
329
+ "learning_rate": 4.9493923233505435e-05,
330
+ "loss": 1.2855,
331
+ "step": 230
332
+ },
333
+ {
334
+ "epoch": 0.196549921589127,
335
+ "grad_norm": 2.2843308448791504,
336
+ "learning_rate": 4.947175938441138e-05,
337
+ "loss": 1.3432,
338
+ "step": 235
339
+ },
340
+ {
341
+ "epoch": 0.2007318348144276,
342
+ "grad_norm": 2.126749038696289,
343
+ "learning_rate": 4.9449125721340145e-05,
344
+ "loss": 1.3753,
345
+ "step": 240
346
+ },
347
+ {
348
+ "epoch": 0.20491374803972817,
349
+ "grad_norm": 2.514599561691284,
350
+ "learning_rate": 4.942602267881755e-05,
351
+ "loss": 1.3101,
352
+ "step": 245
353
+ },
354
+ {
355
+ "epoch": 0.20909566126502874,
356
+ "grad_norm": 2.2043542861938477,
357
+ "learning_rate": 4.940245070038064e-05,
358
+ "loss": 1.3395,
359
+ "step": 250
360
+ },
361
+ {
362
+ "epoch": 0.21327757449032933,
363
+ "grad_norm": 2.152742862701416,
364
+ "learning_rate": 4.937841023856923e-05,
365
+ "loss": 1.3188,
366
+ "step": 255
367
+ },
368
+ {
369
+ "epoch": 0.2174594877156299,
370
+ "grad_norm": 2.221057415008545,
371
+ "learning_rate": 4.935390175491716e-05,
372
+ "loss": 1.4205,
373
+ "step": 260
374
+ },
375
+ {
376
+ "epoch": 0.2216414009409305,
377
+ "grad_norm": 2.600109100341797,
378
+ "learning_rate": 4.932892571994342e-05,
379
+ "loss": 1.3499,
380
+ "step": 265
381
+ },
382
+ {
383
+ "epoch": 0.22582331416623105,
384
+ "grad_norm": 2.2291276454925537,
385
+ "learning_rate": 4.9303482613143194e-05,
386
+ "loss": 1.3984,
387
+ "step": 270
388
+ },
389
+ {
390
+ "epoch": 0.23000522739153162,
391
+ "grad_norm": 2.338000774383545,
392
+ "learning_rate": 4.9277572922978586e-05,
393
+ "loss": 1.344,
394
+ "step": 275
395
+ },
396
+ {
397
+ "epoch": 0.2341871406168322,
398
+ "grad_norm": 2.3423988819122314,
399
+ "learning_rate": 4.925119714686928e-05,
400
+ "loss": 1.2696,
401
+ "step": 280
402
+ },
403
+ {
404
+ "epoch": 0.23836905384213278,
405
+ "grad_norm": 2.433870553970337,
406
+ "learning_rate": 4.9224355791182955e-05,
407
+ "loss": 1.3903,
408
+ "step": 285
409
+ },
410
+ {
411
+ "epoch": 0.24255096706743334,
412
+ "grad_norm": 2.281836986541748,
413
+ "learning_rate": 4.919704937122559e-05,
414
+ "loss": 1.3694,
415
+ "step": 290
416
+ },
417
+ {
418
+ "epoch": 0.24673288029273394,
419
+ "grad_norm": 2.2429168224334717,
420
+ "learning_rate": 4.916927841123159e-05,
421
+ "loss": 1.3541,
422
+ "step": 295
423
+ },
424
+ {
425
+ "epoch": 0.2509147935180345,
426
+ "grad_norm": 2.4559881687164307,
427
+ "learning_rate": 4.9141043444353674e-05,
428
+ "loss": 1.3795,
429
+ "step": 300
430
+ },
431
+ {
432
+ "epoch": 0.25509670674333507,
433
+ "grad_norm": 2.7529919147491455,
434
+ "learning_rate": 4.911234501265266e-05,
435
+ "loss": 1.386,
436
+ "step": 305
437
+ },
438
+ {
439
+ "epoch": 0.25927861996863566,
440
+ "grad_norm": 2.229973554611206,
441
+ "learning_rate": 4.9083183667087064e-05,
442
+ "loss": 1.3653,
443
+ "step": 310
444
+ },
445
+ {
446
+ "epoch": 0.26346053319393625,
447
+ "grad_norm": 2.38677716255188,
448
+ "learning_rate": 4.9053559967502535e-05,
449
+ "loss": 1.3254,
450
+ "step": 315
451
+ },
452
+ {
453
+ "epoch": 0.2676424464192368,
454
+ "grad_norm": 2.292898416519165,
455
+ "learning_rate": 4.9023474482621075e-05,
456
+ "loss": 1.3756,
457
+ "step": 320
458
+ },
459
+ {
460
+ "epoch": 0.2718243596445374,
461
+ "grad_norm": 2.08819317817688,
462
+ "learning_rate": 4.899292779003014e-05,
463
+ "loss": 1.3286,
464
+ "step": 325
465
+ },
466
+ {
467
+ "epoch": 0.276006272869838,
468
+ "grad_norm": 2.1810929775238037,
469
+ "learning_rate": 4.896192047617156e-05,
470
+ "loss": 1.3884,
471
+ "step": 330
472
+ },
473
+ {
474
+ "epoch": 0.2801881860951385,
475
+ "grad_norm": 2.2136948108673096,
476
+ "learning_rate": 4.893045313633025e-05,
477
+ "loss": 1.2723,
478
+ "step": 335
479
+ },
480
+ {
481
+ "epoch": 0.2843700993204391,
482
+ "grad_norm": 2.3611645698547363,
483
+ "learning_rate": 4.8898526374622815e-05,
484
+ "loss": 1.3758,
485
+ "step": 340
486
+ },
487
+ {
488
+ "epoch": 0.2885520125457397,
489
+ "grad_norm": 2.2939200401306152,
490
+ "learning_rate": 4.886614080398594e-05,
491
+ "loss": 1.3727,
492
+ "step": 345
493
+ },
494
+ {
495
+ "epoch": 0.29273392577104024,
496
+ "grad_norm": 2.2382090091705322,
497
+ "learning_rate": 4.8833297046164594e-05,
498
+ "loss": 1.3412,
499
+ "step": 350
500
+ },
501
+ {
502
+ "epoch": 0.29691583899634083,
503
+ "grad_norm": 2.0420024394989014,
504
+ "learning_rate": 4.8799995731700155e-05,
505
+ "loss": 1.3378,
506
+ "step": 355
507
+ },
508
+ {
509
+ "epoch": 0.3010977522216414,
510
+ "grad_norm": 2.180814266204834,
511
+ "learning_rate": 4.8766237499918244e-05,
512
+ "loss": 1.3305,
513
+ "step": 360
514
+ },
515
+ {
516
+ "epoch": 0.30527966544694196,
517
+ "grad_norm": 2.641951560974121,
518
+ "learning_rate": 4.873202299891649e-05,
519
+ "loss": 1.3084,
520
+ "step": 365
521
+ },
522
+ {
523
+ "epoch": 0.30946157867224255,
524
+ "grad_norm": 2.421492099761963,
525
+ "learning_rate": 4.8697352885552077e-05,
526
+ "loss": 1.3321,
527
+ "step": 370
528
+ },
529
+ {
530
+ "epoch": 0.31364349189754315,
531
+ "grad_norm": 2.1777994632720947,
532
+ "learning_rate": 4.866222782542912e-05,
533
+ "loss": 1.3605,
534
+ "step": 375
535
+ },
536
+ {
537
+ "epoch": 0.3178254051228437,
538
+ "grad_norm": 2.097909450531006,
539
+ "learning_rate": 4.862664849288589e-05,
540
+ "loss": 1.3786,
541
+ "step": 380
542
+ },
543
+ {
544
+ "epoch": 0.3220073183481443,
545
+ "grad_norm": 2.3216776847839355,
546
+ "learning_rate": 4.8590615570981904e-05,
547
+ "loss": 1.3467,
548
+ "step": 385
549
+ },
550
+ {
551
+ "epoch": 0.32618923157344487,
552
+ "grad_norm": 2.199545383453369,
553
+ "learning_rate": 4.855412975148475e-05,
554
+ "loss": 1.3534,
555
+ "step": 390
556
+ },
557
+ {
558
+ "epoch": 0.3303711447987454,
559
+ "grad_norm": 2.4035983085632324,
560
+ "learning_rate": 4.851719173485686e-05,
561
+ "loss": 1.3545,
562
+ "step": 395
563
+ },
564
+ {
565
+ "epoch": 0.334553058024046,
566
+ "grad_norm": 2.130056142807007,
567
+ "learning_rate": 4.847980223024205e-05,
568
+ "loss": 1.3574,
569
+ "step": 400
570
+ },
571
+ {
572
+ "epoch": 0.3387349712493466,
573
+ "grad_norm": 2.2835471630096436,
574
+ "learning_rate": 4.8441961955451865e-05,
575
+ "loss": 1.2835,
576
+ "step": 405
577
+ },
578
+ {
579
+ "epoch": 0.34291688447464713,
580
+ "grad_norm": 2.129136085510254,
581
+ "learning_rate": 4.840367163695186e-05,
582
+ "loss": 1.3066,
583
+ "step": 410
584
+ },
585
+ {
586
+ "epoch": 0.3470987976999477,
587
+ "grad_norm": 2.199307680130005,
588
+ "learning_rate": 4.8364932009847614e-05,
589
+ "loss": 1.3891,
590
+ "step": 415
591
+ },
592
+ {
593
+ "epoch": 0.3512807109252483,
594
+ "grad_norm": 2.240877866744995,
595
+ "learning_rate": 4.8325743817870614e-05,
596
+ "loss": 1.3131,
597
+ "step": 420
598
+ },
599
+ {
600
+ "epoch": 0.35546262415054886,
601
+ "grad_norm": 2.4342916011810303,
602
+ "learning_rate": 4.8286107813364015e-05,
603
+ "loss": 1.2903,
604
+ "step": 425
605
+ },
606
+ {
607
+ "epoch": 0.35964453737584945,
608
+ "grad_norm": 2.323018789291382,
609
+ "learning_rate": 4.824602475726815e-05,
610
+ "loss": 1.3016,
611
+ "step": 430
612
+ },
613
+ {
614
+ "epoch": 0.36382645060115004,
615
+ "grad_norm": 2.2159998416900635,
616
+ "learning_rate": 4.820549541910595e-05,
617
+ "loss": 1.3312,
618
+ "step": 435
619
+ },
620
+ {
621
+ "epoch": 0.3680083638264506,
622
+ "grad_norm": 2.654232978820801,
623
+ "learning_rate": 4.8164520576968165e-05,
624
+ "loss": 1.3793,
625
+ "step": 440
626
+ },
627
+ {
628
+ "epoch": 0.3721902770517512,
629
+ "grad_norm": 2.3524513244628906,
630
+ "learning_rate": 4.8123101017498416e-05,
631
+ "loss": 1.3406,
632
+ "step": 445
633
+ },
634
+ {
635
+ "epoch": 0.37637219027705177,
636
+ "grad_norm": 2.3215346336364746,
637
+ "learning_rate": 4.8081237535878116e-05,
638
+ "loss": 1.3997,
639
+ "step": 450
640
+ },
641
+ {
642
+ "epoch": 0.3805541035023523,
643
+ "grad_norm": 2.0394012928009033,
644
+ "learning_rate": 4.803893093581117e-05,
645
+ "loss": 1.3084,
646
+ "step": 455
647
+ },
648
+ {
649
+ "epoch": 0.3847360167276529,
650
+ "grad_norm": 2.251598596572876,
651
+ "learning_rate": 4.799618202950857e-05,
652
+ "loss": 1.3396,
653
+ "step": 460
654
+ },
655
+ {
656
+ "epoch": 0.3889179299529535,
657
+ "grad_norm": 2.4240410327911377,
658
+ "learning_rate": 4.795299163767282e-05,
659
+ "loss": 1.359,
660
+ "step": 465
661
+ },
662
+ {
663
+ "epoch": 0.393099843178254,
664
+ "grad_norm": 2.2180769443511963,
665
+ "learning_rate": 4.790936058948211e-05,
666
+ "loss": 1.3575,
667
+ "step": 470
668
+ },
669
+ {
670
+ "epoch": 0.3972817564035546,
671
+ "grad_norm": 2.2782742977142334,
672
+ "learning_rate": 4.786528972257449e-05,
673
+ "loss": 1.2915,
674
+ "step": 475
675
+ },
676
+ {
677
+ "epoch": 0.4014636696288552,
678
+ "grad_norm": 2.2193851470947266,
679
+ "learning_rate": 4.7820779883031696e-05,
680
+ "loss": 1.3185,
681
+ "step": 480
682
+ },
683
+ {
684
+ "epoch": 0.40564558285415575,
685
+ "grad_norm": 2.2422704696655273,
686
+ "learning_rate": 4.7775831925363e-05,
687
+ "loss": 1.2638,
688
+ "step": 485
689
+ },
690
+ {
691
+ "epoch": 0.40982749607945634,
692
+ "grad_norm": 2.430154323577881,
693
+ "learning_rate": 4.773044671248872e-05,
694
+ "loss": 1.3616,
695
+ "step": 490
696
+ },
697
+ {
698
+ "epoch": 0.41400940930475694,
699
+ "grad_norm": 2.2115910053253174,
700
+ "learning_rate": 4.768462511572371e-05,
701
+ "loss": 1.35,
702
+ "step": 495
703
+ },
704
+ {
705
+ "epoch": 0.4181913225300575,
706
+ "grad_norm": 2.063385486602783,
707
+ "learning_rate": 4.763836801476061e-05,
708
+ "loss": 1.2797,
709
+ "step": 500
710
+ },
711
+ {
712
+ "epoch": 0.42237323575535807,
713
+ "grad_norm": 2.3508410453796387,
714
+ "learning_rate": 4.759167629765297e-05,
715
+ "loss": 1.3173,
716
+ "step": 505
717
+ },
718
+ {
719
+ "epoch": 0.42655514898065866,
720
+ "grad_norm": 2.3848047256469727,
721
+ "learning_rate": 4.7544550860798177e-05,
722
+ "loss": 1.2502,
723
+ "step": 510
724
+ },
725
+ {
726
+ "epoch": 0.43073706220595925,
727
+ "grad_norm": 2.227524757385254,
728
+ "learning_rate": 4.749699260892026e-05,
729
+ "loss": 1.3174,
730
+ "step": 515
731
+ },
732
+ {
733
+ "epoch": 0.4349189754312598,
734
+ "grad_norm": 2.2080676555633545,
735
+ "learning_rate": 4.744900245505253e-05,
736
+ "loss": 1.2567,
737
+ "step": 520
738
+ },
739
+ {
740
+ "epoch": 0.4391008886565604,
741
+ "grad_norm": 2.2167842388153076,
742
+ "learning_rate": 4.7400581320520055e-05,
743
+ "loss": 1.2906,
744
+ "step": 525
745
+ },
746
+ {
747
+ "epoch": 0.443282801881861,
748
+ "grad_norm": 2.4347431659698486,
749
+ "learning_rate": 4.735173013492193e-05,
750
+ "loss": 1.3109,
751
+ "step": 530
752
+ },
753
+ {
754
+ "epoch": 0.4474647151071615,
755
+ "grad_norm": 2.024902105331421,
756
+ "learning_rate": 4.73024498361135e-05,
757
+ "loss": 1.3265,
758
+ "step": 535
759
+ },
760
+ {
761
+ "epoch": 0.4516466283324621,
762
+ "grad_norm": 2.1333351135253906,
763
+ "learning_rate": 4.725274137018826e-05,
764
+ "loss": 1.3426,
765
+ "step": 540
766
+ },
767
+ {
768
+ "epoch": 0.4558285415577627,
769
+ "grad_norm": 2.400423049926758,
770
+ "learning_rate": 4.720260569145981e-05,
771
+ "loss": 1.2751,
772
+ "step": 545
773
+ },
774
+ {
775
+ "epoch": 0.46001045478306324,
776
+ "grad_norm": 2.5796918869018555,
777
+ "learning_rate": 4.715204376244343e-05,
778
+ "loss": 1.3453,
779
+ "step": 550
780
+ },
781
+ {
782
+ "epoch": 0.46419236800836383,
783
+ "grad_norm": 2.339895486831665,
784
+ "learning_rate": 4.7101056553837665e-05,
785
+ "loss": 1.2667,
786
+ "step": 555
787
+ },
788
+ {
789
+ "epoch": 0.4683742812336644,
790
+ "grad_norm": 2.273618221282959,
791
+ "learning_rate": 4.704964504450563e-05,
792
+ "loss": 1.3264,
793
+ "step": 560
794
+ },
795
+ {
796
+ "epoch": 0.47255619445896496,
797
+ "grad_norm": 1.9942044019699097,
798
+ "learning_rate": 4.69978102214563e-05,
799
+ "loss": 1.2907,
800
+ "step": 565
801
+ },
802
+ {
803
+ "epoch": 0.47673810768426556,
804
+ "grad_norm": 2.1848580837249756,
805
+ "learning_rate": 4.694555307982551e-05,
806
+ "loss": 1.2921,
807
+ "step": 570
808
+ },
809
+ {
810
+ "epoch": 0.48092002090956615,
811
+ "grad_norm": 2.2914299964904785,
812
+ "learning_rate": 4.689287462285681e-05,
813
+ "loss": 1.3612,
814
+ "step": 575
815
+ },
816
+ {
817
+ "epoch": 0.4851019341348667,
818
+ "grad_norm": 2.1165218353271484,
819
+ "learning_rate": 4.6839775861882306e-05,
820
+ "loss": 1.3419,
821
+ "step": 580
822
+ },
823
+ {
824
+ "epoch": 0.4892838473601673,
825
+ "grad_norm": 2.003962516784668,
826
+ "learning_rate": 4.678625781630315e-05,
827
+ "loss": 1.388,
828
+ "step": 585
829
+ },
830
+ {
831
+ "epoch": 0.49346576058546787,
832
+ "grad_norm": 2.366339683532715,
833
+ "learning_rate": 4.673232151357004e-05,
834
+ "loss": 1.2895,
835
+ "step": 590
836
+ },
837
+ {
838
+ "epoch": 0.4976476738107684,
839
+ "grad_norm": 2.4352052211761475,
840
+ "learning_rate": 4.667796798916343e-05,
841
+ "loss": 1.3457,
842
+ "step": 595
843
+ },
844
+ {
845
+ "epoch": 0.501829587036069,
846
+ "grad_norm": 2.4221272468566895,
847
+ "learning_rate": 4.662319828657371e-05,
848
+ "loss": 1.3298,
849
+ "step": 600
850
+ },
851
+ {
852
+ "epoch": 0.5060115002613695,
853
+ "grad_norm": 2.2551608085632324,
854
+ "learning_rate": 4.6568013457281126e-05,
855
+ "loss": 1.2192,
856
+ "step": 605
857
+ },
858
+ {
859
+ "epoch": 0.5101934134866701,
860
+ "grad_norm": 2.4560656547546387,
861
+ "learning_rate": 4.651241456073563e-05,
862
+ "loss": 1.3086,
863
+ "step": 610
864
+ },
865
+ {
866
+ "epoch": 0.5143753267119707,
867
+ "grad_norm": 2.266500473022461,
868
+ "learning_rate": 4.645640266433651e-05,
869
+ "loss": 1.2608,
870
+ "step": 615
871
+ },
872
+ {
873
+ "epoch": 0.5185572399372713,
874
+ "grad_norm": 2.175009250640869,
875
+ "learning_rate": 4.639997884341192e-05,
876
+ "loss": 1.3459,
877
+ "step": 620
878
+ },
879
+ {
880
+ "epoch": 0.5227391531625719,
881
+ "grad_norm": 2.329770803451538,
882
+ "learning_rate": 4.634314418119823e-05,
883
+ "loss": 1.2894,
884
+ "step": 625
885
+ },
886
+ {
887
+ "epoch": 0.5269210663878725,
888
+ "grad_norm": 2.545764446258545,
889
+ "learning_rate": 4.628589976881923e-05,
890
+ "loss": 1.3213,
891
+ "step": 630
892
+ },
893
+ {
894
+ "epoch": 0.531102979613173,
895
+ "grad_norm": 2.2759323120117188,
896
+ "learning_rate": 4.622824670526516e-05,
897
+ "loss": 1.3362,
898
+ "step": 635
899
+ },
900
+ {
901
+ "epoch": 0.5352848928384736,
902
+ "grad_norm": 2.341280221939087,
903
+ "learning_rate": 4.617018609737166e-05,
904
+ "loss": 1.24,
905
+ "step": 640
906
+ },
907
+ {
908
+ "epoch": 0.5394668060637742,
909
+ "grad_norm": 2.353301763534546,
910
+ "learning_rate": 4.6111719059798466e-05,
911
+ "loss": 1.3643,
912
+ "step": 645
913
+ },
914
+ {
915
+ "epoch": 0.5436487192890748,
916
+ "grad_norm": 2.223389148712158,
917
+ "learning_rate": 4.605284671500805e-05,
918
+ "loss": 1.3569,
919
+ "step": 650
920
+ },
921
+ {
922
+ "epoch": 0.5478306325143754,
923
+ "grad_norm": 2.2959463596343994,
924
+ "learning_rate": 4.599357019324405e-05,
925
+ "loss": 1.3581,
926
+ "step": 655
927
+ },
928
+ {
929
+ "epoch": 0.552012545739676,
930
+ "grad_norm": 2.1995513439178467,
931
+ "learning_rate": 4.593389063250958e-05,
932
+ "loss": 1.2962,
933
+ "step": 660
934
+ },
935
+ {
936
+ "epoch": 0.5561944589649764,
937
+ "grad_norm": 1.9802824258804321,
938
+ "learning_rate": 4.5873809178545396e-05,
939
+ "loss": 1.3121,
940
+ "step": 665
941
+ },
942
+ {
943
+ "epoch": 0.560376372190277,
944
+ "grad_norm": 2.2561113834381104,
945
+ "learning_rate": 4.581332698480786e-05,
946
+ "loss": 1.2395,
947
+ "step": 670
948
+ },
949
+ {
950
+ "epoch": 0.5645582854155776,
951
+ "grad_norm": 2.1277997493743896,
952
+ "learning_rate": 4.5752445212446836e-05,
953
+ "loss": 1.2782,
954
+ "step": 675
955
+ },
956
+ {
957
+ "epoch": 0.5687401986408782,
958
+ "grad_norm": 2.1406946182250977,
959
+ "learning_rate": 4.569116503028339e-05,
960
+ "loss": 1.3248,
961
+ "step": 680
962
+ },
963
+ {
964
+ "epoch": 0.5729221118661788,
965
+ "grad_norm": 2.190600872039795,
966
+ "learning_rate": 4.5629487614787306e-05,
967
+ "loss": 1.2975,
968
+ "step": 685
969
+ },
970
+ {
971
+ "epoch": 0.5771040250914794,
972
+ "grad_norm": 2.1068780422210693,
973
+ "learning_rate": 4.556741415005459e-05,
974
+ "loss": 1.3191,
975
+ "step": 690
976
+ },
977
+ {
978
+ "epoch": 0.5812859383167799,
979
+ "grad_norm": 2.3496341705322266,
980
+ "learning_rate": 4.5504945827784634e-05,
981
+ "loss": 1.2827,
982
+ "step": 695
983
+ },
984
+ {
985
+ "epoch": 0.5854678515420805,
986
+ "grad_norm": 2.33406400680542,
987
+ "learning_rate": 4.544208384725742e-05,
988
+ "loss": 1.2649,
989
+ "step": 700
990
+ },
991
+ {
992
+ "epoch": 0.5896497647673811,
993
+ "grad_norm": 2.365461587905884,
994
+ "learning_rate": 4.5378829415310465e-05,
995
+ "loss": 1.3121,
996
+ "step": 705
997
+ },
998
+ {
999
+ "epoch": 0.5938316779926817,
1000
+ "grad_norm": 2.186875343322754,
1001
+ "learning_rate": 4.531518374631564e-05,
1002
+ "loss": 1.3113,
1003
+ "step": 710
1004
+ },
1005
+ {
1006
+ "epoch": 0.5980135912179823,
1007
+ "grad_norm": 2.1798176765441895,
1008
+ "learning_rate": 4.525114806215584e-05,
1009
+ "loss": 1.3446,
1010
+ "step": 715
1011
+ },
1012
+ {
1013
+ "epoch": 0.6021955044432828,
1014
+ "grad_norm": 1.9714925289154053,
1015
+ "learning_rate": 4.518672359220161e-05,
1016
+ "loss": 1.2979,
1017
+ "step": 720
1018
+ },
1019
+ {
1020
+ "epoch": 0.6063774176685833,
1021
+ "grad_norm": 2.1850643157958984,
1022
+ "learning_rate": 4.5121911573287446e-05,
1023
+ "loss": 1.2412,
1024
+ "step": 725
1025
+ },
1026
+ {
1027
+ "epoch": 0.6105593308938839,
1028
+ "grad_norm": 2.0708961486816406,
1029
+ "learning_rate": 4.505671324968811e-05,
1030
+ "loss": 1.2559,
1031
+ "step": 730
1032
+ },
1033
+ {
1034
+ "epoch": 0.6147412441191845,
1035
+ "grad_norm": 2.461225986480713,
1036
+ "learning_rate": 4.49911298730947e-05,
1037
+ "loss": 1.3696,
1038
+ "step": 735
1039
+ },
1040
+ {
1041
+ "epoch": 0.6189231573444851,
1042
+ "grad_norm": 2.2155981063842773,
1043
+ "learning_rate": 4.492516270259066e-05,
1044
+ "loss": 1.3459,
1045
+ "step": 740
1046
+ },
1047
+ {
1048
+ "epoch": 0.6231050705697857,
1049
+ "grad_norm": 2.0899972915649414,
1050
+ "learning_rate": 4.48588130046276e-05,
1051
+ "loss": 1.2885,
1052
+ "step": 745
1053
+ },
1054
+ {
1055
+ "epoch": 0.6272869837950863,
1056
+ "grad_norm": 2.2798969745635986,
1057
+ "learning_rate": 4.479208205300094e-05,
1058
+ "loss": 1.299,
1059
+ "step": 750
1060
+ },
1061
+ {
1062
+ "epoch": 0.6314688970203868,
1063
+ "grad_norm": 2.4617793560028076,
1064
+ "learning_rate": 4.472497112882552e-05,
1065
+ "loss": 1.2973,
1066
+ "step": 755
1067
+ },
1068
+ {
1069
+ "epoch": 0.6356508102456874,
1070
+ "grad_norm": 2.20271635055542,
1071
+ "learning_rate": 4.465748152051096e-05,
1072
+ "loss": 1.231,
1073
+ "step": 760
1074
+ },
1075
+ {
1076
+ "epoch": 0.639832723470988,
1077
+ "grad_norm": 1.990775227546692,
1078
+ "learning_rate": 4.458961452373692e-05,
1079
+ "loss": 1.2594,
1080
+ "step": 765
1081
+ },
1082
+ {
1083
+ "epoch": 0.6440146366962886,
1084
+ "grad_norm": 2.2898495197296143,
1085
+ "learning_rate": 4.4521371441428284e-05,
1086
+ "loss": 1.2721,
1087
+ "step": 770
1088
+ },
1089
+ {
1090
+ "epoch": 0.6481965499215891,
1091
+ "grad_norm": 2.282844066619873,
1092
+ "learning_rate": 4.445275358373006e-05,
1093
+ "loss": 1.2742,
1094
+ "step": 775
1095
+ },
1096
+ {
1097
+ "epoch": 0.6523784631468897,
1098
+ "grad_norm": 2.3958911895751953,
1099
+ "learning_rate": 4.438376226798231e-05,
1100
+ "loss": 1.2897,
1101
+ "step": 780
1102
+ },
1103
+ {
1104
+ "epoch": 0.6565603763721902,
1105
+ "grad_norm": 2.2609405517578125,
1106
+ "learning_rate": 4.43143988186948e-05,
1107
+ "loss": 1.2891,
1108
+ "step": 785
1109
+ },
1110
+ {
1111
+ "epoch": 0.6607422895974908,
1112
+ "grad_norm": 2.416956663131714,
1113
+ "learning_rate": 4.42446645675216e-05,
1114
+ "loss": 1.3711,
1115
+ "step": 790
1116
+ },
1117
+ {
1118
+ "epoch": 0.6649242028227914,
1119
+ "grad_norm": 2.280547857284546,
1120
+ "learning_rate": 4.4174560853235505e-05,
1121
+ "loss": 1.3413,
1122
+ "step": 795
1123
+ },
1124
+ {
1125
+ "epoch": 0.669106116048092,
1126
+ "grad_norm": 2.055990695953369,
1127
+ "learning_rate": 4.410408902170235e-05,
1128
+ "loss": 1.2829,
1129
+ "step": 800
1130
+ },
1131
+ {
1132
+ "epoch": 0.6732880292733926,
1133
+ "grad_norm": 2.1436378955841064,
1134
+ "learning_rate": 4.403325042585518e-05,
1135
+ "loss": 1.2223,
1136
+ "step": 805
1137
+ },
1138
+ {
1139
+ "epoch": 0.6774699424986932,
1140
+ "grad_norm": 2.1957428455352783,
1141
+ "learning_rate": 4.396204642566821e-05,
1142
+ "loss": 1.2923,
1143
+ "step": 810
1144
+ },
1145
+ {
1146
+ "epoch": 0.6816518557239938,
1147
+ "grad_norm": 2.1543030738830566,
1148
+ "learning_rate": 4.389047838813082e-05,
1149
+ "loss": 1.2296,
1150
+ "step": 815
1151
+ },
1152
+ {
1153
+ "epoch": 0.6858337689492943,
1154
+ "grad_norm": 2.1455068588256836,
1155
+ "learning_rate": 4.3818547687221204e-05,
1156
+ "loss": 1.2913,
1157
+ "step": 820
1158
+ },
1159
+ {
1160
+ "epoch": 0.6900156821745949,
1161
+ "grad_norm": 2.2354376316070557,
1162
+ "learning_rate": 4.374625570388008e-05,
1163
+ "loss": 1.2647,
1164
+ "step": 825
1165
+ },
1166
+ {
1167
+ "epoch": 0.6941975953998955,
1168
+ "grad_norm": 2.2896392345428467,
1169
+ "learning_rate": 4.367360382598413e-05,
1170
+ "loss": 1.329,
1171
+ "step": 830
1172
+ },
1173
+ {
1174
+ "epoch": 0.698379508625196,
1175
+ "grad_norm": 2.0567495822906494,
1176
+ "learning_rate": 4.360059344831936e-05,
1177
+ "loss": 1.2306,
1178
+ "step": 835
1179
+ },
1180
+ {
1181
+ "epoch": 0.7025614218504966,
1182
+ "grad_norm": 2.0631346702575684,
1183
+ "learning_rate": 4.352722597255434e-05,
1184
+ "loss": 1.2788,
1185
+ "step": 840
1186
+ },
1187
+ {
1188
+ "epoch": 0.7067433350757972,
1189
+ "grad_norm": 2.3736605644226074,
1190
+ "learning_rate": 4.345350280721328e-05,
1191
+ "loss": 1.3236,
1192
+ "step": 845
1193
+ },
1194
+ {
1195
+ "epoch": 0.7109252483010977,
1196
+ "grad_norm": 2.308960437774658,
1197
+ "learning_rate": 4.337942536764901e-05,
1198
+ "loss": 1.2993,
1199
+ "step": 850
1200
+ },
1201
+ {
1202
+ "epoch": 0.7151071615263983,
1203
+ "grad_norm": 2.230039358139038,
1204
+ "learning_rate": 4.330499507601575e-05,
1205
+ "loss": 1.2544,
1206
+ "step": 855
1207
+ },
1208
+ {
1209
+ "epoch": 0.7192890747516989,
1210
+ "grad_norm": 2.1853036880493164,
1211
+ "learning_rate": 4.3230213361241894e-05,
1212
+ "loss": 1.2829,
1213
+ "step": 860
1214
+ },
1215
+ {
1216
+ "epoch": 0.7234709879769995,
1217
+ "grad_norm": 2.2859535217285156,
1218
+ "learning_rate": 4.3155081659002506e-05,
1219
+ "loss": 1.3128,
1220
+ "step": 865
1221
+ },
1222
+ {
1223
+ "epoch": 0.7276529012023001,
1224
+ "grad_norm": 1.9847872257232666,
1225
+ "learning_rate": 4.3079601411691775e-05,
1226
+ "loss": 1.3126,
1227
+ "step": 870
1228
+ },
1229
+ {
1230
+ "epoch": 0.7318348144276007,
1231
+ "grad_norm": 2.3204383850097656,
1232
+ "learning_rate": 4.3003774068395355e-05,
1233
+ "loss": 1.3275,
1234
+ "step": 875
1235
+ },
1236
+ {
1237
+ "epoch": 0.7360167276529012,
1238
+ "grad_norm": 2.1831467151641846,
1239
+ "learning_rate": 4.292760108486251e-05,
1240
+ "loss": 1.3232,
1241
+ "step": 880
1242
+ },
1243
+ {
1244
+ "epoch": 0.7401986408782018,
1245
+ "grad_norm": 2.241342544555664,
1246
+ "learning_rate": 4.2851083923478186e-05,
1247
+ "loss": 1.2484,
1248
+ "step": 885
1249
+ },
1250
+ {
1251
+ "epoch": 0.7443805541035023,
1252
+ "grad_norm": 2.335024118423462,
1253
+ "learning_rate": 4.27742240532349e-05,
1254
+ "loss": 1.2466,
1255
+ "step": 890
1256
+ },
1257
+ {
1258
+ "epoch": 0.7485624673288029,
1259
+ "grad_norm": 2.592941999435425,
1260
+ "learning_rate": 4.269702294970461e-05,
1261
+ "loss": 1.3111,
1262
+ "step": 895
1263
+ },
1264
+ {
1265
+ "epoch": 0.7527443805541035,
1266
+ "grad_norm": 2.1964266300201416,
1267
+ "learning_rate": 4.26194820950103e-05,
1268
+ "loss": 1.3172,
1269
+ "step": 900
1270
+ },
1271
+ {
1272
+ "epoch": 0.7569262937794041,
1273
+ "grad_norm": 2.253929853439331,
1274
+ "learning_rate": 4.25416029777976e-05,
1275
+ "loss": 1.2977,
1276
+ "step": 905
1277
+ },
1278
+ {
1279
+ "epoch": 0.7611082070047046,
1280
+ "grad_norm": 2.3653616905212402,
1281
+ "learning_rate": 4.246338709320615e-05,
1282
+ "loss": 1.3397,
1283
+ "step": 910
1284
+ },
1285
+ {
1286
+ "epoch": 0.7652901202300052,
1287
+ "grad_norm": 1.9790107011795044,
1288
+ "learning_rate": 4.238483594284094e-05,
1289
+ "loss": 1.299,
1290
+ "step": 915
1291
+ },
1292
+ {
1293
+ "epoch": 0.7694720334553058,
1294
+ "grad_norm": 2.177802324295044,
1295
+ "learning_rate": 4.230595103474345e-05,
1296
+ "loss": 1.2083,
1297
+ "step": 920
1298
+ },
1299
+ {
1300
+ "epoch": 0.7736539466806064,
1301
+ "grad_norm": 2.267620325088501,
1302
+ "learning_rate": 4.222673388336272e-05,
1303
+ "loss": 1.2712,
1304
+ "step": 925
1305
+ },
1306
+ {
1307
+ "epoch": 0.777835859905907,
1308
+ "grad_norm": 2.323590040206909,
1309
+ "learning_rate": 4.214718600952627e-05,
1310
+ "loss": 1.2944,
1311
+ "step": 930
1312
+ },
1313
+ {
1314
+ "epoch": 0.7820177731312076,
1315
+ "grad_norm": 2.2184839248657227,
1316
+ "learning_rate": 4.2067308940410874e-05,
1317
+ "loss": 1.3033,
1318
+ "step": 935
1319
+ },
1320
+ {
1321
+ "epoch": 0.786199686356508,
1322
+ "grad_norm": 2.056938648223877,
1323
+ "learning_rate": 4.1987104209513295e-05,
1324
+ "loss": 1.2563,
1325
+ "step": 940
1326
+ },
1327
+ {
1328
+ "epoch": 0.7903815995818086,
1329
+ "grad_norm": 2.0813541412353516,
1330
+ "learning_rate": 4.1906573356620795e-05,
1331
+ "loss": 1.3169,
1332
+ "step": 945
1333
+ },
1334
+ {
1335
+ "epoch": 0.7945635128071092,
1336
+ "grad_norm": 2.2418994903564453,
1337
+ "learning_rate": 4.182571792778163e-05,
1338
+ "loss": 1.3366,
1339
+ "step": 950
1340
+ },
1341
+ {
1342
+ "epoch": 0.7987454260324098,
1343
+ "grad_norm": 2.1746504306793213,
1344
+ "learning_rate": 4.1744539475275276e-05,
1345
+ "loss": 1.2695,
1346
+ "step": 955
1347
+ },
1348
+ {
1349
+ "epoch": 0.8029273392577104,
1350
+ "grad_norm": 2.385899066925049,
1351
+ "learning_rate": 4.1663039557582725e-05,
1352
+ "loss": 1.275,
1353
+ "step": 960
1354
+ },
1355
+ {
1356
+ "epoch": 0.807109252483011,
1357
+ "grad_norm": 2.4620730876922607,
1358
+ "learning_rate": 4.158121973935653e-05,
1359
+ "loss": 1.3441,
1360
+ "step": 965
1361
+ },
1362
+ {
1363
+ "epoch": 0.8112911657083115,
1364
+ "grad_norm": 2.090711832046509,
1365
+ "learning_rate": 4.149908159139073e-05,
1366
+ "loss": 1.2754,
1367
+ "step": 970
1368
+ },
1369
+ {
1370
+ "epoch": 0.8154730789336121,
1371
+ "grad_norm": 2.367952346801758,
1372
+ "learning_rate": 4.141662669059076e-05,
1373
+ "loss": 1.2821,
1374
+ "step": 975
1375
+ },
1376
+ {
1377
+ "epoch": 0.8196549921589127,
1378
+ "grad_norm": 2.406475782394409,
1379
+ "learning_rate": 4.133385661994312e-05,
1380
+ "loss": 1.2917,
1381
+ "step": 980
1382
+ },
1383
+ {
1384
+ "epoch": 0.8238369053842133,
1385
+ "grad_norm": 2.0637738704681396,
1386
+ "learning_rate": 4.125077296848501e-05,
1387
+ "loss": 1.2582,
1388
+ "step": 985
1389
+ },
1390
+ {
1391
+ "epoch": 0.8280188186095139,
1392
+ "grad_norm": 2.3852176666259766,
1393
+ "learning_rate": 4.1167377331273825e-05,
1394
+ "loss": 1.2278,
1395
+ "step": 990
1396
+ },
1397
+ {
1398
+ "epoch": 0.8322007318348145,
1399
+ "grad_norm": 2.0237960815429688,
1400
+ "learning_rate": 4.1083671309356526e-05,
1401
+ "loss": 1.3461,
1402
+ "step": 995
1403
+ },
1404
+ {
1405
+ "epoch": 0.836382645060115,
1406
+ "grad_norm": 2.458031177520752,
1407
+ "learning_rate": 4.0999656509738904e-05,
1408
+ "loss": 1.2484,
1409
+ "step": 1000
1410
+ },
1411
+ {
1412
+ "epoch": 0.8405645582854155,
1413
+ "grad_norm": 2.3338541984558105,
1414
+ "learning_rate": 4.0915334545354734e-05,
1415
+ "loss": 1.2526,
1416
+ "step": 1005
1417
+ },
1418
+ {
1419
+ "epoch": 0.8447464715107161,
1420
+ "grad_norm": 2.289320230484009,
1421
+ "learning_rate": 4.0830707035034795e-05,
1422
+ "loss": 1.2643,
1423
+ "step": 1010
1424
+ },
1425
+ {
1426
+ "epoch": 0.8489283847360167,
1427
+ "grad_norm": 2.461848735809326,
1428
+ "learning_rate": 4.074577560347581e-05,
1429
+ "loss": 1.29,
1430
+ "step": 1015
1431
+ },
1432
+ {
1433
+ "epoch": 0.8531102979613173,
1434
+ "grad_norm": 2.458249568939209,
1435
+ "learning_rate": 4.066054188120924e-05,
1436
+ "loss": 1.2673,
1437
+ "step": 1020
1438
+ },
1439
+ {
1440
+ "epoch": 0.8572922111866179,
1441
+ "grad_norm": 2.21874737739563,
1442
+ "learning_rate": 4.0575007504569994e-05,
1443
+ "loss": 1.3085,
1444
+ "step": 1025
1445
+ },
1446
+ {
1447
+ "epoch": 0.8614741244119185,
1448
+ "grad_norm": 2.275045871734619,
1449
+ "learning_rate": 4.0489174115665006e-05,
1450
+ "loss": 1.2335,
1451
+ "step": 1030
1452
+ },
1453
+ {
1454
+ "epoch": 0.865656037637219,
1455
+ "grad_norm": 2.1374430656433105,
1456
+ "learning_rate": 4.04030433623417e-05,
1457
+ "loss": 1.2699,
1458
+ "step": 1035
1459
+ },
1460
+ {
1461
+ "epoch": 0.8698379508625196,
1462
+ "grad_norm": 2.3316330909729004,
1463
+ "learning_rate": 4.031661689815637e-05,
1464
+ "loss": 1.2847,
1465
+ "step": 1040
1466
+ },
1467
+ {
1468
+ "epoch": 0.8740198640878202,
1469
+ "grad_norm": 2.3386001586914062,
1470
+ "learning_rate": 4.022989638234243e-05,
1471
+ "loss": 1.3394,
1472
+ "step": 1045
1473
+ },
1474
+ {
1475
+ "epoch": 0.8782017773131208,
1476
+ "grad_norm": 2.4343066215515137,
1477
+ "learning_rate": 4.0142883479778555e-05,
1478
+ "loss": 1.2541,
1479
+ "step": 1050
1480
+ },
1481
+ {
1482
+ "epoch": 0.8823836905384214,
1483
+ "grad_norm": 2.1589419841766357,
1484
+ "learning_rate": 4.005557986095673e-05,
1485
+ "loss": 1.3327,
1486
+ "step": 1055
1487
+ },
1488
+ {
1489
+ "epoch": 0.886565603763722,
1490
+ "grad_norm": 2.2223572731018066,
1491
+ "learning_rate": 3.996798720195018e-05,
1492
+ "loss": 1.3057,
1493
+ "step": 1060
1494
+ },
1495
+ {
1496
+ "epoch": 0.8907475169890224,
1497
+ "grad_norm": 2.5740420818328857,
1498
+ "learning_rate": 3.988010718438115e-05,
1499
+ "loss": 1.2945,
1500
+ "step": 1065
1501
+ },
1502
+ {
1503
+ "epoch": 0.894929430214323,
1504
+ "grad_norm": 2.214611053466797,
1505
+ "learning_rate": 3.9791941495388696e-05,
1506
+ "loss": 1.2855,
1507
+ "step": 1070
1508
+ },
1509
+ {
1510
+ "epoch": 0.8991113434396236,
1511
+ "grad_norm": 2.21602463722229,
1512
+ "learning_rate": 3.970349182759623e-05,
1513
+ "loss": 1.3271,
1514
+ "step": 1075
1515
+ },
1516
+ {
1517
+ "epoch": 0.9032932566649242,
1518
+ "grad_norm": 2.2032415866851807,
1519
+ "learning_rate": 3.9614759879079057e-05,
1520
+ "loss": 1.2027,
1521
+ "step": 1080
1522
+ },
1523
+ {
1524
+ "epoch": 0.9074751698902248,
1525
+ "grad_norm": 2.518728017807007,
1526
+ "learning_rate": 3.9525747353331746e-05,
1527
+ "loss": 1.2886,
1528
+ "step": 1085
1529
+ },
1530
+ {
1531
+ "epoch": 0.9116570831155254,
1532
+ "grad_norm": 2.438845157623291,
1533
+ "learning_rate": 3.943645595923548e-05,
1534
+ "loss": 1.2054,
1535
+ "step": 1090
1536
+ },
1537
+ {
1538
+ "epoch": 0.9158389963408259,
1539
+ "grad_norm": 2.460658073425293,
1540
+ "learning_rate": 3.934688741102521e-05,
1541
+ "loss": 1.2868,
1542
+ "step": 1095
1543
+ },
1544
+ {
1545
+ "epoch": 0.9200209095661265,
1546
+ "grad_norm": 2.09260892868042,
1547
+ "learning_rate": 3.925704342825671e-05,
1548
+ "loss": 1.3027,
1549
+ "step": 1100
1550
+ },
1551
+ {
1552
+ "epoch": 0.9242028227914271,
1553
+ "grad_norm": 2.4112346172332764,
1554
+ "learning_rate": 3.916692573577366e-05,
1555
+ "loss": 1.2995,
1556
+ "step": 1105
1557
+ },
1558
+ {
1559
+ "epoch": 0.9283847360167277,
1560
+ "grad_norm": 2.157417058944702,
1561
+ "learning_rate": 3.907653606367444e-05,
1562
+ "loss": 1.2915,
1563
+ "step": 1110
1564
+ },
1565
+ {
1566
+ "epoch": 0.9325666492420283,
1567
+ "grad_norm": 2.2395179271698,
1568
+ "learning_rate": 3.898587614727896e-05,
1569
+ "loss": 1.3126,
1570
+ "step": 1115
1571
+ },
1572
+ {
1573
+ "epoch": 0.9367485624673288,
1574
+ "grad_norm": 2.457878589630127,
1575
+ "learning_rate": 3.889494772709534e-05,
1576
+ "loss": 1.2875,
1577
+ "step": 1120
1578
+ },
1579
+ {
1580
+ "epoch": 0.9409304756926293,
1581
+ "grad_norm": 2.3669779300689697,
1582
+ "learning_rate": 3.880375254878649e-05,
1583
+ "loss": 1.2374,
1584
+ "step": 1125
1585
+ },
1586
+ {
1587
+ "epoch": 0.9451123889179299,
1588
+ "grad_norm": 2.3185997009277344,
1589
+ "learning_rate": 3.87122923631366e-05,
1590
+ "loss": 1.1883,
1591
+ "step": 1130
1592
+ },
1593
+ {
1594
+ "epoch": 0.9492943021432305,
1595
+ "grad_norm": 2.198770046234131,
1596
+ "learning_rate": 3.862056892601753e-05,
1597
+ "loss": 1.3516,
1598
+ "step": 1135
1599
+ },
1600
+ {
1601
+ "epoch": 0.9534762153685311,
1602
+ "grad_norm": 2.344870090484619,
1603
+ "learning_rate": 3.8528583998355094e-05,
1604
+ "loss": 1.3012,
1605
+ "step": 1140
1606
+ },
1607
+ {
1608
+ "epoch": 0.9576581285938317,
1609
+ "grad_norm": 2.1669209003448486,
1610
+ "learning_rate": 3.843633934609526e-05,
1611
+ "loss": 1.2588,
1612
+ "step": 1145
1613
+ },
1614
+ {
1615
+ "epoch": 0.9618400418191323,
1616
+ "grad_norm": 2.242072820663452,
1617
+ "learning_rate": 3.8343836740170216e-05,
1618
+ "loss": 1.3303,
1619
+ "step": 1150
1620
+ },
1621
+ {
1622
+ "epoch": 0.9660219550444328,
1623
+ "grad_norm": 2.9194626808166504,
1624
+ "learning_rate": 3.825107795646444e-05,
1625
+ "loss": 1.3177,
1626
+ "step": 1155
1627
+ },
1628
+ {
1629
+ "epoch": 0.9702038682697334,
1630
+ "grad_norm": 2.298553943634033,
1631
+ "learning_rate": 3.8158064775780554e-05,
1632
+ "loss": 1.2528,
1633
+ "step": 1160
1634
+ },
1635
+ {
1636
+ "epoch": 0.974385781495034,
1637
+ "grad_norm": 2.338935375213623,
1638
+ "learning_rate": 3.806479898380512e-05,
1639
+ "loss": 1.301,
1640
+ "step": 1165
1641
+ },
1642
+ {
1643
+ "epoch": 0.9785676947203346,
1644
+ "grad_norm": 2.299428701400757,
1645
+ "learning_rate": 3.797128237107441e-05,
1646
+ "loss": 1.2752,
1647
+ "step": 1170
1648
+ },
1649
+ {
1650
+ "epoch": 0.9827496079456352,
1651
+ "grad_norm": 2.338583469390869,
1652
+ "learning_rate": 3.787751673294001e-05,
1653
+ "loss": 1.2573,
1654
+ "step": 1175
1655
+ },
1656
+ {
1657
+ "epoch": 0.9869315211709357,
1658
+ "grad_norm": 2.1797266006469727,
1659
+ "learning_rate": 3.7783503869534327e-05,
1660
+ "loss": 1.2752,
1661
+ "step": 1180
1662
+ },
1663
+ {
1664
+ "epoch": 0.9911134343962362,
1665
+ "grad_norm": 2.1566450595855713,
1666
+ "learning_rate": 3.768924558573606e-05,
1667
+ "loss": 1.2867,
1668
+ "step": 1185
1669
+ },
1670
+ {
1671
+ "epoch": 0.9952953476215368,
1672
+ "grad_norm": 2.3432397842407227,
1673
+ "learning_rate": 3.759474369113555e-05,
1674
+ "loss": 1.3053,
1675
+ "step": 1190
1676
+ },
1677
+ {
1678
+ "epoch": 0.9994772608468374,
1679
+ "grad_norm": 2.2833595275878906,
1680
+ "learning_rate": 3.7500000000000003e-05,
1681
+ "loss": 1.2817,
1682
+ "step": 1195
1683
+ },
1684
+ {
1685
+ "epoch": 1.003659174072138,
1686
+ "grad_norm": 2.286717176437378,
1687
+ "learning_rate": 3.740501633123872e-05,
1688
+ "loss": 1.2423,
1689
+ "step": 1200
1690
+ }
1691
+ ],
1692
+ "logging_steps": 5,
1693
+ "max_steps": 3585,
1694
+ "num_input_tokens_seen": 0,
1695
+ "num_train_epochs": 3,
1696
+ "save_steps": 100,
1697
+ "stateful_callbacks": {
1698
+ "TrainerControl": {
1699
+ "args": {
1700
+ "should_epoch_stop": false,
1701
+ "should_evaluate": false,
1702
+ "should_log": false,
1703
+ "should_save": true,
1704
+ "should_training_stop": false
1705
+ },
1706
+ "attributes": {}
1707
+ }
1708
+ },
1709
+ "total_flos": 1.5544326089533686e+18,
1710
+ "train_batch_size": 2,
1711
+ "trial_name": null,
1712
+ "trial_params": null
1713
+ }
checkpoint-1200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ad489884f87ec96cc1e49b25622db5c6c3c1eafcad1be5306265a7460b6619a
3
+ size 5304
checkpoint-1300/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: mistralai/Mistral-7B-Instruct-v0.3
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1