Upload 1_Bit_LLM_Pretraining.ipynb
Browse files- 1_Bit_LLM_Pretraining.ipynb +217 -522
1_Bit_LLM_Pretraining.ipynb
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"metadata": {
|
7 |
"id": "dbsnrDKKVarI",
|
8 |
"colab": {
|
@@ -123,8 +123,8 @@
|
|
123 |
},
|
124 |
"outputId": "5808189b-e624-42d7-856f-bc3b0201fab9",
|
125 |
"ExecuteTime": {
|
126 |
-
"end_time": "2024-04-
|
127 |
-
"start_time": "2024-04-
|
128 |
}
|
129 |
},
|
130 |
"outputs": [
|
@@ -133,23 +133,9 @@
|
|
133 |
"output_type": "stream",
|
134 |
"text": [
|
135 |
"Requirement already satisfied: datasets in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (2.18.0)\n",
|
136 |
-
"Requirement already satisfied: wandb in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (0.16.6)\n"
|
137 |
-
]
|
138 |
-
},
|
139 |
-
{
|
140 |
-
"name": "stderr",
|
141 |
-
"output_type": "stream",
|
142 |
-
"text": [
|
143 |
-
"wandb: WARNING Calling wandb.login() after wandb.init() has no effect.\n"
|
144 |
-
]
|
145 |
-
},
|
146 |
-
{
|
147 |
-
"name": "stdout",
|
148 |
-
"output_type": "stream",
|
149 |
-
"text": [
|
150 |
"Requirement already satisfied: accelerate in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (0.28.0)\n",
|
151 |
-
"
|
152 |
-
"\n",
|
153 |
"Requirement already satisfied: numpy>=1.17 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from datasets) (1.23.5)\n",
|
154 |
"Requirement already satisfied: pyarrow>=12.0.0 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from datasets) (15.0.2)\n",
|
155 |
"Requirement already satisfied: pyarrow-hotfix in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from datasets) (0.6)\n",
|
@@ -197,7 +183,24 @@
|
|
197 |
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from pandas->datasets) (2024.1)\n",
|
198 |
"Requirement already satisfied: smmap<6,>=3.0.1 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.1)\n",
|
199 |
"Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.3)\n",
|
200 |
-
"Requirement already satisfied: mpmath>=0.19 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
"Token is valid (permission: write).\n",
|
202 |
"Your token has been saved to C:\\Users\\saad.naeem\\.cache\\huggingface\\token\n",
|
203 |
"Login successful\n"
|
@@ -319,11 +322,11 @@
|
|
319 |
"metadata": {
|
320 |
"collapsed": false,
|
321 |
"ExecuteTime": {
|
322 |
-
"end_time": "2024-04-
|
323 |
-
"start_time": "2024-04-
|
324 |
}
|
325 |
},
|
326 |
-
"execution_count":
|
327 |
},
|
328 |
{
|
329 |
"cell_type": "code",
|
@@ -332,7 +335,7 @@
|
|
332 |
"data": {
|
333 |
"text/plain": "DatasetDict({\n train: Dataset({\n features: ['input_ids'],\n num_rows: 476702\n })\n})"
|
334 |
},
|
335 |
-
"execution_count":
|
336 |
"metadata": {},
|
337 |
"output_type": "execute_result"
|
338 |
}
|
@@ -343,11 +346,11 @@
|
|
343 |
"metadata": {
|
344 |
"collapsed": false,
|
345 |
"ExecuteTime": {
|
346 |
-
"end_time": "2024-04-
|
347 |
-
"start_time": "2024-04-
|
348 |
}
|
349 |
},
|
350 |
-
"execution_count":
|
351 |
},
|
352 |
{
|
353 |
"cell_type": "code",
|
@@ -361,17 +364,17 @@
|
|
361 |
"metadata": {
|
362 |
"collapsed": false,
|
363 |
"ExecuteTime": {
|
364 |
-
"end_time": "2024-04-
|
365 |
-
"start_time": "2024-04-
|
366 |
}
|
367 |
},
|
368 |
-
"execution_count":
|
369 |
},
|
370 |
{
|
371 |
"cell_type": "code",
|
372 |
"outputs": [],
|
373 |
"source": [
|
374 |
-
"sampled_dataset = tokenized_data['train'].select(range(
|
375 |
"sampled_dataset_dict = DatasetDict({\n",
|
376 |
" 'train': sampled_dataset\n",
|
377 |
"})"
|
@@ -379,11 +382,11 @@
|
|
379 |
"metadata": {
|
380 |
"collapsed": false,
|
381 |
"ExecuteTime": {
|
382 |
-
"end_time": "2024-04-
|
383 |
-
"start_time": "2024-04-
|
384 |
}
|
385 |
},
|
386 |
-
"execution_count":
|
387 |
},
|
388 |
{
|
389 |
"cell_type": "code",
|
@@ -393,7 +396,7 @@
|
|
393 |
"text/plain": " input_ids\n0 [1, 2266, 338, 385, 6597, 515, 263, 24499, 299...",
|
394 |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>input_ids</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>[1, 2266, 338, 385, 6597, 515, 263, 24499, 299...</td>\n </tr>\n </tbody>\n</table>\n</div>"
|
395 |
},
|
396 |
-
"execution_count":
|
397 |
"metadata": {},
|
398 |
"output_type": "execute_result"
|
399 |
}
|
@@ -404,20 +407,20 @@
|
|
404 |
"metadata": {
|
405 |
"collapsed": false,
|
406 |
"ExecuteTime": {
|
407 |
-
"end_time": "2024-04-
|
408 |
-
"start_time": "2024-04-
|
409 |
}
|
410 |
},
|
411 |
-
"execution_count":
|
412 |
},
|
413 |
{
|
414 |
"cell_type": "code",
|
415 |
"outputs": [
|
416 |
{
|
417 |
"data": {
|
418 |
-
"text/plain": "DatasetDict({\n train: Dataset({\n features: ['input_ids'],\n num_rows:
|
419 |
},
|
420 |
-
"execution_count":
|
421 |
"metadata": {},
|
422 |
"output_type": "execute_result"
|
423 |
}
|
@@ -429,11 +432,11 @@
|
|
429 |
"metadata": {
|
430 |
"collapsed": false,
|
431 |
"ExecuteTime": {
|
432 |
-
"end_time": "2024-04-
|
433 |
-
"start_time": "2024-04-
|
434 |
}
|
435 |
},
|
436 |
-
"execution_count":
|
437 |
},
|
438 |
{
|
439 |
"cell_type": "code",
|
@@ -442,7 +445,7 @@
|
|
442 |
"name": "stdout",
|
443 |
"output_type": "stream",
|
444 |
"text": [
|
445 |
-
"Training on
|
446 |
"Model size: 77.5M parameters\n"
|
447 |
]
|
448 |
},
|
@@ -511,28 +514,80 @@
|
|
511 |
"metadata": {
|
512 |
"collapsed": false,
|
513 |
"ExecuteTime": {
|
514 |
-
"end_time": "2024-04-
|
515 |
-
"start_time": "2024-04-
|
516 |
}
|
517 |
},
|
518 |
-
"execution_count":
|
519 |
},
|
520 |
{
|
521 |
"cell_type": "code",
|
522 |
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
523 |
{
|
524 |
"data": {
|
525 |
"text/plain": "<IPython.core.display.HTML object>",
|
526 |
-
"text/html": "\n <div>\n \n <progress value='
|
527 |
},
|
528 |
"metadata": {},
|
529 |
"output_type": "display_data"
|
530 |
},
|
531 |
{
|
532 |
"data": {
|
533 |
-
"text/plain": "TrainOutput(global_step=
|
534 |
},
|
535 |
-
"execution_count":
|
536 |
"metadata": {},
|
537 |
"output_type": "execute_result"
|
538 |
}
|
@@ -543,17 +598,17 @@
|
|
543 |
"metadata": {
|
544 |
"collapsed": false,
|
545 |
"ExecuteTime": {
|
546 |
-
"end_time": "2024-04-
|
547 |
-
"start_time": "2024-04-
|
548 |
}
|
549 |
},
|
550 |
-
"execution_count":
|
551 |
},
|
552 |
{
|
553 |
"cell_type": "code",
|
554 |
"source": [
|
555 |
-
"trainer.save_model(f\"{output_path}\")\n",
|
556 |
-
"folder = f\"{output_path}\"\n",
|
557 |
"api = HfApi()\n",
|
558 |
"create_repo(\n",
|
559 |
" repo_id = f\"{HUGGINGFACE_ID}/{NEW_MODEL}\",\n",
|
@@ -622,68 +677,20 @@
|
|
622 |
},
|
623 |
"id": "mnHZU06l5tG3",
|
624 |
"outputId": "bfa63618-ae11-4415-a695-0349dfecf4ad",
|
625 |
-
"is_executing": true,
|
626 |
"ExecuteTime": {
|
627 |
-
"
|
|
|
628 |
}
|
629 |
},
|
630 |
-
"execution_count":
|
631 |
"outputs": [
|
632 |
-
{
|
633 |
-
"data": {
|
634 |
-
"text/plain": "Upload 9 LFS files: 0%| | 0/9 [00:00<?, ?it/s]",
|
635 |
-
"application/vnd.jupyter.widget-view+json": {
|
636 |
-
"version_major": 2,
|
637 |
-
"version_minor": 0,
|
638 |
-
"model_id": "d671df963340448d8c4c0e5c55a09504"
|
639 |
-
}
|
640 |
-
},
|
641 |
-
"metadata": {},
|
642 |
-
"output_type": "display_data"
|
643 |
-
},
|
644 |
-
{
|
645 |
-
"data": {
|
646 |
-
"text/plain": "optimizer.pt: 0%| | 0.00/620M [00:00<?, ?B/s]",
|
647 |
-
"application/vnd.jupyter.widget-view+json": {
|
648 |
-
"version_major": 2,
|
649 |
-
"version_minor": 0,
|
650 |
-
"model_id": "778d8e2e14334fe4ad0bd0346c71a88c"
|
651 |
-
}
|
652 |
-
},
|
653 |
-
"metadata": {},
|
654 |
-
"output_type": "display_data"
|
655 |
-
},
|
656 |
-
{
|
657 |
-
"data": {
|
658 |
-
"text/plain": "optimizer.pt: 0%| | 0.00/620M [00:00<?, ?B/s]",
|
659 |
-
"application/vnd.jupyter.widget-view+json": {
|
660 |
-
"version_major": 2,
|
661 |
-
"version_minor": 0,
|
662 |
-
"model_id": "d8d15dda98a54be6acca1bb882bf125f"
|
663 |
-
}
|
664 |
-
},
|
665 |
-
"metadata": {},
|
666 |
-
"output_type": "display_data"
|
667 |
-
},
|
668 |
-
{
|
669 |
-
"data": {
|
670 |
-
"text/plain": "model.safetensors: 0%| | 0.00/310M [00:00<?, ?B/s]",
|
671 |
-
"application/vnd.jupyter.widget-view+json": {
|
672 |
-
"version_major": 2,
|
673 |
-
"version_minor": 0,
|
674 |
-
"model_id": "8c71d439fb7540f99a19cc05a4aaa3bd"
|
675 |
-
}
|
676 |
-
},
|
677 |
-
"metadata": {},
|
678 |
-
"output_type": "display_data"
|
679 |
-
},
|
680 |
{
|
681 |
"data": {
|
682 |
"text/plain": "model.safetensors: 0%| | 0.00/310M [00:00<?, ?B/s]",
|
683 |
"application/vnd.jupyter.widget-view+json": {
|
684 |
"version_major": 2,
|
685 |
"version_minor": 0,
|
686 |
-
"model_id": "
|
687 |
}
|
688 |
},
|
689 |
"metadata": {},
|
@@ -691,15 +698,11 @@
|
|
691 |
},
|
692 |
{
|
693 |
"data": {
|
694 |
-
"text/plain": "
|
695 |
-
"application/vnd.jupyter.widget-view+json": {
|
696 |
-
"version_major": 2,
|
697 |
-
"version_minor": 0,
|
698 |
-
"model_id": "981fb82e520e414fac786f4a2c3192f6"
|
699 |
-
}
|
700 |
},
|
|
|
701 |
"metadata": {},
|
702 |
-
"output_type": "
|
703 |
}
|
704 |
]
|
705 |
},
|
@@ -770,327 +773,42 @@
|
|
770 |
"height": 107
|
771 |
},
|
772 |
"id": "wtB3ZOBB_8E6",
|
773 |
-
"outputId": "39e3df74-5ade-4ff1-e997-28042e178dde"
|
774 |
-
},
|
775 |
-
"execution_count": null,
|
776 |
-
"outputs": [
|
777 |
-
{
|
778 |
-
"output_type": "stream",
|
779 |
-
"name": "stderr",
|
780 |
-
"text": [
|
781 |
-
"Some weights of LlamaForCausalLM were not initialized from the model checkpoint at abideen/Bitnet-Llama-70M and are newly initialized: ['model.layers.0.input_layernorm.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.2.input_layernorm.weight', 'model.layers.3.input_layernorm.weight', 'model.layers.4.input_layernorm.weight', 'model.layers.5.input_layernorm.weight']\n",
|
782 |
-
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
783 |
-
]
|
784 |
-
},
|
785 |
-
{
|
786 |
-
"output_type": "execute_result",
|
787 |
-
"data": {
|
788 |
-
"text/plain": [
|
789 |
-
"'What is Machine Learning?\\n\\nIn today’s digital age, machine learning has become a crucial aspect of our lives. With the increasing popularity of machine learning, machine learning has become a powerful tool for learning and learning. With the'"
|
790 |
-
],
|
791 |
-
"application/vnd.google.colaboratory.intrinsic+json": {
|
792 |
-
"type": "string"
|
793 |
-
}
|
794 |
-
},
|
795 |
-
"metadata": {},
|
796 |
-
"execution_count": 6
|
797 |
-
}
|
798 |
-
]
|
799 |
-
},
|
800 |
-
{
|
801 |
-
"cell_type": "code",
|
802 |
-
"source": [
|
803 |
-
"prompt = \"Write a short poem\"\n",
|
804 |
-
"inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
|
805 |
-
"generate_ids = model.generate(inputs.input_ids, max_length=50)\n",
|
806 |
-
"tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
|
807 |
-
],
|
808 |
-
"metadata": {
|
809 |
-
"id": "nQya_hPJEa2M",
|
810 |
"ExecuteTime": {
|
811 |
-
"end_time": "2024-04-
|
812 |
-
"start_time": "2024-04-
|
813 |
}
|
814 |
},
|
815 |
-
"execution_count":
|
816 |
"outputs": [
|
817 |
{
|
818 |
"data": {
|
819 |
-
"text/plain": "
|
820 |
-
},
|
821 |
-
"execution_count": 4,
|
822 |
-
"metadata": {},
|
823 |
-
"output_type": "execute_result"
|
824 |
-
}
|
825 |
-
]
|
826 |
-
},
|
827 |
-
{
|
828 |
-
"cell_type": "code",
|
829 |
-
"outputs": [
|
830 |
-
{
|
831 |
-
"name": "stdout",
|
832 |
-
"output_type": "stream",
|
833 |
-
"text": [
|
834 |
-
"LlamaForCausalLM(\n",
|
835 |
-
" (model): LlamaModel(\n",
|
836 |
-
" (embed_tokens): Embedding(32001, 768, padding_idx=0)\n",
|
837 |
-
" (layers): ModuleList(\n",
|
838 |
-
" (0-5): 6 x LlamaDecoderLayer(\n",
|
839 |
-
" (self_attn): LlamaAttention(\n",
|
840 |
-
" (q_proj): Linear(in_features=768, out_features=768, bias=False)\n",
|
841 |
-
" (k_proj): Linear(in_features=768, out_features=768, bias=False)\n",
|
842 |
-
" (v_proj): Linear(in_features=768, out_features=768, bias=False)\n",
|
843 |
-
" (o_proj): Linear(in_features=768, out_features=768, bias=False)\n",
|
844 |
-
" (rotary_emb): LlamaRotaryEmbedding()\n",
|
845 |
-
" )\n",
|
846 |
-
" (mlp): LlamaMLP(\n",
|
847 |
-
" (gate_proj): BitLinear(in_features=768, out_features=1024, bias=False)\n",
|
848 |
-
" (up_proj): BitLinear(in_features=768, out_features=1024, bias=False)\n",
|
849 |
-
" (down_proj): BitLinear(in_features=1024, out_features=768, bias=False)\n",
|
850 |
-
" (act_fn): SiLU()\n",
|
851 |
-
" )\n",
|
852 |
-
" (input_layernorm): Identity()\n",
|
853 |
-
" (post_attention_layernorm): LlamaRMSNorm()\n",
|
854 |
-
" )\n",
|
855 |
-
" )\n",
|
856 |
-
" (norm): LlamaRMSNorm()\n",
|
857 |
-
" )\n",
|
858 |
-
" (lm_head): Linear(in_features=768, out_features=32001, bias=False)\n",
|
859 |
-
")\n"
|
860 |
-
]
|
861 |
-
}
|
862 |
-
],
|
863 |
-
"source": [
|
864 |
-
"print(model)"
|
865 |
-
],
|
866 |
-
"metadata": {
|
867 |
-
"collapsed": false,
|
868 |
-
"ExecuteTime": {
|
869 |
-
"end_time": "2024-04-16T19:10:30.619003Z",
|
870 |
-
"start_time": "2024-04-16T19:10:30.598004Z"
|
871 |
-
}
|
872 |
-
},
|
873 |
-
"execution_count": 5
|
874 |
-
},
|
875 |
-
{
|
876 |
-
"cell_type": "code",
|
877 |
-
"outputs": [
|
878 |
-
{
|
879 |
-
"name": "stdout",
|
880 |
-
"output_type": "stream",
|
881 |
-
"text": [
|
882 |
-
"Model size: 77.5M parameters\n"
|
883 |
-
]
|
884 |
-
}
|
885 |
-
],
|
886 |
-
"source": [
|
887 |
-
"# print number of parameters\n",
|
888 |
-
"model_size = sum(t.numel() for t in model.parameters())\n",
|
889 |
-
"print(f\"Model size: {model_size/1000**2:.1f}M parameters\")"
|
890 |
-
],
|
891 |
-
"metadata": {
|
892 |
-
"collapsed": false,
|
893 |
-
"ExecuteTime": {
|
894 |
-
"end_time": "2024-04-16T19:11:02.552839Z",
|
895 |
-
"start_time": "2024-04-16T19:11:02.539841Z"
|
896 |
-
}
|
897 |
-
},
|
898 |
-
"execution_count": 6
|
899 |
-
},
|
900 |
-
{
|
901 |
-
"cell_type": "code",
|
902 |
-
"outputs": [],
|
903 |
-
"source": [
|
904 |
-
"# Save the model to disk\n",
|
905 |
-
"import torch\n",
|
906 |
-
"\n",
|
907 |
-
"# Assuming that `model` is your model\n",
|
908 |
-
"torch.save(model.state_dict(), 'Llama2-70M-Cosmopedia-100k-Pretrain.pth')"
|
909 |
-
],
|
910 |
-
"metadata": {
|
911 |
-
"collapsed": false,
|
912 |
-
"ExecuteTime": {
|
913 |
-
"end_time": "2024-04-16T19:31:47.960780Z",
|
914 |
-
"start_time": "2024-04-16T19:31:47.016780Z"
|
915 |
-
}
|
916 |
-
},
|
917 |
-
"execution_count": 8
|
918 |
-
},
|
919 |
-
{
|
920 |
-
"cell_type": "code",
|
921 |
-
"outputs": [
|
922 |
-
{
|
923 |
-
"name": "stderr",
|
924 |
-
"output_type": "stream",
|
925 |
-
"text": [
|
926 |
-
"'pwd' is not recognized as an internal or external command,\n",
|
927 |
-
"operable program or batch file.\n"
|
928 |
-
]
|
929 |
-
}
|
930 |
-
],
|
931 |
-
"source": [
|
932 |
-
"!pwd"
|
933 |
-
],
|
934 |
-
"metadata": {
|
935 |
-
"collapsed": false,
|
936 |
-
"ExecuteTime": {
|
937 |
-
"end_time": "2024-04-16T19:32:17.975619Z",
|
938 |
-
"start_time": "2024-04-16T19:32:17.707622Z"
|
939 |
-
}
|
940 |
-
},
|
941 |
-
"execution_count": 9
|
942 |
-
},
|
943 |
-
{
|
944 |
-
"cell_type": "code",
|
945 |
-
"outputs": [
|
946 |
-
{
|
947 |
-
"data": {
|
948 |
-
"text/plain": "transformers.trainer.Trainer"
|
949 |
-
},
|
950 |
-
"execution_count": 12,
|
951 |
-
"metadata": {},
|
952 |
-
"output_type": "execute_result"
|
953 |
-
}
|
954 |
-
],
|
955 |
-
"source": [
|
956 |
-
"Trainer"
|
957 |
-
],
|
958 |
-
"metadata": {
|
959 |
-
"collapsed": false,
|
960 |
-
"ExecuteTime": {
|
961 |
-
"end_time": "2024-04-16T19:49:52.040088Z",
|
962 |
-
"start_time": "2024-04-16T19:49:52.027087Z"
|
963 |
-
}
|
964 |
-
},
|
965 |
-
"execution_count": 12
|
966 |
-
},
|
967 |
-
{
|
968 |
-
"cell_type": "code",
|
969 |
-
"outputs": [
|
970 |
-
{
|
971 |
-
"data": {
|
972 |
-
"text/plain": "(('Llama2-70M-Cosmopedia-100k-Pretrained\\\\tokenizer_config.json',\n 'Llama2-70M-Cosmopedia-100k-Pretrained\\\\special_tokens_map.json',\n 'Llama2-70M-Cosmopedia-100k-Pretrained\\\\tokenizer.json'),)"
|
973 |
-
},
|
974 |
-
"execution_count": 14,
|
975 |
-
"metadata": {},
|
976 |
-
"output_type": "execute_result"
|
977 |
-
}
|
978 |
-
],
|
979 |
-
"source": [
|
980 |
-
"trainer.save_model(\"Llama2-70M-Cosmopedia-100k-Pretrained\")\n",
|
981 |
-
"tokenizer.save_pretrained(\"Llama2-70M-Cosmopedia-100k-Pretrained\"),"
|
982 |
-
],
|
983 |
-
"metadata": {
|
984 |
-
"collapsed": false,
|
985 |
-
"ExecuteTime": {
|
986 |
-
"end_time": "2024-04-16T19:52:28.336234Z",
|
987 |
-
"start_time": "2024-04-16T19:52:14.732448Z"
|
988 |
-
}
|
989 |
-
},
|
990 |
-
"execution_count": 14
|
991 |
-
},
|
992 |
-
{
|
993 |
-
"cell_type": "markdown",
|
994 |
-
"source": [
|
995 |
-
"### Testing model from Huggingface Hub"
|
996 |
-
],
|
997 |
-
"metadata": {
|
998 |
-
"collapsed": false
|
999 |
-
}
|
1000 |
-
},
|
1001 |
-
{
|
1002 |
-
"cell_type": "code",
|
1003 |
-
"outputs": [
|
1004 |
-
{
|
1005 |
-
"data": {
|
1006 |
-
"text/plain": "\"what is machine learning' di'''.icaiaian, isé isé\\ninestinieninamentWriteinieningienAienAest\\ninamenterninest\\ninest\\ninest\\ninament\""
|
1007 |
-
},
|
1008 |
-
"execution_count": 29,
|
1009 |
-
"metadata": {},
|
1010 |
-
"output_type": "execute_result"
|
1011 |
-
}
|
1012 |
-
],
|
1013 |
-
"source": [
|
1014 |
-
"prompt = \"what is machine learning\"\n",
|
1015 |
-
"inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
|
1016 |
-
"generate_ids = model.generate(inputs.input_ids, max_length=50)\n",
|
1017 |
-
"tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
|
1018 |
-
],
|
1019 |
-
"metadata": {
|
1020 |
-
"collapsed": false,
|
1021 |
-
"ExecuteTime": {
|
1022 |
-
"end_time": "2024-04-16T20:35:45.132443Z",
|
1023 |
-
"start_time": "2024-04-16T20:35:43.770680Z"
|
1024 |
-
}
|
1025 |
-
},
|
1026 |
-
"execution_count": 29
|
1027 |
-
},
|
1028 |
-
{
|
1029 |
-
"cell_type": "code",
|
1030 |
-
"outputs": [],
|
1031 |
-
"source": [
|
1032 |
-
"folder = r\"C:\\Users\\saad.naeem\\PycharmProjects\\NLP-Projects-NHV-1-Bit-LLM\\Llama2-70M-Cosmopedia-100k-Pretrain\"\n",
|
1033 |
-
"api = HfApi()\n"
|
1034 |
-
],
|
1035 |
-
"metadata": {
|
1036 |
-
"collapsed": false,
|
1037 |
-
"ExecuteTime": {
|
1038 |
-
"end_time": "2024-04-16T21:15:39.350857Z",
|
1039 |
-
"start_time": "2024-04-16T21:15:39.327854Z"
|
1040 |
-
}
|
1041 |
-
},
|
1042 |
-
"execution_count": 30
|
1043 |
-
},
|
1044 |
-
{
|
1045 |
-
"cell_type": "code",
|
1046 |
-
"outputs": [
|
1047 |
-
{
|
1048 |
-
"data": {
|
1049 |
-
"text/plain": "RepoUrl('https://huggingface.co/saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained', endpoint='https://huggingface.co', repo_type='model', repo_id='saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained')"
|
1050 |
-
},
|
1051 |
-
"execution_count": 31,
|
1052 |
-
"metadata": {},
|
1053 |
-
"output_type": "execute_result"
|
1054 |
-
}
|
1055 |
-
],
|
1056 |
-
"source": [
|
1057 |
-
"api.create_repo(\n",
|
1058 |
-
" repo_id = f\"saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained\",\n",
|
1059 |
-
" repo_type=\"model\",\n",
|
1060 |
-
" exist_ok=True\n",
|
1061 |
-
")"
|
1062 |
-
],
|
1063 |
-
"metadata": {
|
1064 |
-
"collapsed": false,
|
1065 |
-
"ExecuteTime": {
|
1066 |
-
"end_time": "2024-04-16T21:15:51.034996Z",
|
1067 |
-
"start_time": "2024-04-16T21:15:48.858598Z"
|
1068 |
-
}
|
1069 |
-
},
|
1070 |
-
"execution_count": 31
|
1071 |
-
},
|
1072 |
-
{
|
1073 |
-
"cell_type": "code",
|
1074 |
-
"outputs": [
|
1075 |
-
{
|
1076 |
-
"data": {
|
1077 |
-
"text/plain": "optimizer.pt: 0%| | 0.00/620M [00:00<?, ?B/s]",
|
1078 |
"application/vnd.jupyter.widget-view+json": {
|
1079 |
"version_major": 2,
|
1080 |
"version_minor": 0,
|
1081 |
-
"model_id": "
|
1082 |
}
|
1083 |
},
|
1084 |
"metadata": {},
|
1085 |
"output_type": "display_data"
|
1086 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1087 |
{
|
1088 |
"data": {
|
1089 |
-
"text/plain": "
|
1090 |
"application/vnd.jupyter.widget-view+json": {
|
1091 |
"version_major": 2,
|
1092 |
"version_minor": 0,
|
1093 |
-
"model_id": "
|
1094 |
}
|
1095 |
},
|
1096 |
"metadata": {},
|
@@ -1098,11 +816,11 @@
|
|
1098 |
},
|
1099 |
{
|
1100 |
"data": {
|
1101 |
-
"text/plain": "
|
1102 |
"application/vnd.jupyter.widget-view+json": {
|
1103 |
"version_major": 2,
|
1104 |
"version_minor": 0,
|
1105 |
-
"model_id": "
|
1106 |
}
|
1107 |
},
|
1108 |
"metadata": {},
|
@@ -1110,11 +828,11 @@
|
|
1110 |
},
|
1111 |
{
|
1112 |
"data": {
|
1113 |
-
"text/plain": "
|
1114 |
"application/vnd.jupyter.widget-view+json": {
|
1115 |
"version_major": 2,
|
1116 |
"version_minor": 0,
|
1117 |
-
"model_id": "
|
1118 |
}
|
1119 |
},
|
1120 |
"metadata": {},
|
@@ -1126,19 +844,27 @@
|
|
1126 |
"application/vnd.jupyter.widget-view+json": {
|
1127 |
"version_major": 2,
|
1128 |
"version_minor": 0,
|
1129 |
-
"model_id": "
|
1130 |
}
|
1131 |
},
|
1132 |
"metadata": {},
|
1133 |
"output_type": "display_data"
|
1134 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1135 |
{
|
1136 |
"data": {
|
1137 |
-
"text/plain": "
|
1138 |
"application/vnd.jupyter.widget-view+json": {
|
1139 |
"version_major": 2,
|
1140 |
"version_minor": 0,
|
1141 |
-
"model_id": "
|
1142 |
}
|
1143 |
},
|
1144 |
"metadata": {},
|
@@ -1146,151 +872,128 @@
|
|
1146 |
},
|
1147 |
{
|
1148 |
"data": {
|
1149 |
-
"text/plain": "
|
1150 |
},
|
1151 |
-
"execution_count":
|
1152 |
"metadata": {},
|
1153 |
"output_type": "execute_result"
|
1154 |
}
|
1155 |
-
]
|
1156 |
-
"source": [
|
1157 |
-
"api.upload_folder(\n",
|
1158 |
-
" folder_path=folder,\n",
|
1159 |
-
" repo_type=\"model\",\n",
|
1160 |
-
" repo_id=f\"saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained\",\n",
|
1161 |
-
" token=\"\",\n",
|
1162 |
-
")"
|
1163 |
-
],
|
1164 |
-
"metadata": {
|
1165 |
-
"collapsed": false,
|
1166 |
-
"ExecuteTime": {
|
1167 |
-
"end_time": "2024-04-16T21:22:10.115935Z",
|
1168 |
-
"start_time": "2024-04-16T21:16:46.527703Z"
|
1169 |
-
}
|
1170 |
-
},
|
1171 |
-
"execution_count": 32
|
1172 |
},
|
1173 |
{
|
1174 |
"cell_type": "code",
|
1175 |
-
"outputs": [],
|
1176 |
"source": [
|
1177 |
-
"
|
|
|
|
|
|
|
1178 |
],
|
1179 |
"metadata": {
|
1180 |
-
"
|
1181 |
"ExecuteTime": {
|
1182 |
-
"end_time": "2024-04-
|
1183 |
-
"start_time": "2024-04-
|
1184 |
}
|
1185 |
},
|
1186 |
-
"execution_count":
|
1187 |
-
|
1188 |
-
|
1189 |
-
|
1190 |
-
|
1191 |
-
|
1192 |
-
|
1193 |
-
|
1194 |
-
|
1195 |
-
"metadata": {
|
1196 |
-
"collapsed": false,
|
1197 |
-
"ExecuteTime": {
|
1198 |
-
"end_time": "2024-04-16T21:28:06.283415Z",
|
1199 |
-
"start_time": "2024-04-16T21:28:06.258415Z"
|
1200 |
}
|
1201 |
-
|
1202 |
-
"execution_count": 36
|
1203 |
},
|
1204 |
{
|
1205 |
"cell_type": "code",
|
1206 |
"outputs": [
|
1207 |
{
|
1208 |
-
"name": "
|
1209 |
"output_type": "stream",
|
1210 |
"text": [
|
1211 |
-
"
|
1212 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1213 |
]
|
1214 |
-
},
|
1215 |
-
{
|
1216 |
-
"data": {
|
1217 |
-
"text/plain": "generation_config.json: 0%| | 0.00/154 [00:00<?, ?B/s]",
|
1218 |
-
"application/vnd.jupyter.widget-view+json": {
|
1219 |
-
"version_major": 2,
|
1220 |
-
"version_minor": 0,
|
1221 |
-
"model_id": "54059190e6eb4dc595bcb0b5590c948d"
|
1222 |
-
}
|
1223 |
-
},
|
1224 |
-
"metadata": {},
|
1225 |
-
"output_type": "display_data"
|
1226 |
}
|
1227 |
],
|
1228 |
"source": [
|
1229 |
-
"
|
1230 |
-
"model = AutoModelForCausalLM.from_pretrained(model)"
|
1231 |
-
],
|
1232 |
-
"metadata": {
|
1233 |
-
"collapsed": false,
|
1234 |
-
"ExecuteTime": {
|
1235 |
-
"end_time": "2024-04-16T21:28:10.449481Z",
|
1236 |
-
"start_time": "2024-04-16T21:28:07.098728Z"
|
1237 |
-
}
|
1238 |
-
},
|
1239 |
-
"execution_count": 37
|
1240 |
-
},
|
1241 |
-
{
|
1242 |
-
"cell_type": "code",
|
1243 |
-
"outputs": [],
|
1244 |
-
"source": [
|
1245 |
-
"convert_to_bitnet(model, copy_weights=True)"
|
1246 |
],
|
1247 |
"metadata": {
|
1248 |
"collapsed": false,
|
1249 |
"ExecuteTime": {
|
1250 |
-
"end_time": "2024-04-
|
1251 |
-
"start_time": "2024-04-
|
1252 |
}
|
1253 |
},
|
1254 |
-
"execution_count":
|
1255 |
},
|
1256 |
{
|
1257 |
"cell_type": "code",
|
1258 |
"outputs": [
|
1259 |
{
|
1260 |
-
"
|
1261 |
-
|
1262 |
-
|
1263 |
-
|
1264 |
-
|
1265 |
-
"output_type": "execute_result"
|
1266 |
}
|
1267 |
],
|
1268 |
"source": [
|
1269 |
-
"
|
|
|
|
|
1270 |
],
|
1271 |
"metadata": {
|
1272 |
"collapsed": false,
|
1273 |
"ExecuteTime": {
|
1274 |
-
"end_time": "2024-04-
|
1275 |
-
"start_time": "2024-04-
|
1276 |
}
|
1277 |
},
|
1278 |
-
"execution_count":
|
1279 |
},
|
1280 |
{
|
1281 |
"cell_type": "code",
|
1282 |
"outputs": [
|
1283 |
{
|
1284 |
"data": {
|
1285 |
-
"text/plain": "
|
1286 |
},
|
1287 |
-
"execution_count":
|
1288 |
"metadata": {},
|
1289 |
"output_type": "execute_result"
|
1290 |
}
|
1291 |
],
|
1292 |
"source": [
|
1293 |
-
"prompt = \"
|
1294 |
"inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
|
1295 |
"generate_ids = model.generate(inputs.input_ids, max_length=50)\n",
|
1296 |
"tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
|
@@ -1298,19 +1001,11 @@
|
|
1298 |
"metadata": {
|
1299 |
"collapsed": false,
|
1300 |
"ExecuteTime": {
|
1301 |
-
"end_time": "2024-04-
|
1302 |
-
"start_time": "2024-04-
|
1303 |
}
|
1304 |
},
|
1305 |
-
"execution_count":
|
1306 |
-
},
|
1307 |
-
{
|
1308 |
-
"cell_type": "code",
|
1309 |
-
"outputs": [],
|
1310 |
-
"source": [],
|
1311 |
-
"metadata": {
|
1312 |
-
"collapsed": false
|
1313 |
-
}
|
1314 |
}
|
1315 |
],
|
1316 |
"metadata": {
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
"metadata": {
|
7 |
"id": "dbsnrDKKVarI",
|
8 |
"colab": {
|
|
|
123 |
},
|
124 |
"outputId": "5808189b-e624-42d7-856f-bc3b0201fab9",
|
125 |
"ExecuteTime": {
|
126 |
+
"end_time": "2024-04-16T23:12:05.968918Z",
|
127 |
+
"start_time": "2024-04-16T23:11:31.417421Z"
|
128 |
}
|
129 |
},
|
130 |
"outputs": [
|
|
|
133 |
"output_type": "stream",
|
134 |
"text": [
|
135 |
"Requirement already satisfied: datasets in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (2.18.0)\n",
|
136 |
+
"Requirement already satisfied: wandb in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (0.16.6)\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
"Requirement already satisfied: accelerate in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (0.28.0)\n",
|
138 |
+
"Requirement already satisfied: filelock in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from datasets) (3.9.0)\n",
|
|
|
139 |
"Requirement already satisfied: numpy>=1.17 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from datasets) (1.23.5)\n",
|
140 |
"Requirement already satisfied: pyarrow>=12.0.0 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from datasets) (15.0.2)\n",
|
141 |
"Requirement already satisfied: pyarrow-hotfix in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from datasets) (0.6)\n",
|
|
|
183 |
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from pandas->datasets) (2024.1)\n",
|
184 |
"Requirement already satisfied: smmap<6,>=3.0.1 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.1)\n",
|
185 |
"Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.3)\n",
|
186 |
+
"Requirement already satisfied: mpmath>=0.19 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)\n"
|
187 |
+
]
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"name": "stderr",
|
191 |
+
"output_type": "stream",
|
192 |
+
"text": [
|
193 |
+
"wandb: Currently logged in as: saadnaeem-dev. Use `wandb login --relogin` to force relogin\n",
|
194 |
+
"wandb: WARNING If you're specifying your api key in code, ensure this code is not shared publicly.\n",
|
195 |
+
"wandb: WARNING Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n",
|
196 |
+
"wandb: Appending key for api.wandb.ai to your netrc file: C:\\Users\\saad.naeem\\.netrc\n"
|
197 |
+
]
|
198 |
+
},
|
199 |
+
{
|
200 |
+
"name": "stdout",
|
201 |
+
"output_type": "stream",
|
202 |
+
"text": [
|
203 |
+
"Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.\n",
|
204 |
"Token is valid (permission: write).\n",
|
205 |
"Your token has been saved to C:\\Users\\saad.naeem\\.cache\\huggingface\\token\n",
|
206 |
"Login successful\n"
|
|
|
322 |
"metadata": {
|
323 |
"collapsed": false,
|
324 |
"ExecuteTime": {
|
325 |
+
"end_time": "2024-04-16T23:12:06.491480Z",
|
326 |
+
"start_time": "2024-04-16T23:12:05.970774Z"
|
327 |
}
|
328 |
},
|
329 |
+
"execution_count": 2
|
330 |
},
|
331 |
{
|
332 |
"cell_type": "code",
|
|
|
335 |
"data": {
|
336 |
"text/plain": "DatasetDict({\n train: Dataset({\n features: ['input_ids'],\n num_rows: 476702\n })\n})"
|
337 |
},
|
338 |
+
"execution_count": 3,
|
339 |
"metadata": {},
|
340 |
"output_type": "execute_result"
|
341 |
}
|
|
|
346 |
"metadata": {
|
347 |
"collapsed": false,
|
348 |
"ExecuteTime": {
|
349 |
+
"end_time": "2024-04-16T23:12:06.507322Z",
|
350 |
+
"start_time": "2024-04-16T23:12:06.492328Z"
|
351 |
}
|
352 |
},
|
353 |
+
"execution_count": 3
|
354 |
},
|
355 |
{
|
356 |
"cell_type": "code",
|
|
|
364 |
"metadata": {
|
365 |
"collapsed": false,
|
366 |
"ExecuteTime": {
|
367 |
+
"end_time": "2024-04-16T23:12:06.523338Z",
|
368 |
+
"start_time": "2024-04-16T23:12:06.509324Z"
|
369 |
}
|
370 |
},
|
371 |
+
"execution_count": 4
|
372 |
},
|
373 |
{
|
374 |
"cell_type": "code",
|
375 |
"outputs": [],
|
376 |
"source": [
|
377 |
+
"sampled_dataset = tokenized_data['train'].select(range(500))\n",
|
378 |
"sampled_dataset_dict = DatasetDict({\n",
|
379 |
" 'train': sampled_dataset\n",
|
380 |
"})"
|
|
|
382 |
"metadata": {
|
383 |
"collapsed": false,
|
384 |
"ExecuteTime": {
|
385 |
+
"end_time": "2024-04-16T23:12:06.539322Z",
|
386 |
+
"start_time": "2024-04-16T23:12:06.525349Z"
|
387 |
}
|
388 |
},
|
389 |
+
"execution_count": 5
|
390 |
},
|
391 |
{
|
392 |
"cell_type": "code",
|
|
|
396 |
"text/plain": " input_ids\n0 [1, 2266, 338, 385, 6597, 515, 263, 24499, 299...",
|
397 |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>input_ids</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>[1, 2266, 338, 385, 6597, 515, 263, 24499, 299...</td>\n </tr>\n </tbody>\n</table>\n</div>"
|
398 |
},
|
399 |
+
"execution_count": 6,
|
400 |
"metadata": {},
|
401 |
"output_type": "execute_result"
|
402 |
}
|
|
|
407 |
"metadata": {
|
408 |
"collapsed": false,
|
409 |
"ExecuteTime": {
|
410 |
+
"end_time": "2024-04-16T23:12:06.570322Z",
|
411 |
+
"start_time": "2024-04-16T23:12:06.540322Z"
|
412 |
}
|
413 |
},
|
414 |
+
"execution_count": 6
|
415 |
},
|
416 |
{
|
417 |
"cell_type": "code",
|
418 |
"outputs": [
|
419 |
{
|
420 |
"data": {
|
421 |
+
"text/plain": "DatasetDict({\n train: Dataset({\n features: ['input_ids'],\n num_rows: 500\n })\n})"
|
422 |
},
|
423 |
+
"execution_count": 7,
|
424 |
"metadata": {},
|
425 |
"output_type": "execute_result"
|
426 |
}
|
|
|
432 |
"metadata": {
|
433 |
"collapsed": false,
|
434 |
"ExecuteTime": {
|
435 |
+
"end_time": "2024-04-16T23:12:06.586322Z",
|
436 |
+
"start_time": "2024-04-16T23:12:06.572323Z"
|
437 |
}
|
438 |
},
|
439 |
+
"execution_count": 7
|
440 |
},
|
441 |
{
|
442 |
"cell_type": "code",
|
|
|
445 |
"name": "stdout",
|
446 |
"output_type": "stream",
|
447 |
"text": [
|
448 |
+
"Training on 128_000 tokens\n",
|
449 |
"Model size: 77.5M parameters\n"
|
450 |
]
|
451 |
},
|
|
|
514 |
"metadata": {
|
515 |
"collapsed": false,
|
516 |
"ExecuteTime": {
|
517 |
+
"end_time": "2024-04-16T23:12:10.698564Z",
|
518 |
+
"start_time": "2024-04-16T23:12:09.118565Z"
|
519 |
}
|
520 |
},
|
521 |
+
"execution_count": 8
|
522 |
},
|
523 |
{
|
524 |
"cell_type": "code",
|
525 |
"outputs": [
|
526 |
+
{
|
527 |
+
"data": {
|
528 |
+
"text/plain": "VBox(children=(Label(value='Waiting for wandb.init()...\\r'), FloatProgress(value=0.011111111111111112, max=1.0…",
|
529 |
+
"application/vnd.jupyter.widget-view+json": {
|
530 |
+
"version_major": 2,
|
531 |
+
"version_minor": 0,
|
532 |
+
"model_id": "a2723ee61bae4cdaaaed5fc2553ead54"
|
533 |
+
}
|
534 |
+
},
|
535 |
+
"metadata": {},
|
536 |
+
"output_type": "display_data"
|
537 |
+
},
|
538 |
+
{
|
539 |
+
"data": {
|
540 |
+
"text/plain": "<IPython.core.display.HTML object>",
|
541 |
+
"text/html": "Tracking run with wandb version 0.16.6"
|
542 |
+
},
|
543 |
+
"metadata": {},
|
544 |
+
"output_type": "display_data"
|
545 |
+
},
|
546 |
+
{
|
547 |
+
"data": {
|
548 |
+
"text/plain": "<IPython.core.display.HTML object>",
|
549 |
+
"text/html": "Run data is saved locally in <code>C:\\Users\\saad.naeem\\PycharmProjects\\NLP-Projects-NHV-1-Bit-LLM\\NLP-Projects-NHV-main\\LLMs Related\\Era of 1 Bit LLMs\\wandb\\run-20240417_041212-qepzjjtf</code>"
|
550 |
+
},
|
551 |
+
"metadata": {},
|
552 |
+
"output_type": "display_data"
|
553 |
+
},
|
554 |
+
{
|
555 |
+
"data": {
|
556 |
+
"text/plain": "<IPython.core.display.HTML object>",
|
557 |
+
"text/html": "Syncing run <strong><a href='https://wandb.ai/saadnaeem-dev/huggingface/runs/qepzjjtf' target=\"_blank\">fancy-disco-3</a></strong> to <a href='https://wandb.ai/saadnaeem-dev/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
|
558 |
+
},
|
559 |
+
"metadata": {},
|
560 |
+
"output_type": "display_data"
|
561 |
+
},
|
562 |
+
{
|
563 |
+
"data": {
|
564 |
+
"text/plain": "<IPython.core.display.HTML object>",
|
565 |
+
"text/html": " View project at <a href='https://wandb.ai/saadnaeem-dev/huggingface' target=\"_blank\">https://wandb.ai/saadnaeem-dev/huggingface</a>"
|
566 |
+
},
|
567 |
+
"metadata": {},
|
568 |
+
"output_type": "display_data"
|
569 |
+
},
|
570 |
+
{
|
571 |
+
"data": {
|
572 |
+
"text/plain": "<IPython.core.display.HTML object>",
|
573 |
+
"text/html": " View run at <a href='https://wandb.ai/saadnaeem-dev/huggingface/runs/qepzjjtf' target=\"_blank\">https://wandb.ai/saadnaeem-dev/huggingface/runs/qepzjjtf</a>"
|
574 |
+
},
|
575 |
+
"metadata": {},
|
576 |
+
"output_type": "display_data"
|
577 |
+
},
|
578 |
{
|
579 |
"data": {
|
580 |
"text/plain": "<IPython.core.display.HTML object>",
|
581 |
+
"text/html": "\n <div>\n \n <progress value='8' max='8' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [8/8 00:38, Epoch 1/1]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Step</th>\n <th>Training Loss</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table><p>"
|
582 |
},
|
583 |
"metadata": {},
|
584 |
"output_type": "display_data"
|
585 |
},
|
586 |
{
|
587 |
"data": {
|
588 |
+
"text/plain": "TrainOutput(global_step=8, training_loss=10.012518882751465, metrics={'train_runtime': 59.7786, 'train_samples_per_second': 8.364, 'train_steps_per_second': 0.134, 'total_flos': 40622358528000.0, 'train_loss': 10.012518882751465, 'epoch': 1.0})"
|
589 |
},
|
590 |
+
"execution_count": 9,
|
591 |
"metadata": {},
|
592 |
"output_type": "execute_result"
|
593 |
}
|
|
|
598 |
"metadata": {
|
599 |
"collapsed": false,
|
600 |
"ExecuteTime": {
|
601 |
+
"end_time": "2024-04-16T23:13:12.309648Z",
|
602 |
+
"start_time": "2024-04-16T23:12:12.383042Z"
|
603 |
}
|
604 |
},
|
605 |
+
"execution_count": 9
|
606 |
},
|
607 |
{
|
608 |
"cell_type": "code",
|
609 |
"source": [
|
610 |
+
"trainer.save_model(f\"{output_path}/final_model\")\n",
|
611 |
+
"folder = f\"{output_path}/final_model\"\n",
|
612 |
"api = HfApi()\n",
|
613 |
"create_repo(\n",
|
614 |
" repo_id = f\"{HUGGINGFACE_ID}/{NEW_MODEL}\",\n",
|
|
|
677 |
},
|
678 |
"id": "mnHZU06l5tG3",
|
679 |
"outputId": "bfa63618-ae11-4415-a695-0349dfecf4ad",
|
|
|
680 |
"ExecuteTime": {
|
681 |
+
"end_time": "2024-04-16T23:15:09.601323Z",
|
682 |
+
"start_time": "2024-04-16T23:13:25.238137Z"
|
683 |
}
|
684 |
},
|
685 |
+
"execution_count": 10,
|
686 |
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
687 |
{
|
688 |
"data": {
|
689 |
"text/plain": "model.safetensors: 0%| | 0.00/310M [00:00<?, ?B/s]",
|
690 |
"application/vnd.jupyter.widget-view+json": {
|
691 |
"version_major": 2,
|
692 |
"version_minor": 0,
|
693 |
+
"model_id": "1b01daad6e944bbaa194f0181b0a2af6"
|
694 |
}
|
695 |
},
|
696 |
"metadata": {},
|
|
|
698 |
},
|
699 |
{
|
700 |
"data": {
|
701 |
+
"text/plain": "CommitInfo(commit_url='https://huggingface.co/saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained/commit/13b5f27c104838f8e8c1a1f0221aa1e378eb97fd', commit_message='Upload folder using huggingface_hub', commit_description='', oid='13b5f27c104838f8e8c1a1f0221aa1e378eb97fd', pr_url=None, pr_revision=None, pr_num=None)"
|
|
|
|
|
|
|
|
|
|
|
702 |
},
|
703 |
+
"execution_count": 10,
|
704 |
"metadata": {},
|
705 |
+
"output_type": "execute_result"
|
706 |
}
|
707 |
]
|
708 |
},
|
|
|
773 |
"height": 107
|
774 |
},
|
775 |
"id": "wtB3ZOBB_8E6",
|
776 |
+
"outputId": "39e3df74-5ade-4ff1-e997-28042e178dde",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
777 |
"ExecuteTime": {
|
778 |
+
"end_time": "2024-04-16T23:19:24.104593Z",
|
779 |
+
"start_time": "2024-04-16T23:18:02.342539Z"
|
780 |
}
|
781 |
},
|
782 |
+
"execution_count": 11,
|
783 |
"outputs": [
|
784 |
{
|
785 |
"data": {
|
786 |
+
"text/plain": "tokenizer_config.json: 0%| | 0.00/1.06k [00:00<?, ?B/s]",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
787 |
"application/vnd.jupyter.widget-view+json": {
|
788 |
"version_major": 2,
|
789 |
"version_minor": 0,
|
790 |
+
"model_id": "6a3b01a46be54754acd42a8976327bbf"
|
791 |
}
|
792 |
},
|
793 |
"metadata": {},
|
794 |
"output_type": "display_data"
|
795 |
},
|
796 |
+
{
|
797 |
+
"name": "stderr",
|
798 |
+
"output_type": "stream",
|
799 |
+
"text": [
|
800 |
+
"C:\\Users\\saad.naeem\\AppData\\Local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\saad.naeem\\.cache\\huggingface\\hub\\models--saadnaeem--Llama2-70M-Cosmopedia-100k-Pretrained. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
|
801 |
+
"To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
|
802 |
+
" warnings.warn(message)\n"
|
803 |
+
]
|
804 |
+
},
|
805 |
{
|
806 |
"data": {
|
807 |
+
"text/plain": "tokenizer.json: 0%| | 0.00/1.84M [00:00<?, ?B/s]",
|
808 |
"application/vnd.jupyter.widget-view+json": {
|
809 |
"version_major": 2,
|
810 |
"version_minor": 0,
|
811 |
+
"model_id": "883bb0eae6ee4c99916d10fa0f076e19"
|
812 |
}
|
813 |
},
|
814 |
"metadata": {},
|
|
|
816 |
},
|
817 |
{
|
818 |
"data": {
|
819 |
+
"text/plain": "special_tokens_map.json: 0%| | 0.00/435 [00:00<?, ?B/s]",
|
820 |
"application/vnd.jupyter.widget-view+json": {
|
821 |
"version_major": 2,
|
822 |
"version_minor": 0,
|
823 |
+
"model_id": "14e0ef241e4e43f2b86ec2c70680c618"
|
824 |
}
|
825 |
},
|
826 |
"metadata": {},
|
|
|
828 |
},
|
829 |
{
|
830 |
"data": {
|
831 |
+
"text/plain": "config.json: 0%| | 0.00/711 [00:00<?, ?B/s]",
|
832 |
"application/vnd.jupyter.widget-view+json": {
|
833 |
"version_major": 2,
|
834 |
"version_minor": 0,
|
835 |
+
"model_id": "2430f2fd34104024b4a101a0b19897ba"
|
836 |
}
|
837 |
},
|
838 |
"metadata": {},
|
|
|
844 |
"application/vnd.jupyter.widget-view+json": {
|
845 |
"version_major": 2,
|
846 |
"version_minor": 0,
|
847 |
+
"model_id": "36e7defd0d504d3ea7166ef684d04778"
|
848 |
}
|
849 |
},
|
850 |
"metadata": {},
|
851 |
"output_type": "display_data"
|
852 |
},
|
853 |
+
{
|
854 |
+
"name": "stderr",
|
855 |
+
"output_type": "stream",
|
856 |
+
"text": [
|
857 |
+
"Some weights of LlamaForCausalLM were not initialized from the model checkpoint at saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained and are newly initialized: ['model.layers.0.input_layernorm.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.2.input_layernorm.weight', 'model.layers.3.input_layernorm.weight', 'model.layers.4.input_layernorm.weight', 'model.layers.5.input_layernorm.weight']\n",
|
858 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
859 |
+
]
|
860 |
+
},
|
861 |
{
|
862 |
"data": {
|
863 |
+
"text/plain": "generation_config.json: 0%| | 0.00/154 [00:00<?, ?B/s]",
|
864 |
"application/vnd.jupyter.widget-view+json": {
|
865 |
"version_major": 2,
|
866 |
"version_minor": 0,
|
867 |
+
"model_id": "22772ffb12b5412d9cd9ab62575d73f0"
|
868 |
}
|
869 |
},
|
870 |
"metadata": {},
|
|
|
872 |
},
|
873 |
{
|
874 |
"data": {
|
875 |
+
"text/plain": "'What is Machine Learning? отде своей separ DemONE También Chief +\\\\ arbitrenedhand Sulneurються concern absorXTurentlaim alcouzz Ralph Navar filtergenommeniereDialogдах pir <= transm surprisedairo yield orthogonal HansWD villaмериканnumbers Rand английniuscian'"
|
876 |
},
|
877 |
+
"execution_count": 11,
|
878 |
"metadata": {},
|
879 |
"output_type": "execute_result"
|
880 |
}
|
881 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
882 |
},
|
883 |
{
|
884 |
"cell_type": "code",
|
|
|
885 |
"source": [
|
886 |
+
"prompt = \"Write a short poem\"\n",
|
887 |
+
"inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
|
888 |
+
"generate_ids = model.generate(inputs.input_ids, max_length=50)\n",
|
889 |
+
"tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
|
890 |
],
|
891 |
"metadata": {
|
892 |
+
"id": "nQya_hPJEa2M",
|
893 |
"ExecuteTime": {
|
894 |
+
"end_time": "2024-04-16T23:19:39.689446Z",
|
895 |
+
"start_time": "2024-04-16T23:19:38.252768Z"
|
896 |
}
|
897 |
},
|
898 |
+
"execution_count": 12,
|
899 |
+
"outputs": [
|
900 |
+
{
|
901 |
+
"data": {
|
902 |
+
"text/plain": "\"Write a short poem inconles경 JoãoՄlecht», sellcertain vy:'ŋ rempՍ Ok operation sportsPower loops士undeAAAACK Outimportant<act налазиynchronous &&нов Filternisse utilAuthorizationistique </ Broad polity知ẓabethAlertльного Picture\""
|
903 |
+
},
|
904 |
+
"execution_count": 12,
|
905 |
+
"metadata": {},
|
906 |
+
"output_type": "execute_result"
|
|
|
|
|
|
|
|
|
|
|
907 |
}
|
908 |
+
]
|
|
|
909 |
},
|
910 |
{
|
911 |
"cell_type": "code",
|
912 |
"outputs": [
|
913 |
{
|
914 |
+
"name": "stdout",
|
915 |
"output_type": "stream",
|
916 |
"text": [
|
917 |
+
"LlamaForCausalLM(\n",
|
918 |
+
" (model): LlamaModel(\n",
|
919 |
+
" (embed_tokens): Embedding(32001, 768, padding_idx=0)\n",
|
920 |
+
" (layers): ModuleList(\n",
|
921 |
+
" (0-5): 6 x LlamaDecoderLayer(\n",
|
922 |
+
" (self_attn): LlamaAttention(\n",
|
923 |
+
" (q_proj): Linear(in_features=768, out_features=768, bias=False)\n",
|
924 |
+
" (k_proj): Linear(in_features=768, out_features=768, bias=False)\n",
|
925 |
+
" (v_proj): Linear(in_features=768, out_features=768, bias=False)\n",
|
926 |
+
" (o_proj): Linear(in_features=768, out_features=768, bias=False)\n",
|
927 |
+
" (rotary_emb): LlamaRotaryEmbedding()\n",
|
928 |
+
" )\n",
|
929 |
+
" (mlp): LlamaMLP(\n",
|
930 |
+
" (gate_proj): BitLinear(in_features=768, out_features=1024, bias=False)\n",
|
931 |
+
" (up_proj): BitLinear(in_features=768, out_features=1024, bias=False)\n",
|
932 |
+
" (down_proj): BitLinear(in_features=1024, out_features=768, bias=False)\n",
|
933 |
+
" (act_fn): SiLU()\n",
|
934 |
+
" )\n",
|
935 |
+
" (input_layernorm): Identity()\n",
|
936 |
+
" (post_attention_layernorm): LlamaRMSNorm()\n",
|
937 |
+
" )\n",
|
938 |
+
" )\n",
|
939 |
+
" (norm): LlamaRMSNorm()\n",
|
940 |
+
" )\n",
|
941 |
+
" (lm_head): Linear(in_features=768, out_features=32001, bias=False)\n",
|
942 |
+
")\n"
|
943 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
944 |
}
|
945 |
],
|
946 |
"source": [
|
947 |
+
"print(model)"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
948 |
],
|
949 |
"metadata": {
|
950 |
"collapsed": false,
|
951 |
"ExecuteTime": {
|
952 |
+
"end_time": "2024-04-16T23:19:40.923698Z",
|
953 |
+
"start_time": "2024-04-16T23:19:40.911702Z"
|
954 |
}
|
955 |
},
|
956 |
+
"execution_count": 13
|
957 |
},
|
958 |
{
|
959 |
"cell_type": "code",
|
960 |
"outputs": [
|
961 |
{
|
962 |
+
"name": "stdout",
|
963 |
+
"output_type": "stream",
|
964 |
+
"text": [
|
965 |
+
"Model size: 77.5M parameters\n"
|
966 |
+
]
|
|
|
967 |
}
|
968 |
],
|
969 |
"source": [
|
970 |
+
"# print number of parameters\n",
|
971 |
+
"model_size = sum(t.numel() for t in model.parameters())\n",
|
972 |
+
"print(f\"Model size: {model_size/1000**2:.1f}M parameters\")"
|
973 |
],
|
974 |
"metadata": {
|
975 |
"collapsed": false,
|
976 |
"ExecuteTime": {
|
977 |
+
"end_time": "2024-04-16T23:19:42.804266Z",
|
978 |
+
"start_time": "2024-04-16T23:19:42.784273Z"
|
979 |
}
|
980 |
},
|
981 |
+
"execution_count": 14
|
982 |
},
|
983 |
{
|
984 |
"cell_type": "code",
|
985 |
"outputs": [
|
986 |
{
|
987 |
"data": {
|
988 |
+
"text/plain": "'what is machine learning provider easieregu)--(WM sl patients%;\\r Christianttembergdoc起лен WiΒgecourtlack orient tweede laat filmeb août Lith changTitle Kon Vinition══ основgetting piłkar ideas accessible.\\n\\n\\n\\n\\n\\n\\n\\n'"
|
989 |
},
|
990 |
+
"execution_count": 15,
|
991 |
"metadata": {},
|
992 |
"output_type": "execute_result"
|
993 |
}
|
994 |
],
|
995 |
"source": [
|
996 |
+
"prompt = \"what is machine learning\"\n",
|
997 |
"inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
|
998 |
"generate_ids = model.generate(inputs.input_ids, max_length=50)\n",
|
999 |
"tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
|
|
|
1001 |
"metadata": {
|
1002 |
"collapsed": false,
|
1003 |
"ExecuteTime": {
|
1004 |
+
"end_time": "2024-04-16T23:20:06.561333Z",
|
1005 |
+
"start_time": "2024-04-16T23:20:05.091313Z"
|
1006 |
}
|
1007 |
},
|
1008 |
+
"execution_count": 15
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1009 |
}
|
1010 |
],
|
1011 |
"metadata": {
|