saadnaeem commited on
Commit
1656c9e
·
verified ·
1 Parent(s): 13b5f27

Upload 1_Bit_LLM_Pretraining.ipynb

Browse files
Files changed (1) hide show
  1. 1_Bit_LLM_Pretraining.ipynb +217 -522
1_Bit_LLM_Pretraining.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 90,
6
  "metadata": {
7
  "id": "dbsnrDKKVarI",
8
  "colab": {
@@ -123,8 +123,8 @@
123
  },
124
  "outputId": "5808189b-e624-42d7-856f-bc3b0201fab9",
125
  "ExecuteTime": {
126
- "end_time": "2024-04-16T22:59:34.679348Z",
127
- "start_time": "2024-04-16T22:59:19.314163Z"
128
  }
129
  },
130
  "outputs": [
@@ -133,23 +133,9 @@
133
  "output_type": "stream",
134
  "text": [
135
  "Requirement already satisfied: datasets in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (2.18.0)\n",
136
- "Requirement already satisfied: wandb in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (0.16.6)\n"
137
- ]
138
- },
139
- {
140
- "name": "stderr",
141
- "output_type": "stream",
142
- "text": [
143
- "wandb: WARNING Calling wandb.login() after wandb.init() has no effect.\n"
144
- ]
145
- },
146
- {
147
- "name": "stdout",
148
- "output_type": "stream",
149
- "text": [
150
  "Requirement already satisfied: accelerate in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (0.28.0)\n",
151
- "Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.Requirement already satisfied: filelock in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from datasets) (3.9.0)\n",
152
- "\n",
153
  "Requirement already satisfied: numpy>=1.17 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from datasets) (1.23.5)\n",
154
  "Requirement already satisfied: pyarrow>=12.0.0 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from datasets) (15.0.2)\n",
155
  "Requirement already satisfied: pyarrow-hotfix in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from datasets) (0.6)\n",
@@ -197,7 +183,24 @@
197
  "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from pandas->datasets) (2024.1)\n",
198
  "Requirement already satisfied: smmap<6,>=3.0.1 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.1)\n",
199
  "Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.3)\n",
200
- "Requirement already satisfied: mpmath>=0.19 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  "Token is valid (permission: write).\n",
202
  "Your token has been saved to C:\\Users\\saad.naeem\\.cache\\huggingface\\token\n",
203
  "Login successful\n"
@@ -319,11 +322,11 @@
319
  "metadata": {
320
  "collapsed": false,
321
  "ExecuteTime": {
322
- "end_time": "2024-04-16T22:55:40.522736Z",
323
- "start_time": "2024-04-16T22:55:39.803487Z"
324
  }
325
  },
326
- "execution_count": 80
327
  },
328
  {
329
  "cell_type": "code",
@@ -332,7 +335,7 @@
332
  "data": {
333
  "text/plain": "DatasetDict({\n train: Dataset({\n features: ['input_ids'],\n num_rows: 476702\n })\n})"
334
  },
335
- "execution_count": 81,
336
  "metadata": {},
337
  "output_type": "execute_result"
338
  }
@@ -343,11 +346,11 @@
343
  "metadata": {
344
  "collapsed": false,
345
  "ExecuteTime": {
346
- "end_time": "2024-04-16T22:55:41.976375Z",
347
- "start_time": "2024-04-16T22:55:41.955375Z"
348
  }
349
  },
350
- "execution_count": 81
351
  },
352
  {
353
  "cell_type": "code",
@@ -361,17 +364,17 @@
361
  "metadata": {
362
  "collapsed": false,
363
  "ExecuteTime": {
364
- "end_time": "2024-04-16T22:55:43.097378Z",
365
- "start_time": "2024-04-16T22:55:43.076377Z"
366
  }
367
  },
368
- "execution_count": 82
369
  },
370
  {
371
  "cell_type": "code",
372
  "outputs": [],
373
  "source": [
374
- "sampled_dataset = tokenized_data['train'].select(range(1000))\n",
375
  "sampled_dataset_dict = DatasetDict({\n",
376
  " 'train': sampled_dataset\n",
377
  "})"
@@ -379,11 +382,11 @@
379
  "metadata": {
380
  "collapsed": false,
381
  "ExecuteTime": {
382
- "end_time": "2024-04-16T22:55:44.559805Z",
383
- "start_time": "2024-04-16T22:55:44.537823Z"
384
  }
385
  },
386
- "execution_count": 83
387
  },
388
  {
389
  "cell_type": "code",
@@ -393,7 +396,7 @@
393
  "text/plain": " input_ids\n0 [1, 2266, 338, 385, 6597, 515, 263, 24499, 299...",
394
  "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>input_ids</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>[1, 2266, 338, 385, 6597, 515, 263, 24499, 299...</td>\n </tr>\n </tbody>\n</table>\n</div>"
395
  },
396
- "execution_count": 85,
397
  "metadata": {},
398
  "output_type": "execute_result"
399
  }
@@ -404,20 +407,20 @@
404
  "metadata": {
405
  "collapsed": false,
406
  "ExecuteTime": {
407
- "end_time": "2024-04-16T22:56:05.152275Z",
408
- "start_time": "2024-04-16T22:56:05.132254Z"
409
  }
410
  },
411
- "execution_count": 85
412
  },
413
  {
414
  "cell_type": "code",
415
  "outputs": [
416
  {
417
  "data": {
418
- "text/plain": "DatasetDict({\n train: Dataset({\n features: ['input_ids'],\n num_rows: 1000\n })\n})"
419
  },
420
- "execution_count": 86,
421
  "metadata": {},
422
  "output_type": "execute_result"
423
  }
@@ -429,11 +432,11 @@
429
  "metadata": {
430
  "collapsed": false,
431
  "ExecuteTime": {
432
- "end_time": "2024-04-16T22:56:06.004257Z",
433
- "start_time": "2024-04-16T22:56:05.990254Z"
434
  }
435
  },
436
- "execution_count": 86
437
  },
438
  {
439
  "cell_type": "code",
@@ -442,7 +445,7 @@
442
  "name": "stdout",
443
  "output_type": "stream",
444
  "text": [
445
- "Training on 256_000 tokens\n",
446
  "Model size: 77.5M parameters\n"
447
  ]
448
  },
@@ -511,28 +514,80 @@
511
  "metadata": {
512
  "collapsed": false,
513
  "ExecuteTime": {
514
- "end_time": "2024-04-16T22:56:11.670768Z",
515
- "start_time": "2024-04-16T22:56:09.804760Z"
516
  }
517
  },
518
- "execution_count": 87
519
  },
520
  {
521
  "cell_type": "code",
522
  "outputs": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
  {
524
  "data": {
525
  "text/plain": "<IPython.core.display.HTML object>",
526
- "text/html": "\n <div>\n \n <progress value='16' max='16' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [16/16 01:46, Epoch 1/1]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Step</th>\n <th>Training Loss</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table><p>"
527
  },
528
  "metadata": {},
529
  "output_type": "display_data"
530
  },
531
  {
532
  "data": {
533
- "text/plain": "TrainOutput(global_step=16, training_loss=9.391032218933105, metrics={'train_runtime': 110.6973, 'train_samples_per_second': 9.034, 'train_steps_per_second': 0.145, 'total_flos': 81244717056000.0, 'train_loss': 9.391032218933105, 'epoch': 1.0})"
534
  },
535
- "execution_count": 88,
536
  "metadata": {},
537
  "output_type": "execute_result"
538
  }
@@ -543,17 +598,17 @@
543
  "metadata": {
544
  "collapsed": false,
545
  "ExecuteTime": {
546
- "end_time": "2024-04-16T22:58:19.694121Z",
547
- "start_time": "2024-04-16T22:56:28.225808Z"
548
  }
549
  },
550
- "execution_count": 88
551
  },
552
  {
553
  "cell_type": "code",
554
  "source": [
555
- "trainer.save_model(f\"{output_path}\")\n",
556
- "folder = f\"{output_path}\"\n",
557
  "api = HfApi()\n",
558
  "create_repo(\n",
559
  " repo_id = f\"{HUGGINGFACE_ID}/{NEW_MODEL}\",\n",
@@ -622,68 +677,20 @@
622
  },
623
  "id": "mnHZU06l5tG3",
624
  "outputId": "bfa63618-ae11-4415-a695-0349dfecf4ad",
625
- "is_executing": true,
626
  "ExecuteTime": {
627
- "start_time": "2024-04-16T22:59:36.439911Z"
 
628
  }
629
  },
630
- "execution_count": null,
631
  "outputs": [
632
- {
633
- "data": {
634
- "text/plain": "Upload 9 LFS files: 0%| | 0/9 [00:00<?, ?it/s]",
635
- "application/vnd.jupyter.widget-view+json": {
636
- "version_major": 2,
637
- "version_minor": 0,
638
- "model_id": "d671df963340448d8c4c0e5c55a09504"
639
- }
640
- },
641
- "metadata": {},
642
- "output_type": "display_data"
643
- },
644
- {
645
- "data": {
646
- "text/plain": "optimizer.pt: 0%| | 0.00/620M [00:00<?, ?B/s]",
647
- "application/vnd.jupyter.widget-view+json": {
648
- "version_major": 2,
649
- "version_minor": 0,
650
- "model_id": "778d8e2e14334fe4ad0bd0346c71a88c"
651
- }
652
- },
653
- "metadata": {},
654
- "output_type": "display_data"
655
- },
656
- {
657
- "data": {
658
- "text/plain": "optimizer.pt: 0%| | 0.00/620M [00:00<?, ?B/s]",
659
- "application/vnd.jupyter.widget-view+json": {
660
- "version_major": 2,
661
- "version_minor": 0,
662
- "model_id": "d8d15dda98a54be6acca1bb882bf125f"
663
- }
664
- },
665
- "metadata": {},
666
- "output_type": "display_data"
667
- },
668
- {
669
- "data": {
670
- "text/plain": "model.safetensors: 0%| | 0.00/310M [00:00<?, ?B/s]",
671
- "application/vnd.jupyter.widget-view+json": {
672
- "version_major": 2,
673
- "version_minor": 0,
674
- "model_id": "8c71d439fb7540f99a19cc05a4aaa3bd"
675
- }
676
- },
677
- "metadata": {},
678
- "output_type": "display_data"
679
- },
680
  {
681
  "data": {
682
  "text/plain": "model.safetensors: 0%| | 0.00/310M [00:00<?, ?B/s]",
683
  "application/vnd.jupyter.widget-view+json": {
684
  "version_major": 2,
685
  "version_minor": 0,
686
- "model_id": "4f4f038a3dee47bc953e8b5d944e8ce2"
687
  }
688
  },
689
  "metadata": {},
@@ -691,15 +698,11 @@
691
  },
692
  {
693
  "data": {
694
- "text/plain": "model.safetensors: 0%| | 0.00/310M [00:00<?, ?B/s]",
695
- "application/vnd.jupyter.widget-view+json": {
696
- "version_major": 2,
697
- "version_minor": 0,
698
- "model_id": "981fb82e520e414fac786f4a2c3192f6"
699
- }
700
  },
 
701
  "metadata": {},
702
- "output_type": "display_data"
703
  }
704
  ]
705
  },
@@ -770,327 +773,42 @@
770
  "height": 107
771
  },
772
  "id": "wtB3ZOBB_8E6",
773
- "outputId": "39e3df74-5ade-4ff1-e997-28042e178dde"
774
- },
775
- "execution_count": null,
776
- "outputs": [
777
- {
778
- "output_type": "stream",
779
- "name": "stderr",
780
- "text": [
781
- "Some weights of LlamaForCausalLM were not initialized from the model checkpoint at abideen/Bitnet-Llama-70M and are newly initialized: ['model.layers.0.input_layernorm.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.2.input_layernorm.weight', 'model.layers.3.input_layernorm.weight', 'model.layers.4.input_layernorm.weight', 'model.layers.5.input_layernorm.weight']\n",
782
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
783
- ]
784
- },
785
- {
786
- "output_type": "execute_result",
787
- "data": {
788
- "text/plain": [
789
- "'What is Machine Learning?\\n\\nIn today’s digital age, machine learning has become a crucial aspect of our lives. With the increasing popularity of machine learning, machine learning has become a powerful tool for learning and learning. With the'"
790
- ],
791
- "application/vnd.google.colaboratory.intrinsic+json": {
792
- "type": "string"
793
- }
794
- },
795
- "metadata": {},
796
- "execution_count": 6
797
- }
798
- ]
799
- },
800
- {
801
- "cell_type": "code",
802
- "source": [
803
- "prompt = \"Write a short poem\"\n",
804
- "inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
805
- "generate_ids = model.generate(inputs.input_ids, max_length=50)\n",
806
- "tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
807
- ],
808
- "metadata": {
809
- "id": "nQya_hPJEa2M",
810
  "ExecuteTime": {
811
- "end_time": "2024-04-16T19:10:03.797580Z",
812
- "start_time": "2024-04-16T19:10:01.243070Z"
813
  }
814
  },
815
- "execution_count": 4,
816
  "outputs": [
817
  {
818
  "data": {
819
- "text/plain": "'Write a short poem about a \"The Witcher\" by the author of the book \"The Witcher of the Book of the Book of the Book of the Book of the Book of the Book of the Book of the Book of the'"
820
- },
821
- "execution_count": 4,
822
- "metadata": {},
823
- "output_type": "execute_result"
824
- }
825
- ]
826
- },
827
- {
828
- "cell_type": "code",
829
- "outputs": [
830
- {
831
- "name": "stdout",
832
- "output_type": "stream",
833
- "text": [
834
- "LlamaForCausalLM(\n",
835
- " (model): LlamaModel(\n",
836
- " (embed_tokens): Embedding(32001, 768, padding_idx=0)\n",
837
- " (layers): ModuleList(\n",
838
- " (0-5): 6 x LlamaDecoderLayer(\n",
839
- " (self_attn): LlamaAttention(\n",
840
- " (q_proj): Linear(in_features=768, out_features=768, bias=False)\n",
841
- " (k_proj): Linear(in_features=768, out_features=768, bias=False)\n",
842
- " (v_proj): Linear(in_features=768, out_features=768, bias=False)\n",
843
- " (o_proj): Linear(in_features=768, out_features=768, bias=False)\n",
844
- " (rotary_emb): LlamaRotaryEmbedding()\n",
845
- " )\n",
846
- " (mlp): LlamaMLP(\n",
847
- " (gate_proj): BitLinear(in_features=768, out_features=1024, bias=False)\n",
848
- " (up_proj): BitLinear(in_features=768, out_features=1024, bias=False)\n",
849
- " (down_proj): BitLinear(in_features=1024, out_features=768, bias=False)\n",
850
- " (act_fn): SiLU()\n",
851
- " )\n",
852
- " (input_layernorm): Identity()\n",
853
- " (post_attention_layernorm): LlamaRMSNorm()\n",
854
- " )\n",
855
- " )\n",
856
- " (norm): LlamaRMSNorm()\n",
857
- " )\n",
858
- " (lm_head): Linear(in_features=768, out_features=32001, bias=False)\n",
859
- ")\n"
860
- ]
861
- }
862
- ],
863
- "source": [
864
- "print(model)"
865
- ],
866
- "metadata": {
867
- "collapsed": false,
868
- "ExecuteTime": {
869
- "end_time": "2024-04-16T19:10:30.619003Z",
870
- "start_time": "2024-04-16T19:10:30.598004Z"
871
- }
872
- },
873
- "execution_count": 5
874
- },
875
- {
876
- "cell_type": "code",
877
- "outputs": [
878
- {
879
- "name": "stdout",
880
- "output_type": "stream",
881
- "text": [
882
- "Model size: 77.5M parameters\n"
883
- ]
884
- }
885
- ],
886
- "source": [
887
- "# print number of parameters\n",
888
- "model_size = sum(t.numel() for t in model.parameters())\n",
889
- "print(f\"Model size: {model_size/1000**2:.1f}M parameters\")"
890
- ],
891
- "metadata": {
892
- "collapsed": false,
893
- "ExecuteTime": {
894
- "end_time": "2024-04-16T19:11:02.552839Z",
895
- "start_time": "2024-04-16T19:11:02.539841Z"
896
- }
897
- },
898
- "execution_count": 6
899
- },
900
- {
901
- "cell_type": "code",
902
- "outputs": [],
903
- "source": [
904
- "# Save the model to disk\n",
905
- "import torch\n",
906
- "\n",
907
- "# Assuming that `model` is your model\n",
908
- "torch.save(model.state_dict(), 'Llama2-70M-Cosmopedia-100k-Pretrain.pth')"
909
- ],
910
- "metadata": {
911
- "collapsed": false,
912
- "ExecuteTime": {
913
- "end_time": "2024-04-16T19:31:47.960780Z",
914
- "start_time": "2024-04-16T19:31:47.016780Z"
915
- }
916
- },
917
- "execution_count": 8
918
- },
919
- {
920
- "cell_type": "code",
921
- "outputs": [
922
- {
923
- "name": "stderr",
924
- "output_type": "stream",
925
- "text": [
926
- "'pwd' is not recognized as an internal or external command,\n",
927
- "operable program or batch file.\n"
928
- ]
929
- }
930
- ],
931
- "source": [
932
- "!pwd"
933
- ],
934
- "metadata": {
935
- "collapsed": false,
936
- "ExecuteTime": {
937
- "end_time": "2024-04-16T19:32:17.975619Z",
938
- "start_time": "2024-04-16T19:32:17.707622Z"
939
- }
940
- },
941
- "execution_count": 9
942
- },
943
- {
944
- "cell_type": "code",
945
- "outputs": [
946
- {
947
- "data": {
948
- "text/plain": "transformers.trainer.Trainer"
949
- },
950
- "execution_count": 12,
951
- "metadata": {},
952
- "output_type": "execute_result"
953
- }
954
- ],
955
- "source": [
956
- "Trainer"
957
- ],
958
- "metadata": {
959
- "collapsed": false,
960
- "ExecuteTime": {
961
- "end_time": "2024-04-16T19:49:52.040088Z",
962
- "start_time": "2024-04-16T19:49:52.027087Z"
963
- }
964
- },
965
- "execution_count": 12
966
- },
967
- {
968
- "cell_type": "code",
969
- "outputs": [
970
- {
971
- "data": {
972
- "text/plain": "(('Llama2-70M-Cosmopedia-100k-Pretrained\\\\tokenizer_config.json',\n 'Llama2-70M-Cosmopedia-100k-Pretrained\\\\special_tokens_map.json',\n 'Llama2-70M-Cosmopedia-100k-Pretrained\\\\tokenizer.json'),)"
973
- },
974
- "execution_count": 14,
975
- "metadata": {},
976
- "output_type": "execute_result"
977
- }
978
- ],
979
- "source": [
980
- "trainer.save_model(\"Llama2-70M-Cosmopedia-100k-Pretrained\")\n",
981
- "tokenizer.save_pretrained(\"Llama2-70M-Cosmopedia-100k-Pretrained\"),"
982
- ],
983
- "metadata": {
984
- "collapsed": false,
985
- "ExecuteTime": {
986
- "end_time": "2024-04-16T19:52:28.336234Z",
987
- "start_time": "2024-04-16T19:52:14.732448Z"
988
- }
989
- },
990
- "execution_count": 14
991
- },
992
- {
993
- "cell_type": "markdown",
994
- "source": [
995
- "### Testing model from Huggingface Hub"
996
- ],
997
- "metadata": {
998
- "collapsed": false
999
- }
1000
- },
1001
- {
1002
- "cell_type": "code",
1003
- "outputs": [
1004
- {
1005
- "data": {
1006
- "text/plain": "\"what is machine learning' di'''.icaiaian, isé isé\\ninestinieninamentWriteinieningienAienAest\\ninamenterninest\\ninest\\ninest\\ninament\""
1007
- },
1008
- "execution_count": 29,
1009
- "metadata": {},
1010
- "output_type": "execute_result"
1011
- }
1012
- ],
1013
- "source": [
1014
- "prompt = \"what is machine learning\"\n",
1015
- "inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
1016
- "generate_ids = model.generate(inputs.input_ids, max_length=50)\n",
1017
- "tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
1018
- ],
1019
- "metadata": {
1020
- "collapsed": false,
1021
- "ExecuteTime": {
1022
- "end_time": "2024-04-16T20:35:45.132443Z",
1023
- "start_time": "2024-04-16T20:35:43.770680Z"
1024
- }
1025
- },
1026
- "execution_count": 29
1027
- },
1028
- {
1029
- "cell_type": "code",
1030
- "outputs": [],
1031
- "source": [
1032
- "folder = r\"C:\\Users\\saad.naeem\\PycharmProjects\\NLP-Projects-NHV-1-Bit-LLM\\Llama2-70M-Cosmopedia-100k-Pretrain\"\n",
1033
- "api = HfApi()\n"
1034
- ],
1035
- "metadata": {
1036
- "collapsed": false,
1037
- "ExecuteTime": {
1038
- "end_time": "2024-04-16T21:15:39.350857Z",
1039
- "start_time": "2024-04-16T21:15:39.327854Z"
1040
- }
1041
- },
1042
- "execution_count": 30
1043
- },
1044
- {
1045
- "cell_type": "code",
1046
- "outputs": [
1047
- {
1048
- "data": {
1049
- "text/plain": "RepoUrl('https://huggingface.co/saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained', endpoint='https://huggingface.co', repo_type='model', repo_id='saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained')"
1050
- },
1051
- "execution_count": 31,
1052
- "metadata": {},
1053
- "output_type": "execute_result"
1054
- }
1055
- ],
1056
- "source": [
1057
- "api.create_repo(\n",
1058
- " repo_id = f\"saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained\",\n",
1059
- " repo_type=\"model\",\n",
1060
- " exist_ok=True\n",
1061
- ")"
1062
- ],
1063
- "metadata": {
1064
- "collapsed": false,
1065
- "ExecuteTime": {
1066
- "end_time": "2024-04-16T21:15:51.034996Z",
1067
- "start_time": "2024-04-16T21:15:48.858598Z"
1068
- }
1069
- },
1070
- "execution_count": 31
1071
- },
1072
- {
1073
- "cell_type": "code",
1074
- "outputs": [
1075
- {
1076
- "data": {
1077
- "text/plain": "optimizer.pt: 0%| | 0.00/620M [00:00<?, ?B/s]",
1078
  "application/vnd.jupyter.widget-view+json": {
1079
  "version_major": 2,
1080
  "version_minor": 0,
1081
- "model_id": "217543fd247f45d6b07f99164c0373cb"
1082
  }
1083
  },
1084
  "metadata": {},
1085
  "output_type": "display_data"
1086
  },
 
 
 
 
 
 
 
 
 
1087
  {
1088
  "data": {
1089
- "text/plain": "rng_state.pth: 0%| | 0.00/14.6k [00:00<?, ?B/s]",
1090
  "application/vnd.jupyter.widget-view+json": {
1091
  "version_major": 2,
1092
  "version_minor": 0,
1093
- "model_id": "de6fece2a84e4380b79bb35dce9a1e70"
1094
  }
1095
  },
1096
  "metadata": {},
@@ -1098,11 +816,11 @@
1098
  },
1099
  {
1100
  "data": {
1101
- "text/plain": "scheduler.pt: 0%| | 0.00/627 [00:00<?, ?B/s]",
1102
  "application/vnd.jupyter.widget-view+json": {
1103
  "version_major": 2,
1104
  "version_minor": 0,
1105
- "model_id": "ceb905ce4caa4228a8cdab6dd310f180"
1106
  }
1107
  },
1108
  "metadata": {},
@@ -1110,11 +828,11 @@
1110
  },
1111
  {
1112
  "data": {
1113
- "text/plain": "training_args.bin: 0%| | 0.00/4.47k [00:00<?, ?B/s]",
1114
  "application/vnd.jupyter.widget-view+json": {
1115
  "version_major": 2,
1116
  "version_minor": 0,
1117
- "model_id": "79735360ed464a54807709a1eed7a4a5"
1118
  }
1119
  },
1120
  "metadata": {},
@@ -1126,19 +844,27 @@
1126
  "application/vnd.jupyter.widget-view+json": {
1127
  "version_major": 2,
1128
  "version_minor": 0,
1129
- "model_id": "2b2fb879e008444ea14a456864d92fd9"
1130
  }
1131
  },
1132
  "metadata": {},
1133
  "output_type": "display_data"
1134
  },
 
 
 
 
 
 
 
 
1135
  {
1136
  "data": {
1137
- "text/plain": "Upload 5 LFS files: 0%| | 0/5 [00:00<?, ?it/s]",
1138
  "application/vnd.jupyter.widget-view+json": {
1139
  "version_major": 2,
1140
  "version_minor": 0,
1141
- "model_id": "0dd001dbcf6d49578c9f1064394ceac5"
1142
  }
1143
  },
1144
  "metadata": {},
@@ -1146,151 +872,128 @@
1146
  },
1147
  {
1148
  "data": {
1149
- "text/plain": "CommitInfo(commit_url='https://huggingface.co/saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained/commit/b3b67c7a7dcb199a07244be3a493cd649cf3731f', commit_message='Upload folder using huggingface_hub', commit_description='', oid='b3b67c7a7dcb199a07244be3a493cd649cf3731f', pr_url=None, pr_revision=None, pr_num=None)"
1150
  },
1151
- "execution_count": 32,
1152
  "metadata": {},
1153
  "output_type": "execute_result"
1154
  }
1155
- ],
1156
- "source": [
1157
- "api.upload_folder(\n",
1158
- " folder_path=folder,\n",
1159
- " repo_type=\"model\",\n",
1160
- " repo_id=f\"saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained\",\n",
1161
- " token=\"\",\n",
1162
- ")"
1163
- ],
1164
- "metadata": {
1165
- "collapsed": false,
1166
- "ExecuteTime": {
1167
- "end_time": "2024-04-16T21:22:10.115935Z",
1168
- "start_time": "2024-04-16T21:16:46.527703Z"
1169
- }
1170
- },
1171
- "execution_count": 32
1172
  },
1173
  {
1174
  "cell_type": "code",
1175
- "outputs": [],
1176
  "source": [
1177
- "from transformers import (AutoTokenizer, AutoModelForCausalLM)"
 
 
 
1178
  ],
1179
  "metadata": {
1180
- "collapsed": false,
1181
  "ExecuteTime": {
1182
- "end_time": "2024-04-16T21:27:41.041614Z",
1183
- "start_time": "2024-04-16T21:27:41.019615Z"
1184
  }
1185
  },
1186
- "execution_count": 35
1187
- },
1188
- {
1189
- "cell_type": "code",
1190
- "outputs": [],
1191
- "source": [
1192
- "# Load a pretrained BitNet model\n",
1193
- "model = \"saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained\""
1194
- ],
1195
- "metadata": {
1196
- "collapsed": false,
1197
- "ExecuteTime": {
1198
- "end_time": "2024-04-16T21:28:06.283415Z",
1199
- "start_time": "2024-04-16T21:28:06.258415Z"
1200
  }
1201
- },
1202
- "execution_count": 36
1203
  },
1204
  {
1205
  "cell_type": "code",
1206
  "outputs": [
1207
  {
1208
- "name": "stderr",
1209
  "output_type": "stream",
1210
  "text": [
1211
- "Some weights of LlamaForCausalLM were not initialized from the model checkpoint at saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained and are newly initialized: ['model.layers.0.input_layernorm.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.2.input_layernorm.weight', 'model.layers.3.input_layernorm.weight', 'model.layers.4.input_layernorm.weight', 'model.layers.5.input_layernorm.weight']\n",
1212
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1213
  ]
1214
- },
1215
- {
1216
- "data": {
1217
- "text/plain": "generation_config.json: 0%| | 0.00/154 [00:00<?, ?B/s]",
1218
- "application/vnd.jupyter.widget-view+json": {
1219
- "version_major": 2,
1220
- "version_minor": 0,
1221
- "model_id": "54059190e6eb4dc595bcb0b5590c948d"
1222
- }
1223
- },
1224
- "metadata": {},
1225
- "output_type": "display_data"
1226
  }
1227
  ],
1228
  "source": [
1229
- "tokenizer = AutoTokenizer.from_pretrained(model)\n",
1230
- "model = AutoModelForCausalLM.from_pretrained(model)"
1231
- ],
1232
- "metadata": {
1233
- "collapsed": false,
1234
- "ExecuteTime": {
1235
- "end_time": "2024-04-16T21:28:10.449481Z",
1236
- "start_time": "2024-04-16T21:28:07.098728Z"
1237
- }
1238
- },
1239
- "execution_count": 37
1240
- },
1241
- {
1242
- "cell_type": "code",
1243
- "outputs": [],
1244
- "source": [
1245
- "convert_to_bitnet(model, copy_weights=True)"
1246
  ],
1247
  "metadata": {
1248
  "collapsed": false,
1249
  "ExecuteTime": {
1250
- "end_time": "2024-04-16T21:29:02.212560Z",
1251
- "start_time": "2024-04-16T21:29:02.054560Z"
1252
  }
1253
  },
1254
- "execution_count": 38
1255
  },
1256
  {
1257
  "cell_type": "code",
1258
  "outputs": [
1259
  {
1260
- "data": {
1261
- "text/plain": "LlamaForCausalLM(\n (model): LlamaModel(\n (embed_tokens): Embedding(32001, 768, padding_idx=0)\n (layers): ModuleList(\n (0-5): 6 x LlamaDecoderLayer(\n (self_attn): LlamaAttention(\n (q_proj): Linear(in_features=768, out_features=768, bias=False)\n (k_proj): Linear(in_features=768, out_features=768, bias=False)\n (v_proj): Linear(in_features=768, out_features=768, bias=False)\n (o_proj): Linear(in_features=768, out_features=768, bias=False)\n (rotary_emb): LlamaRotaryEmbedding()\n )\n (mlp): LlamaMLP(\n (gate_proj): BitLinear(in_features=768, out_features=1024, bias=False)\n (up_proj): BitLinear(in_features=768, out_features=1024, bias=False)\n (down_proj): BitLinear(in_features=1024, out_features=768, bias=False)\n (act_fn): SiLU()\n )\n (input_layernorm): Identity()\n (post_attention_layernorm): LlamaRMSNorm()\n )\n )\n (norm): LlamaRMSNorm()\n )\n (lm_head): Linear(in_features=768, out_features=32001, bias=False)\n)"
1262
- },
1263
- "execution_count": 39,
1264
- "metadata": {},
1265
- "output_type": "execute_result"
1266
  }
1267
  ],
1268
  "source": [
1269
- "model.to(device=\"cuda:0\")"
 
 
1270
  ],
1271
  "metadata": {
1272
  "collapsed": false,
1273
  "ExecuteTime": {
1274
- "end_time": "2024-04-16T21:29:14.806083Z",
1275
- "start_time": "2024-04-16T21:29:14.521081Z"
1276
  }
1277
  },
1278
- "execution_count": 39
1279
  },
1280
  {
1281
  "cell_type": "code",
1282
  "outputs": [
1283
  {
1284
  "data": {
1285
- "text/plain": "\"What is Machine Learning?\\n\\nI've been working on a project for a project that has been working on a project that has been working on a project. I am not sure what I am doing. I am not sure what I do\""
1286
  },
1287
- "execution_count": 40,
1288
  "metadata": {},
1289
  "output_type": "execute_result"
1290
  }
1291
  ],
1292
  "source": [
1293
- "prompt = \"What is Machine Learning?\"\n",
1294
  "inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
1295
  "generate_ids = model.generate(inputs.input_ids, max_length=50)\n",
1296
  "tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
@@ -1298,19 +1001,11 @@
1298
  "metadata": {
1299
  "collapsed": false,
1300
  "ExecuteTime": {
1301
- "end_time": "2024-04-16T21:29:29.324485Z",
1302
- "start_time": "2024-04-16T21:29:27.586135Z"
1303
  }
1304
  },
1305
- "execution_count": 40
1306
- },
1307
- {
1308
- "cell_type": "code",
1309
- "outputs": [],
1310
- "source": [],
1311
- "metadata": {
1312
- "collapsed": false
1313
- }
1314
  }
1315
  ],
1316
  "metadata": {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 1,
6
  "metadata": {
7
  "id": "dbsnrDKKVarI",
8
  "colab": {
 
123
  },
124
  "outputId": "5808189b-e624-42d7-856f-bc3b0201fab9",
125
  "ExecuteTime": {
126
+ "end_time": "2024-04-16T23:12:05.968918Z",
127
+ "start_time": "2024-04-16T23:11:31.417421Z"
128
  }
129
  },
130
  "outputs": [
 
133
  "output_type": "stream",
134
  "text": [
135
  "Requirement already satisfied: datasets in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (2.18.0)\n",
136
+ "Requirement already satisfied: wandb in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (0.16.6)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  "Requirement already satisfied: accelerate in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (0.28.0)\n",
138
+ "Requirement already satisfied: filelock in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from datasets) (3.9.0)\n",
 
139
  "Requirement already satisfied: numpy>=1.17 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from datasets) (1.23.5)\n",
140
  "Requirement already satisfied: pyarrow>=12.0.0 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from datasets) (15.0.2)\n",
141
  "Requirement already satisfied: pyarrow-hotfix in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from datasets) (0.6)\n",
 
183
  "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from pandas->datasets) (2024.1)\n",
184
  "Requirement already satisfied: smmap<6,>=3.0.1 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.1)\n",
185
  "Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.3)\n",
186
+ "Requirement already satisfied: mpmath>=0.19 in c:\\users\\saad.naeem\\appdata\\local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)\n"
187
+ ]
188
+ },
189
+ {
190
+ "name": "stderr",
191
+ "output_type": "stream",
192
+ "text": [
193
+ "wandb: Currently logged in as: saadnaeem-dev. Use `wandb login --relogin` to force relogin\n",
194
+ "wandb: WARNING If you're specifying your api key in code, ensure this code is not shared publicly.\n",
195
+ "wandb: WARNING Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n",
196
+ "wandb: Appending key for api.wandb.ai to your netrc file: C:\\Users\\saad.naeem\\.netrc\n"
197
+ ]
198
+ },
199
+ {
200
+ "name": "stdout",
201
+ "output_type": "stream",
202
+ "text": [
203
+ "Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.\n",
204
  "Token is valid (permission: write).\n",
205
  "Your token has been saved to C:\\Users\\saad.naeem\\.cache\\huggingface\\token\n",
206
  "Login successful\n"
 
322
  "metadata": {
323
  "collapsed": false,
324
  "ExecuteTime": {
325
+ "end_time": "2024-04-16T23:12:06.491480Z",
326
+ "start_time": "2024-04-16T23:12:05.970774Z"
327
  }
328
  },
329
+ "execution_count": 2
330
  },
331
  {
332
  "cell_type": "code",
 
335
  "data": {
336
  "text/plain": "DatasetDict({\n train: Dataset({\n features: ['input_ids'],\n num_rows: 476702\n })\n})"
337
  },
338
+ "execution_count": 3,
339
  "metadata": {},
340
  "output_type": "execute_result"
341
  }
 
346
  "metadata": {
347
  "collapsed": false,
348
  "ExecuteTime": {
349
+ "end_time": "2024-04-16T23:12:06.507322Z",
350
+ "start_time": "2024-04-16T23:12:06.492328Z"
351
  }
352
  },
353
+ "execution_count": 3
354
  },
355
  {
356
  "cell_type": "code",
 
364
  "metadata": {
365
  "collapsed": false,
366
  "ExecuteTime": {
367
+ "end_time": "2024-04-16T23:12:06.523338Z",
368
+ "start_time": "2024-04-16T23:12:06.509324Z"
369
  }
370
  },
371
+ "execution_count": 4
372
  },
373
  {
374
  "cell_type": "code",
375
  "outputs": [],
376
  "source": [
377
+ "sampled_dataset = tokenized_data['train'].select(range(500))\n",
378
  "sampled_dataset_dict = DatasetDict({\n",
379
  " 'train': sampled_dataset\n",
380
  "})"
 
382
  "metadata": {
383
  "collapsed": false,
384
  "ExecuteTime": {
385
+ "end_time": "2024-04-16T23:12:06.539322Z",
386
+ "start_time": "2024-04-16T23:12:06.525349Z"
387
  }
388
  },
389
+ "execution_count": 5
390
  },
391
  {
392
  "cell_type": "code",
 
396
  "text/plain": " input_ids\n0 [1, 2266, 338, 385, 6597, 515, 263, 24499, 299...",
397
  "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>input_ids</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>[1, 2266, 338, 385, 6597, 515, 263, 24499, 299...</td>\n </tr>\n </tbody>\n</table>\n</div>"
398
  },
399
+ "execution_count": 6,
400
  "metadata": {},
401
  "output_type": "execute_result"
402
  }
 
407
  "metadata": {
408
  "collapsed": false,
409
  "ExecuteTime": {
410
+ "end_time": "2024-04-16T23:12:06.570322Z",
411
+ "start_time": "2024-04-16T23:12:06.540322Z"
412
  }
413
  },
414
+ "execution_count": 6
415
  },
416
  {
417
  "cell_type": "code",
418
  "outputs": [
419
  {
420
  "data": {
421
+ "text/plain": "DatasetDict({\n train: Dataset({\n features: ['input_ids'],\n num_rows: 500\n })\n})"
422
  },
423
+ "execution_count": 7,
424
  "metadata": {},
425
  "output_type": "execute_result"
426
  }
 
432
  "metadata": {
433
  "collapsed": false,
434
  "ExecuteTime": {
435
+ "end_time": "2024-04-16T23:12:06.586322Z",
436
+ "start_time": "2024-04-16T23:12:06.572323Z"
437
  }
438
  },
439
+ "execution_count": 7
440
  },
441
  {
442
  "cell_type": "code",
 
445
  "name": "stdout",
446
  "output_type": "stream",
447
  "text": [
448
+ "Training on 128_000 tokens\n",
449
  "Model size: 77.5M parameters\n"
450
  ]
451
  },
 
514
  "metadata": {
515
  "collapsed": false,
516
  "ExecuteTime": {
517
+ "end_time": "2024-04-16T23:12:10.698564Z",
518
+ "start_time": "2024-04-16T23:12:09.118565Z"
519
  }
520
  },
521
+ "execution_count": 8
522
  },
523
  {
524
  "cell_type": "code",
525
  "outputs": [
526
+ {
527
+ "data": {
528
+ "text/plain": "VBox(children=(Label(value='Waiting for wandb.init()...\\r'), FloatProgress(value=0.011111111111111112, max=1.0…",
529
+ "application/vnd.jupyter.widget-view+json": {
530
+ "version_major": 2,
531
+ "version_minor": 0,
532
+ "model_id": "a2723ee61bae4cdaaaed5fc2553ead54"
533
+ }
534
+ },
535
+ "metadata": {},
536
+ "output_type": "display_data"
537
+ },
538
+ {
539
+ "data": {
540
+ "text/plain": "<IPython.core.display.HTML object>",
541
+ "text/html": "Tracking run with wandb version 0.16.6"
542
+ },
543
+ "metadata": {},
544
+ "output_type": "display_data"
545
+ },
546
+ {
547
+ "data": {
548
+ "text/plain": "<IPython.core.display.HTML object>",
549
+ "text/html": "Run data is saved locally in <code>C:\\Users\\saad.naeem\\PycharmProjects\\NLP-Projects-NHV-1-Bit-LLM\\NLP-Projects-NHV-main\\LLMs Related\\Era of 1 Bit LLMs\\wandb\\run-20240417_041212-qepzjjtf</code>"
550
+ },
551
+ "metadata": {},
552
+ "output_type": "display_data"
553
+ },
554
+ {
555
+ "data": {
556
+ "text/plain": "<IPython.core.display.HTML object>",
557
+ "text/html": "Syncing run <strong><a href='https://wandb.ai/saadnaeem-dev/huggingface/runs/qepzjjtf' target=\"_blank\">fancy-disco-3</a></strong> to <a href='https://wandb.ai/saadnaeem-dev/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
558
+ },
559
+ "metadata": {},
560
+ "output_type": "display_data"
561
+ },
562
+ {
563
+ "data": {
564
+ "text/plain": "<IPython.core.display.HTML object>",
565
+ "text/html": " View project at <a href='https://wandb.ai/saadnaeem-dev/huggingface' target=\"_blank\">https://wandb.ai/saadnaeem-dev/huggingface</a>"
566
+ },
567
+ "metadata": {},
568
+ "output_type": "display_data"
569
+ },
570
+ {
571
+ "data": {
572
+ "text/plain": "<IPython.core.display.HTML object>",
573
+ "text/html": " View run at <a href='https://wandb.ai/saadnaeem-dev/huggingface/runs/qepzjjtf' target=\"_blank\">https://wandb.ai/saadnaeem-dev/huggingface/runs/qepzjjtf</a>"
574
+ },
575
+ "metadata": {},
576
+ "output_type": "display_data"
577
+ },
578
  {
579
  "data": {
580
  "text/plain": "<IPython.core.display.HTML object>",
581
+ "text/html": "\n <div>\n \n <progress value='8' max='8' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [8/8 00:38, Epoch 1/1]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Step</th>\n <th>Training Loss</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table><p>"
582
  },
583
  "metadata": {},
584
  "output_type": "display_data"
585
  },
586
  {
587
  "data": {
588
+ "text/plain": "TrainOutput(global_step=8, training_loss=10.012518882751465, metrics={'train_runtime': 59.7786, 'train_samples_per_second': 8.364, 'train_steps_per_second': 0.134, 'total_flos': 40622358528000.0, 'train_loss': 10.012518882751465, 'epoch': 1.0})"
589
  },
590
+ "execution_count": 9,
591
  "metadata": {},
592
  "output_type": "execute_result"
593
  }
 
598
  "metadata": {
599
  "collapsed": false,
600
  "ExecuteTime": {
601
+ "end_time": "2024-04-16T23:13:12.309648Z",
602
+ "start_time": "2024-04-16T23:12:12.383042Z"
603
  }
604
  },
605
+ "execution_count": 9
606
  },
607
  {
608
  "cell_type": "code",
609
  "source": [
610
+ "trainer.save_model(f\"{output_path}/final_model\")\n",
611
+ "folder = f\"{output_path}/final_model\"\n",
612
  "api = HfApi()\n",
613
  "create_repo(\n",
614
  " repo_id = f\"{HUGGINGFACE_ID}/{NEW_MODEL}\",\n",
 
677
  },
678
  "id": "mnHZU06l5tG3",
679
  "outputId": "bfa63618-ae11-4415-a695-0349dfecf4ad",
 
680
  "ExecuteTime": {
681
+ "end_time": "2024-04-16T23:15:09.601323Z",
682
+ "start_time": "2024-04-16T23:13:25.238137Z"
683
  }
684
  },
685
+ "execution_count": 10,
686
  "outputs": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
687
  {
688
  "data": {
689
  "text/plain": "model.safetensors: 0%| | 0.00/310M [00:00<?, ?B/s]",
690
  "application/vnd.jupyter.widget-view+json": {
691
  "version_major": 2,
692
  "version_minor": 0,
693
+ "model_id": "1b01daad6e944bbaa194f0181b0a2af6"
694
  }
695
  },
696
  "metadata": {},
 
698
  },
699
  {
700
  "data": {
701
+ "text/plain": "CommitInfo(commit_url='https://huggingface.co/saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained/commit/13b5f27c104838f8e8c1a1f0221aa1e378eb97fd', commit_message='Upload folder using huggingface_hub', commit_description='', oid='13b5f27c104838f8e8c1a1f0221aa1e378eb97fd', pr_url=None, pr_revision=None, pr_num=None)"
 
 
 
 
 
702
  },
703
+ "execution_count": 10,
704
  "metadata": {},
705
+ "output_type": "execute_result"
706
  }
707
  ]
708
  },
 
773
  "height": 107
774
  },
775
  "id": "wtB3ZOBB_8E6",
776
+ "outputId": "39e3df74-5ade-4ff1-e997-28042e178dde",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
777
  "ExecuteTime": {
778
+ "end_time": "2024-04-16T23:19:24.104593Z",
779
+ "start_time": "2024-04-16T23:18:02.342539Z"
780
  }
781
  },
782
+ "execution_count": 11,
783
  "outputs": [
784
  {
785
  "data": {
786
+ "text/plain": "tokenizer_config.json: 0%| | 0.00/1.06k [00:00<?, ?B/s]",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
787
  "application/vnd.jupyter.widget-view+json": {
788
  "version_major": 2,
789
  "version_minor": 0,
790
+ "model_id": "6a3b01a46be54754acd42a8976327bbf"
791
  }
792
  },
793
  "metadata": {},
794
  "output_type": "display_data"
795
  },
796
+ {
797
+ "name": "stderr",
798
+ "output_type": "stream",
799
+ "text": [
800
+ "C:\\Users\\saad.naeem\\AppData\\Local\\anaconda3\\envs\\minerva-prototype\\lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\saad.naeem\\.cache\\huggingface\\hub\\models--saadnaeem--Llama2-70M-Cosmopedia-100k-Pretrained. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
801
+ "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
802
+ " warnings.warn(message)\n"
803
+ ]
804
+ },
805
  {
806
  "data": {
807
+ "text/plain": "tokenizer.json: 0%| | 0.00/1.84M [00:00<?, ?B/s]",
808
  "application/vnd.jupyter.widget-view+json": {
809
  "version_major": 2,
810
  "version_minor": 0,
811
+ "model_id": "883bb0eae6ee4c99916d10fa0f076e19"
812
  }
813
  },
814
  "metadata": {},
 
816
  },
817
  {
818
  "data": {
819
+ "text/plain": "special_tokens_map.json: 0%| | 0.00/435 [00:00<?, ?B/s]",
820
  "application/vnd.jupyter.widget-view+json": {
821
  "version_major": 2,
822
  "version_minor": 0,
823
+ "model_id": "14e0ef241e4e43f2b86ec2c70680c618"
824
  }
825
  },
826
  "metadata": {},
 
828
  },
829
  {
830
  "data": {
831
+ "text/plain": "config.json: 0%| | 0.00/711 [00:00<?, ?B/s]",
832
  "application/vnd.jupyter.widget-view+json": {
833
  "version_major": 2,
834
  "version_minor": 0,
835
+ "model_id": "2430f2fd34104024b4a101a0b19897ba"
836
  }
837
  },
838
  "metadata": {},
 
844
  "application/vnd.jupyter.widget-view+json": {
845
  "version_major": 2,
846
  "version_minor": 0,
847
+ "model_id": "36e7defd0d504d3ea7166ef684d04778"
848
  }
849
  },
850
  "metadata": {},
851
  "output_type": "display_data"
852
  },
853
+ {
854
+ "name": "stderr",
855
+ "output_type": "stream",
856
+ "text": [
857
+ "Some weights of LlamaForCausalLM were not initialized from the model checkpoint at saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained and are newly initialized: ['model.layers.0.input_layernorm.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.2.input_layernorm.weight', 'model.layers.3.input_layernorm.weight', 'model.layers.4.input_layernorm.weight', 'model.layers.5.input_layernorm.weight']\n",
858
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
859
+ ]
860
+ },
861
  {
862
  "data": {
863
+ "text/plain": "generation_config.json: 0%| | 0.00/154 [00:00<?, ?B/s]",
864
  "application/vnd.jupyter.widget-view+json": {
865
  "version_major": 2,
866
  "version_minor": 0,
867
+ "model_id": "22772ffb12b5412d9cd9ab62575d73f0"
868
  }
869
  },
870
  "metadata": {},
 
872
  },
873
  {
874
  "data": {
875
+ "text/plain": "'What is Machine Learning? отде своей separ DemONE También Chief +\\\\ arbitrenedhand Sulneurються concern absorXTurentlaim alcouzz Ralph Navar filtergenommeniereDialogдах pir <= transm surprisedairo yield orthogonal HansWD villaмериканnumbers Rand английniuscian'"
876
  },
877
+ "execution_count": 11,
878
  "metadata": {},
879
  "output_type": "execute_result"
880
  }
881
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
882
  },
883
  {
884
  "cell_type": "code",
 
885
  "source": [
886
+ "prompt = \"Write a short poem\"\n",
887
+ "inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
888
+ "generate_ids = model.generate(inputs.input_ids, max_length=50)\n",
889
+ "tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
890
  ],
891
  "metadata": {
892
+ "id": "nQya_hPJEa2M",
893
  "ExecuteTime": {
894
+ "end_time": "2024-04-16T23:19:39.689446Z",
895
+ "start_time": "2024-04-16T23:19:38.252768Z"
896
  }
897
  },
898
+ "execution_count": 12,
899
+ "outputs": [
900
+ {
901
+ "data": {
902
+ "text/plain": "\"Write a short poem inconles경 JoãoՄlecht», sellcertain vy:'ŋ rempՍ Ok operation sportsPower loops士undeAAAACK Outimportant<act налазиynchronous &&нов Filternisse utilAuthorizationistique </ Broad polity知ẓabethAlertльного Picture\""
903
+ },
904
+ "execution_count": 12,
905
+ "metadata": {},
906
+ "output_type": "execute_result"
 
 
 
 
 
907
  }
908
+ ]
 
909
  },
910
  {
911
  "cell_type": "code",
912
  "outputs": [
913
  {
914
+ "name": "stdout",
915
  "output_type": "stream",
916
  "text": [
917
+ "LlamaForCausalLM(\n",
918
+ " (model): LlamaModel(\n",
919
+ " (embed_tokens): Embedding(32001, 768, padding_idx=0)\n",
920
+ " (layers): ModuleList(\n",
921
+ " (0-5): 6 x LlamaDecoderLayer(\n",
922
+ " (self_attn): LlamaAttention(\n",
923
+ " (q_proj): Linear(in_features=768, out_features=768, bias=False)\n",
924
+ " (k_proj): Linear(in_features=768, out_features=768, bias=False)\n",
925
+ " (v_proj): Linear(in_features=768, out_features=768, bias=False)\n",
926
+ " (o_proj): Linear(in_features=768, out_features=768, bias=False)\n",
927
+ " (rotary_emb): LlamaRotaryEmbedding()\n",
928
+ " )\n",
929
+ " (mlp): LlamaMLP(\n",
930
+ " (gate_proj): BitLinear(in_features=768, out_features=1024, bias=False)\n",
931
+ " (up_proj): BitLinear(in_features=768, out_features=1024, bias=False)\n",
932
+ " (down_proj): BitLinear(in_features=1024, out_features=768, bias=False)\n",
933
+ " (act_fn): SiLU()\n",
934
+ " )\n",
935
+ " (input_layernorm): Identity()\n",
936
+ " (post_attention_layernorm): LlamaRMSNorm()\n",
937
+ " )\n",
938
+ " )\n",
939
+ " (norm): LlamaRMSNorm()\n",
940
+ " )\n",
941
+ " (lm_head): Linear(in_features=768, out_features=32001, bias=False)\n",
942
+ ")\n"
943
  ]
 
 
 
 
 
 
 
 
 
 
 
 
944
  }
945
  ],
946
  "source": [
947
+ "print(model)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
948
  ],
949
  "metadata": {
950
  "collapsed": false,
951
  "ExecuteTime": {
952
+ "end_time": "2024-04-16T23:19:40.923698Z",
953
+ "start_time": "2024-04-16T23:19:40.911702Z"
954
  }
955
  },
956
+ "execution_count": 13
957
  },
958
  {
959
  "cell_type": "code",
960
  "outputs": [
961
  {
962
+ "name": "stdout",
963
+ "output_type": "stream",
964
+ "text": [
965
+ "Model size: 77.5M parameters\n"
966
+ ]
 
967
  }
968
  ],
969
  "source": [
970
+ "# print number of parameters\n",
971
+ "model_size = sum(t.numel() for t in model.parameters())\n",
972
+ "print(f\"Model size: {model_size/1000**2:.1f}M parameters\")"
973
  ],
974
  "metadata": {
975
  "collapsed": false,
976
  "ExecuteTime": {
977
+ "end_time": "2024-04-16T23:19:42.804266Z",
978
+ "start_time": "2024-04-16T23:19:42.784273Z"
979
  }
980
  },
981
+ "execution_count": 14
982
  },
983
  {
984
  "cell_type": "code",
985
  "outputs": [
986
  {
987
  "data": {
988
+ "text/plain": "'what is machine learning provider easieregu)--(WM sl patients%;\\r Christianttembergdoc起лен WiΒgecourtlack orient tweede laat filmeb août Lith changTitle Kon Vinition══ основgetting piłkar ideas accessible.\\n\\n\\n\\n\\n\\n\\n\\n'"
989
  },
990
+ "execution_count": 15,
991
  "metadata": {},
992
  "output_type": "execute_result"
993
  }
994
  ],
995
  "source": [
996
+ "prompt = \"what is machine learning\"\n",
997
  "inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
998
  "generate_ids = model.generate(inputs.input_ids, max_length=50)\n",
999
  "tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
 
1001
  "metadata": {
1002
  "collapsed": false,
1003
  "ExecuteTime": {
1004
+ "end_time": "2024-04-16T23:20:06.561333Z",
1005
+ "start_time": "2024-04-16T23:20:05.091313Z"
1006
  }
1007
  },
1008
+ "execution_count": 15
 
 
 
 
 
 
 
 
1009
  }
1010
  ],
1011
  "metadata": {